extzstd 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. checksums.yaml +4 -4
  2. data/README.md +3 -3
  3. data/contrib/zstd/CHANGELOG +188 -1
  4. data/contrib/zstd/CONTRIBUTING.md +157 -74
  5. data/contrib/zstd/LICENSE +4 -4
  6. data/contrib/zstd/Makefile +81 -58
  7. data/contrib/zstd/Package.swift +36 -0
  8. data/contrib/zstd/README.md +59 -35
  9. data/contrib/zstd/TESTING.md +2 -3
  10. data/contrib/zstd/appveyor.yml +49 -136
  11. data/contrib/zstd/lib/BUCK +5 -7
  12. data/contrib/zstd/lib/Makefile +87 -181
  13. data/contrib/zstd/lib/README.md +23 -6
  14. data/contrib/zstd/lib/common/allocations.h +55 -0
  15. data/contrib/zstd/lib/common/bits.h +200 -0
  16. data/contrib/zstd/lib/common/bitstream.h +33 -59
  17. data/contrib/zstd/lib/common/compiler.h +115 -45
  18. data/contrib/zstd/lib/common/cpu.h +1 -1
  19. data/contrib/zstd/lib/common/debug.c +1 -1
  20. data/contrib/zstd/lib/common/debug.h +1 -1
  21. data/contrib/zstd/lib/common/entropy_common.c +15 -37
  22. data/contrib/zstd/lib/common/error_private.c +9 -2
  23. data/contrib/zstd/lib/common/error_private.h +82 -3
  24. data/contrib/zstd/lib/common/fse.h +9 -85
  25. data/contrib/zstd/lib/common/fse_decompress.c +29 -111
  26. data/contrib/zstd/lib/common/huf.h +84 -172
  27. data/contrib/zstd/lib/common/mem.h +58 -49
  28. data/contrib/zstd/lib/common/pool.c +37 -16
  29. data/contrib/zstd/lib/common/pool.h +9 -3
  30. data/contrib/zstd/lib/common/portability_macros.h +156 -0
  31. data/contrib/zstd/lib/common/threading.c +68 -14
  32. data/contrib/zstd/lib/common/threading.h +5 -10
  33. data/contrib/zstd/lib/common/xxhash.c +7 -809
  34. data/contrib/zstd/lib/common/xxhash.h +5568 -167
  35. data/contrib/zstd/lib/common/zstd_common.c +1 -36
  36. data/contrib/zstd/lib/common/zstd_deps.h +1 -1
  37. data/contrib/zstd/lib/common/zstd_internal.h +64 -150
  38. data/contrib/zstd/lib/common/zstd_trace.h +163 -0
  39. data/contrib/zstd/lib/compress/clevels.h +134 -0
  40. data/contrib/zstd/lib/compress/fse_compress.c +69 -150
  41. data/contrib/zstd/lib/compress/hist.c +1 -1
  42. data/contrib/zstd/lib/compress/hist.h +1 -1
  43. data/contrib/zstd/lib/compress/huf_compress.c +773 -251
  44. data/contrib/zstd/lib/compress/zstd_compress.c +2650 -826
  45. data/contrib/zstd/lib/compress/zstd_compress_internal.h +509 -180
  46. data/contrib/zstd/lib/compress/zstd_compress_literals.c +117 -40
  47. data/contrib/zstd/lib/compress/zstd_compress_literals.h +16 -6
  48. data/contrib/zstd/lib/compress/zstd_compress_sequences.c +28 -19
  49. data/contrib/zstd/lib/compress/zstd_compress_sequences.h +1 -1
  50. data/contrib/zstd/lib/compress/zstd_compress_superblock.c +33 -305
  51. data/contrib/zstd/lib/compress/zstd_compress_superblock.h +1 -1
  52. data/contrib/zstd/lib/compress/zstd_cwksp.h +266 -85
  53. data/contrib/zstd/lib/compress/zstd_double_fast.c +369 -132
  54. data/contrib/zstd/lib/compress/zstd_double_fast.h +3 -2
  55. data/contrib/zstd/lib/compress/zstd_fast.c +722 -258
  56. data/contrib/zstd/lib/compress/zstd_fast.h +3 -2
  57. data/contrib/zstd/lib/compress/zstd_lazy.c +1105 -360
  58. data/contrib/zstd/lib/compress/zstd_lazy.h +41 -1
  59. data/contrib/zstd/lib/compress/zstd_ldm.c +272 -208
  60. data/contrib/zstd/lib/compress/zstd_ldm.h +3 -2
  61. data/contrib/zstd/lib/compress/zstd_ldm_geartab.h +106 -0
  62. data/contrib/zstd/lib/compress/zstd_opt.c +324 -197
  63. data/contrib/zstd/lib/compress/zstd_opt.h +1 -1
  64. data/contrib/zstd/lib/compress/zstdmt_compress.c +109 -53
  65. data/contrib/zstd/lib/compress/zstdmt_compress.h +9 -6
  66. data/contrib/zstd/lib/decompress/huf_decompress.c +1071 -539
  67. data/contrib/zstd/lib/decompress/huf_decompress_amd64.S +576 -0
  68. data/contrib/zstd/lib/decompress/zstd_ddict.c +4 -4
  69. data/contrib/zstd/lib/decompress/zstd_ddict.h +1 -1
  70. data/contrib/zstd/lib/decompress/zstd_decompress.c +507 -82
  71. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +962 -310
  72. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +14 -3
  73. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +54 -6
  74. data/contrib/zstd/lib/deprecated/zbuff.h +1 -1
  75. data/contrib/zstd/lib/deprecated/zbuff_common.c +1 -1
  76. data/contrib/zstd/lib/deprecated/zbuff_compress.c +24 -4
  77. data/contrib/zstd/lib/deprecated/zbuff_decompress.c +3 -1
  78. data/contrib/zstd/lib/dictBuilder/cover.c +44 -32
  79. data/contrib/zstd/lib/dictBuilder/cover.h +6 -5
  80. data/contrib/zstd/lib/dictBuilder/divsufsort.c +1 -1
  81. data/contrib/zstd/lib/dictBuilder/fastcover.c +24 -16
  82. data/contrib/zstd/lib/dictBuilder/zdict.c +88 -95
  83. data/contrib/zstd/lib/legacy/zstd_legacy.h +8 -1
  84. data/contrib/zstd/lib/legacy/zstd_v01.c +16 -53
  85. data/contrib/zstd/lib/legacy/zstd_v01.h +1 -1
  86. data/contrib/zstd/lib/legacy/zstd_v02.c +24 -69
  87. data/contrib/zstd/lib/legacy/zstd_v02.h +1 -1
  88. data/contrib/zstd/lib/legacy/zstd_v03.c +25 -72
  89. data/contrib/zstd/lib/legacy/zstd_v03.h +1 -1
  90. data/contrib/zstd/lib/legacy/zstd_v04.c +23 -69
  91. data/contrib/zstd/lib/legacy/zstd_v04.h +1 -1
  92. data/contrib/zstd/lib/legacy/zstd_v05.c +35 -85
  93. data/contrib/zstd/lib/legacy/zstd_v05.h +1 -1
  94. data/contrib/zstd/lib/legacy/zstd_v06.c +42 -87
  95. data/contrib/zstd/lib/legacy/zstd_v06.h +1 -1
  96. data/contrib/zstd/lib/legacy/zstd_v07.c +35 -82
  97. data/contrib/zstd/lib/legacy/zstd_v07.h +1 -1
  98. data/contrib/zstd/lib/libzstd.mk +214 -0
  99. data/contrib/zstd/lib/libzstd.pc.in +4 -3
  100. data/contrib/zstd/lib/module.modulemap +35 -0
  101. data/contrib/zstd/lib/{dictBuilder/zdict.h → zdict.h} +202 -33
  102. data/contrib/zstd/lib/zstd.h +922 -293
  103. data/contrib/zstd/lib/{common/zstd_errors.h → zstd_errors.h} +27 -8
  104. data/ext/extconf.rb +7 -6
  105. data/ext/extzstd.c +13 -10
  106. data/ext/libzstd_conf.h +0 -1
  107. data/ext/zstd_decompress_asm.S +1 -0
  108. metadata +16 -5
@@ -1,6 +1,6 @@
1
1
  /* ******************************************************************
2
2
  * Huffman encoder, part of New Generation Entropy library
3
- * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
3
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
4
4
  *
5
5
  * You can contact the author at :
6
6
  * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -29,9 +29,9 @@
29
29
  #include "hist.h"
30
30
  #define FSE_STATIC_LINKING_ONLY /* FSE_optimalTableLog_internal */
31
31
  #include "../common/fse.h" /* header compression */
32
- #define HUF_STATIC_LINKING_ONLY
33
32
  #include "../common/huf.h"
34
33
  #include "../common/error_private.h"
34
+ #include "../common/bits.h" /* ZSTD_highbit32 */
35
35
 
36
36
 
37
37
  /* **************************************************************
@@ -42,24 +42,111 @@
42
42
 
43
43
 
44
44
  /* **************************************************************
45
- * Utils
45
+ * Required declarations
46
46
  ****************************************************************/
47
- unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
47
+ typedef struct nodeElt_s {
48
+ U32 count;
49
+ U16 parent;
50
+ BYTE byte;
51
+ BYTE nbBits;
52
+ } nodeElt;
53
+
54
+
55
+ /* **************************************************************
56
+ * Debug Traces
57
+ ****************************************************************/
58
+
59
+ #if DEBUGLEVEL >= 2
60
+
61
+ static size_t showU32(const U32* arr, size_t size)
62
+ {
63
+ size_t u;
64
+ for (u=0; u<size; u++) {
65
+ RAWLOG(6, " %u", arr[u]); (void)arr;
66
+ }
67
+ RAWLOG(6, " \n");
68
+ return size;
69
+ }
70
+
71
+ static size_t HUF_getNbBits(HUF_CElt elt);
72
+
73
+ static size_t showCTableBits(const HUF_CElt* ctable, size_t size)
74
+ {
75
+ size_t u;
76
+ for (u=0; u<size; u++) {
77
+ RAWLOG(6, " %zu", HUF_getNbBits(ctable[u])); (void)ctable;
78
+ }
79
+ RAWLOG(6, " \n");
80
+ return size;
81
+
82
+ }
83
+
84
+ static size_t showHNodeSymbols(const nodeElt* hnode, size_t size)
85
+ {
86
+ size_t u;
87
+ for (u=0; u<size; u++) {
88
+ RAWLOG(6, " %u", hnode[u].byte); (void)hnode;
89
+ }
90
+ RAWLOG(6, " \n");
91
+ return size;
92
+ }
93
+
94
+ static size_t showHNodeBits(const nodeElt* hnode, size_t size)
48
95
  {
49
- return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
96
+ size_t u;
97
+ for (u=0; u<size; u++) {
98
+ RAWLOG(6, " %u", hnode[u].nbBits); (void)hnode;
99
+ }
100
+ RAWLOG(6, " \n");
101
+ return size;
50
102
  }
51
103
 
104
+ #endif
105
+
52
106
 
53
107
  /* *******************************************************
54
108
  * HUF : Huffman block compression
55
109
  *********************************************************/
110
+ #define HUF_WORKSPACE_MAX_ALIGNMENT 8
111
+
112
+ static void* HUF_alignUpWorkspace(void* workspace, size_t* workspaceSizePtr, size_t align)
113
+ {
114
+ size_t const mask = align - 1;
115
+ size_t const rem = (size_t)workspace & mask;
116
+ size_t const add = (align - rem) & mask;
117
+ BYTE* const aligned = (BYTE*)workspace + add;
118
+ assert((align & (align - 1)) == 0); /* pow 2 */
119
+ assert(align <= HUF_WORKSPACE_MAX_ALIGNMENT);
120
+ if (*workspaceSizePtr >= add) {
121
+ assert(add < align);
122
+ assert(((size_t)aligned & mask) == 0);
123
+ *workspaceSizePtr -= add;
124
+ return aligned;
125
+ } else {
126
+ *workspaceSizePtr = 0;
127
+ return NULL;
128
+ }
129
+ }
130
+
131
+
56
132
  /* HUF_compressWeights() :
57
133
  * Same as FSE_compress(), but dedicated to huff0's weights compression.
58
134
  * The use case needs much less stack memory.
59
135
  * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
60
136
  */
61
137
  #define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6
62
- static size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize)
138
+
139
+ typedef struct {
140
+ FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)];
141
+ U32 scratchBuffer[FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(HUF_TABLELOG_MAX, MAX_FSE_TABLELOG_FOR_HUFF_HEADER)];
142
+ unsigned count[HUF_TABLELOG_MAX+1];
143
+ S16 norm[HUF_TABLELOG_MAX+1];
144
+ } HUF_CompressWeightsWksp;
145
+
146
+ static size_t
147
+ HUF_compressWeights(void* dst, size_t dstSize,
148
+ const void* weightTable, size_t wtSize,
149
+ void* workspace, size_t workspaceSize)
63
150
  {
64
151
  BYTE* const ostart = (BYTE*) dst;
65
152
  BYTE* op = ostart;
@@ -67,33 +154,30 @@ static size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weight
67
154
 
68
155
  unsigned maxSymbolValue = HUF_TABLELOG_MAX;
69
156
  U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
157
+ HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
70
158
 
71
- FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)];
72
- BYTE scratchBuffer[FSE_BUILD_CTABLE_WORKSPACE_SIZE(HUF_TABLELOG_MAX, MAX_FSE_TABLELOG_FOR_HUFF_HEADER)];
73
-
74
- unsigned count[HUF_TABLELOG_MAX+1];
75
- S16 norm[HUF_TABLELOG_MAX+1];
159
+ if (workspaceSize < sizeof(HUF_CompressWeightsWksp)) return ERROR(GENERIC);
76
160
 
77
161
  /* init conditions */
78
162
  if (wtSize <= 1) return 0; /* Not compressible */
79
163
 
80
164
  /* Scan input and build symbol stats */
81
- { unsigned const maxCount = HIST_count_simple(count, &maxSymbolValue, weightTable, wtSize); /* never fails */
165
+ { unsigned const maxCount = HIST_count_simple(wksp->count, &maxSymbolValue, weightTable, wtSize); /* never fails */
82
166
  if (maxCount == wtSize) return 1; /* only a single symbol in src : rle */
83
167
  if (maxCount == 1) return 0; /* each symbol present maximum once => not compressible */
84
168
  }
85
169
 
86
170
  tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue);
87
- CHECK_F( FSE_normalizeCount(norm, tableLog, count, wtSize, maxSymbolValue, /* useLowProbCount */ 0) );
171
+ CHECK_F( FSE_normalizeCount(wksp->norm, tableLog, wksp->count, wtSize, maxSymbolValue, /* useLowProbCount */ 0) );
88
172
 
89
173
  /* Write table description header */
90
- { CHECK_V_F(hSize, FSE_writeNCount(op, (size_t)(oend-op), norm, maxSymbolValue, tableLog) );
174
+ { CHECK_V_F(hSize, FSE_writeNCount(op, (size_t)(oend-op), wksp->norm, maxSymbolValue, tableLog) );
91
175
  op += hSize;
92
176
  }
93
177
 
94
178
  /* Compress */
95
- CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, sizeof(scratchBuffer)) );
96
- { CHECK_V_F(cSize, FSE_compress_usingCTable(op, (size_t)(oend - op), weightTable, wtSize, CTable) );
179
+ CHECK_F( FSE_buildCTable_wksp(wksp->CTable, wksp->norm, maxSymbolValue, tableLog, wksp->scratchBuffer, sizeof(wksp->scratchBuffer)) );
180
+ { CHECK_V_F(cSize, FSE_compress_usingCTable(op, (size_t)(oend - op), weightTable, wtSize, wksp->CTable) );
97
181
  if (cSize == 0) return 0; /* not enough space for compressed data */
98
182
  op += cSize;
99
183
  }
@@ -101,30 +185,72 @@ static size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weight
101
185
  return (size_t)(op-ostart);
102
186
  }
103
187
 
188
+ static size_t HUF_getNbBits(HUF_CElt elt)
189
+ {
190
+ return elt & 0xFF;
191
+ }
192
+
193
+ static size_t HUF_getNbBitsFast(HUF_CElt elt)
194
+ {
195
+ return elt;
196
+ }
197
+
198
+ static size_t HUF_getValue(HUF_CElt elt)
199
+ {
200
+ return elt & ~(size_t)0xFF;
201
+ }
202
+
203
+ static size_t HUF_getValueFast(HUF_CElt elt)
204
+ {
205
+ return elt;
206
+ }
207
+
208
+ static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits)
209
+ {
210
+ assert(nbBits <= HUF_TABLELOG_ABSOLUTEMAX);
211
+ *elt = nbBits;
212
+ }
104
213
 
105
- /*! HUF_writeCTable() :
106
- `CTable` : Huffman tree to save, using huf representation.
107
- @return : size of saved CTable */
108
- size_t HUF_writeCTable (void* dst, size_t maxDstSize,
109
- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog)
214
+ static void HUF_setValue(HUF_CElt* elt, size_t value)
110
215
  {
216
+ size_t const nbBits = HUF_getNbBits(*elt);
217
+ if (nbBits > 0) {
218
+ assert((value >> nbBits) == 0);
219
+ *elt |= value << (sizeof(HUF_CElt) * 8 - nbBits);
220
+ }
221
+ }
222
+
223
+ typedef struct {
224
+ HUF_CompressWeightsWksp wksp;
111
225
  BYTE bitsToWeight[HUF_TABLELOG_MAX + 1]; /* precomputed conversion table */
112
226
  BYTE huffWeight[HUF_SYMBOLVALUE_MAX];
227
+ } HUF_WriteCTableWksp;
228
+
229
+ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
230
+ const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog,
231
+ void* workspace, size_t workspaceSize)
232
+ {
233
+ HUF_CElt const* const ct = CTable + 1;
113
234
  BYTE* op = (BYTE*)dst;
114
235
  U32 n;
236
+ HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
115
237
 
116
- /* check conditions */
238
+ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE >= sizeof(HUF_WriteCTableWksp));
239
+
240
+ /* check conditions */
241
+ if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC);
117
242
  if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
118
243
 
119
244
  /* convert to weight */
120
- bitsToWeight[0] = 0;
245
+ wksp->bitsToWeight[0] = 0;
121
246
  for (n=1; n<huffLog+1; n++)
122
- bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
247
+ wksp->bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
123
248
  for (n=0; n<maxSymbolValue; n++)
124
- huffWeight[n] = bitsToWeight[CTable[n].nbBits];
249
+ wksp->huffWeight[n] = wksp->bitsToWeight[HUF_getNbBits(ct[n])];
125
250
 
126
251
  /* attempt weights compression by FSE */
127
- { CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, huffWeight, maxSymbolValue) );
252
+ if (maxDstSize < 1) return ERROR(dstSize_tooSmall);
253
+ { CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, wksp->huffWeight, maxSymbolValue, &wksp->wksp, sizeof(wksp->wksp)) );
128
254
  if ((hSize>1) & (hSize < maxSymbolValue/2)) { /* FSE compressed */
129
255
  op[0] = (BYTE)hSize;
130
256
  return hSize+1;
@@ -134,9 +260,9 @@ size_t HUF_writeCTable (void* dst, size_t maxDstSize,
134
260
  if (maxSymbolValue > (256-128)) return ERROR(GENERIC); /* should not happen : likely means source cannot be compressed */
135
261
  if (((maxSymbolValue+1)/2) + 1 > maxDstSize) return ERROR(dstSize_tooSmall); /* not enough space within dst buffer */
136
262
  op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue-1));
137
- huffWeight[maxSymbolValue] = 0; /* to be sure it doesn't cause msan issue in final combination */
263
+ wksp->huffWeight[maxSymbolValue] = 0; /* to be sure it doesn't cause msan issue in final combination */
138
264
  for (n=0; n<maxSymbolValue; n+=2)
139
- op[(n/2)+1] = (BYTE)((huffWeight[n] << 4) + huffWeight[n+1]);
265
+ op[(n/2)+1] = (BYTE)((wksp->huffWeight[n] << 4) + wksp->huffWeight[n+1]);
140
266
  return ((maxSymbolValue+1)/2) + 1;
141
267
  }
142
268
 
@@ -147,6 +273,7 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
147
273
  U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */
148
274
  U32 tableLog = 0;
149
275
  U32 nbSymbols = 0;
276
+ HUF_CElt* const ct = CTable + 1;
150
277
 
151
278
  /* get symbol weights */
152
279
  CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize));
@@ -156,6 +283,8 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
156
283
  if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
157
284
  if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);
158
285
 
286
+ CTable[0] = tableLog;
287
+
159
288
  /* Prepare base value per rank */
160
289
  { U32 n, nextRankStart = 0;
161
290
  for (n=1; n<=tableLog; n++) {
@@ -167,13 +296,13 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
167
296
  /* fill nbBits */
168
297
  { U32 n; for (n=0; n<nbSymbols; n++) {
169
298
  const U32 w = huffWeight[n];
170
- CTable[n].nbBits = (BYTE)(tableLog + 1 - w) & -(w != 0);
299
+ HUF_setNbBits(ct + n, (BYTE)(tableLog + 1 - w) & -(w != 0));
171
300
  } }
172
301
 
173
302
  /* fill val */
174
303
  { U16 nbPerRank[HUF_TABLELOG_MAX+2] = {0}; /* support w=0=>n=tableLog+1 */
175
304
  U16 valPerRank[HUF_TABLELOG_MAX+2] = {0};
176
- { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[CTable[n].nbBits]++; }
305
+ { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[HUF_getNbBits(ct[n])]++; }
177
306
  /* determine stating value per rank */
178
307
  valPerRank[tableLog+1] = 0; /* for w==0 */
179
308
  { U16 min = 0;
@@ -183,77 +312,73 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
183
312
  min >>= 1;
184
313
  } }
185
314
  /* assign value within rank, symbol order */
186
- { U32 n; for (n=0; n<nbSymbols; n++) CTable[n].val = valPerRank[CTable[n].nbBits]++; }
315
+ { U32 n; for (n=0; n<nbSymbols; n++) HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); }
187
316
  }
188
317
 
189
318
  *maxSymbolValuePtr = nbSymbols - 1;
190
319
  return readSize;
191
320
  }
192
321
 
193
- U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue)
322
+ U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue)
194
323
  {
195
- const HUF_CElt* table = (const HUF_CElt*)symbolTable;
324
+ const HUF_CElt* const ct = CTable + 1;
196
325
  assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
197
- return table[symbolValue].nbBits;
326
+ return (U32)HUF_getNbBits(ct[symbolValue]);
198
327
  }
199
328
 
200
329
 
201
- typedef struct nodeElt_s {
202
- U32 count;
203
- U16 parent;
204
- BYTE byte;
205
- BYTE nbBits;
206
- } nodeElt;
207
-
208
330
  /**
209
331
  * HUF_setMaxHeight():
210
- * Enforces maxNbBits on the Huffman tree described in huffNode.
332
+ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode.
211
333
  *
212
- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts
213
- * the tree to so that it is a valid canonical Huffman tree.
334
+ * It attempts to convert all nodes with nbBits > @targetNbBits
335
+ * to employ @targetNbBits instead. Then it adjusts the tree
336
+ * so that it remains a valid canonical Huffman tree.
214
337
  *
215
338
  * @pre The sum of the ranks of each symbol == 2^largestBits,
216
339
  * where largestBits == huffNode[lastNonNull].nbBits.
217
340
  * @post The sum of the ranks of each symbol == 2^largestBits,
218
- * where largestBits is the return value <= maxNbBits.
341
+ * where largestBits is the return value (expected <= targetNbBits).
219
342
  *
220
- * @param huffNode The Huffman tree modified in place to enforce maxNbBits.
343
+ * @param huffNode The Huffman tree modified in place to enforce targetNbBits.
344
+ * It's presumed sorted, from most frequent to rarest symbol.
221
345
  * @param lastNonNull The symbol with the lowest count in the Huffman tree.
222
- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree
346
+ * @param targetNbBits The allowed number of bits, which the Huffman tree
223
347
  * may not respect. After this function the Huffman tree will
224
- * respect maxNbBits.
225
- * @return The maximum number of bits of the Huffman tree after adjustment,
226
- * necessarily no more than maxNbBits.
348
+ * respect targetNbBits.
349
+ * @return The maximum number of bits of the Huffman tree after adjustment.
227
350
  */
228
- static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
351
+ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits)
229
352
  {
230
353
  const U32 largestBits = huffNode[lastNonNull].nbBits;
231
- /* early exit : no elt > maxNbBits, so the tree is already valid. */
232
- if (largestBits <= maxNbBits) return largestBits;
354
+ /* early exit : no elt > targetNbBits, so the tree is already valid. */
355
+ if (largestBits <= targetNbBits) return largestBits;
356
+
357
+ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits);
233
358
 
234
359
  /* there are several too large elements (at least >= 2) */
235
360
  { int totalCost = 0;
236
- const U32 baseCost = 1 << (largestBits - maxNbBits);
361
+ const U32 baseCost = 1 << (largestBits - targetNbBits);
237
362
  int n = (int)lastNonNull;
238
363
 
239
- /* Adjust any ranks > maxNbBits to maxNbBits.
364
+ /* Adjust any ranks > targetNbBits to targetNbBits.
240
365
  * Compute totalCost, which is how far the sum of the ranks is
241
366
  * we are over 2^largestBits after adjust the offending ranks.
242
367
  */
243
- while (huffNode[n].nbBits > maxNbBits) {
368
+ while (huffNode[n].nbBits > targetNbBits) {
244
369
  totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
245
- huffNode[n].nbBits = (BYTE)maxNbBits;
370
+ huffNode[n].nbBits = (BYTE)targetNbBits;
246
371
  n--;
247
372
  }
248
- /* n stops at huffNode[n].nbBits <= maxNbBits */
249
- assert(huffNode[n].nbBits <= maxNbBits);
250
- /* n end at index of smallest symbol using < maxNbBits */
251
- while (huffNode[n].nbBits == maxNbBits) --n;
373
+ /* n stops at huffNode[n].nbBits <= targetNbBits */
374
+ assert(huffNode[n].nbBits <= targetNbBits);
375
+ /* n end at index of smallest symbol using < targetNbBits */
376
+ while (huffNode[n].nbBits == targetNbBits) --n;
252
377
 
253
- /* renorm totalCost from 2^largestBits to 2^maxNbBits
378
+ /* renorm totalCost from 2^largestBits to 2^targetNbBits
254
379
  * note : totalCost is necessarily a multiple of baseCost */
255
- assert((totalCost & (baseCost - 1)) == 0);
256
- totalCost >>= (largestBits - maxNbBits);
380
+ assert(((U32)totalCost & (baseCost - 1)) == 0);
381
+ totalCost >>= (largestBits - targetNbBits);
257
382
  assert(totalCost > 0);
258
383
 
259
384
  /* repay normalized cost */
@@ -262,19 +387,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
262
387
 
263
388
  /* Get pos of last (smallest = lowest cum. count) symbol per rank */
264
389
  ZSTD_memset(rankLast, 0xF0, sizeof(rankLast));
265
- { U32 currentNbBits = maxNbBits;
390
+ { U32 currentNbBits = targetNbBits;
266
391
  int pos;
267
392
  for (pos=n ; pos >= 0; pos--) {
268
393
  if (huffNode[pos].nbBits >= currentNbBits) continue;
269
- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */
270
- rankLast[maxNbBits-currentNbBits] = (U32)pos;
394
+ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */
395
+ rankLast[targetNbBits-currentNbBits] = (U32)pos;
271
396
  } }
272
397
 
273
398
  while (totalCost > 0) {
274
399
  /* Try to reduce the next power of 2 above totalCost because we
275
400
  * gain back half the rank.
276
401
  */
277
- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1;
402
+ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1;
278
403
  for ( ; nBitsToDecrease > 1; nBitsToDecrease--) {
279
404
  U32 const highPos = rankLast[nBitsToDecrease];
280
405
  U32 const lowPos = rankLast[nBitsToDecrease-1];
@@ -314,7 +439,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
314
439
  rankLast[nBitsToDecrease] = noSymbol;
315
440
  else {
316
441
  rankLast[nBitsToDecrease]--;
317
- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease)
442
+ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease)
318
443
  rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */
319
444
  }
320
445
  } /* while (totalCost > 0) */
@@ -326,11 +451,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
326
451
  * TODO.
327
452
  */
328
453
  while (totalCost < 0) { /* Sometimes, cost correction overshoot */
329
- /* special case : no rank 1 symbol (using maxNbBits-1);
330
- * let's create one from largest rank 0 (using maxNbBits).
454
+ /* special case : no rank 1 symbol (using targetNbBits-1);
455
+ * let's create one from largest rank 0 (using targetNbBits).
331
456
  */
332
457
  if (rankLast[1] == noSymbol) {
333
- while (huffNode[n].nbBits == maxNbBits) n--;
458
+ while (huffNode[n].nbBits == targetNbBits) n--;
334
459
  huffNode[n+1].nbBits--;
335
460
  assert(n >= 0);
336
461
  rankLast[1] = (U32)(n+1);
@@ -344,26 +469,122 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
344
469
  } /* repay normalized cost */
345
470
  } /* there are several too large elements (at least >= 2) */
346
471
 
347
- return maxNbBits;
472
+ return targetNbBits;
348
473
  }
349
474
 
350
475
  typedef struct {
351
- U32 base;
352
- U32 curr;
476
+ U16 base;
477
+ U16 curr;
353
478
  } rankPos;
354
479
 
355
- typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
480
+ typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)];
356
481
 
357
- #define RANK_POSITION_TABLE_SIZE 32
482
+ /* Number of buckets available for HUF_sort() */
483
+ #define RANK_POSITION_TABLE_SIZE 192
358
484
 
359
485
  typedef struct {
360
486
  huffNodeTable huffNodeTbl;
361
487
  rankPos rankPosition[RANK_POSITION_TABLE_SIZE];
362
488
  } HUF_buildCTable_wksp_tables;
363
489
 
490
+ /* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing.
491
+ * Strategy is to use as many buckets as possible for representing distinct
492
+ * counts while using the remainder to represent all "large" counts.
493
+ *
494
+ * To satisfy this requirement for 192 buckets, we can do the following:
495
+ * Let buckets 0-166 represent distinct counts of [0, 166]
496
+ * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing.
497
+ */
498
+ #define RANK_POSITION_MAX_COUNT_LOG 32
499
+ #define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */)
500
+ #define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */)
501
+
502
+ /* Return the appropriate bucket index for a given count. See definition of
503
+ * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
504
+ */
505
+ static U32 HUF_getIndex(U32 const count) {
506
+ return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF)
507
+ ? count
508
+ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;
509
+ }
510
+
511
+ /* Helper swap function for HUF_quickSortPartition() */
512
+ static void HUF_swapNodes(nodeElt* a, nodeElt* b) {
513
+ nodeElt tmp = *a;
514
+ *a = *b;
515
+ *b = tmp;
516
+ }
517
+
518
+ /* Returns 0 if the huffNode array is not sorted by descending count */
519
+ MEM_STATIC int HUF_isSorted(nodeElt huffNode[], U32 const maxSymbolValue1) {
520
+ U32 i;
521
+ for (i = 1; i < maxSymbolValue1; ++i) {
522
+ if (huffNode[i].count > huffNode[i-1].count) {
523
+ return 0;
524
+ }
525
+ }
526
+ return 1;
527
+ }
528
+
529
+ /* Insertion sort by descending order */
530
+ HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const low, int const high) {
531
+ int i;
532
+ int const size = high-low+1;
533
+ huffNode += low;
534
+ for (i = 1; i < size; ++i) {
535
+ nodeElt const key = huffNode[i];
536
+ int j = i - 1;
537
+ while (j >= 0 && huffNode[j].count < key.count) {
538
+ huffNode[j + 1] = huffNode[j];
539
+ j--;
540
+ }
541
+ huffNode[j + 1] = key;
542
+ }
543
+ }
544
+
545
+ /* Pivot helper function for quicksort. */
546
+ static int HUF_quickSortPartition(nodeElt arr[], int const low, int const high) {
547
+ /* Simply select rightmost element as pivot. "Better" selectors like
548
+ * median-of-three don't experimentally appear to have any benefit.
549
+ */
550
+ U32 const pivot = arr[high].count;
551
+ int i = low - 1;
552
+ int j = low;
553
+ for ( ; j < high; j++) {
554
+ if (arr[j].count > pivot) {
555
+ i++;
556
+ HUF_swapNodes(&arr[i], &arr[j]);
557
+ }
558
+ }
559
+ HUF_swapNodes(&arr[i + 1], &arr[high]);
560
+ return i + 1;
561
+ }
562
+
563
+ /* Classic quicksort by descending with partially iterative calls
564
+ * to reduce worst case callstack size.
565
+ */
566
+ static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) {
567
+ int const kInsertionSortThreshold = 8;
568
+ if (high - low < kInsertionSortThreshold) {
569
+ HUF_insertionSort(arr, low, high);
570
+ return;
571
+ }
572
+ while (low < high) {
573
+ int const idx = HUF_quickSortPartition(arr, low, high);
574
+ if (idx - low < high - idx) {
575
+ HUF_simpleQuickSort(arr, low, idx - 1);
576
+ low = idx + 1;
577
+ } else {
578
+ HUF_simpleQuickSort(arr, idx + 1, high);
579
+ high = idx - 1;
580
+ }
581
+ }
582
+ }
583
+
364
584
  /**
365
585
  * HUF_sort():
366
586
  * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order.
587
+ * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket.
367
588
  *
368
589
  * @param[out] huffNode Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled.
369
590
  * Must have (maxSymbolValue + 1) entries.
@@ -371,42 +592,51 @@ typedef struct {
371
592
  * @param[in] maxSymbolValue Maximum symbol value.
372
593
  * @param rankPosition This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries.
373
594
  */
374
- static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymbolValue, rankPos* rankPosition)
375
- {
376
- int n;
377
- int const maxSymbolValue1 = (int)maxSymbolValue + 1;
595
+ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSymbolValue, rankPos rankPosition[]) {
596
+ U32 n;
597
+ U32 const maxSymbolValue1 = maxSymbolValue+1;
378
598
 
379
599
  /* Compute base and set curr to base.
380
- * For symbol s let lowerRank = BIT_highbit32(count[n]+1) and rank = lowerRank + 1.
381
- * Then 2^lowerRank <= count[n]+1 <= 2^rank.
600
+ * For symbol s let lowerRank = HUF_getIndex(count[n]) and rank = lowerRank + 1.
601
+ * See HUF_getIndex to see bucketing strategy.
382
602
  * We attribute each symbol to lowerRank's base value, because we want to know where
383
603
  * each rank begins in the output, so for rank R we want to count ranks R+1 and above.
384
604
  */
385
605
  ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE);
386
606
  for (n = 0; n < maxSymbolValue1; ++n) {
387
- U32 lowerRank = BIT_highbit32(count[n] + 1);
607
+ U32 lowerRank = HUF_getIndex(count[n]);
608
+ assert(lowerRank < RANK_POSITION_TABLE_SIZE - 1);
388
609
  rankPosition[lowerRank].base++;
389
610
  }
611
+
390
612
  assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base == 0);
613
+ /* Set up the rankPosition table */
391
614
  for (n = RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) {
392
615
  rankPosition[n-1].base += rankPosition[n].base;
393
616
  rankPosition[n-1].curr = rankPosition[n-1].base;
394
617
  }
395
- /* Sort */
618
+
619
+ /* Insert each symbol into their appropriate bucket, setting up rankPosition table. */
396
620
  for (n = 0; n < maxSymbolValue1; ++n) {
397
621
  U32 const c = count[n];
398
- U32 const r = BIT_highbit32(c+1) + 1;
399
- U32 pos = rankPosition[r].curr++;
400
- /* Insert into the correct position in the rank.
401
- * We have at most 256 symbols, so this insertion should be fine.
402
- */
403
- while ((pos > rankPosition[r].base) && (c > huffNode[pos-1].count)) {
404
- huffNode[pos] = huffNode[pos-1];
405
- pos--;
406
- }
622
+ U32 const r = HUF_getIndex(c) + 1;
623
+ U32 const pos = rankPosition[r].curr++;
624
+ assert(pos < maxSymbolValue1);
407
625
  huffNode[pos].count = c;
408
626
  huffNode[pos].byte = (BYTE)n;
409
627
  }
628
+
629
+ /* Sort each bucket. */
630
+ for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) {
631
+ int const bucketSize = rankPosition[n].curr - rankPosition[n].base;
632
+ U32 const bucketStartIdx = rankPosition[n].base;
633
+ if (bucketSize > 1) {
634
+ assert(bucketStartIdx < maxSymbolValue1);
635
+ HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize-1);
636
+ }
637
+ }
638
+
639
+ assert(HUF_isSorted(huffNode, maxSymbolValue1));
410
640
  }
411
641
 
412
642
 
@@ -430,6 +660,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
430
660
  int lowS, lowN;
431
661
  int nodeNb = STARTNODE;
432
662
  int n, nodeRoot;
663
+ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1);
433
664
  /* init for parents */
434
665
  nonNullRank = (int)maxSymbolValue;
435
666
  while(huffNode[nonNullRank].count == 0) nonNullRank--;
@@ -456,6 +687,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
456
687
  for (n=0; n<=nonNullRank; n++)
457
688
  huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
458
689
 
690
+ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1));
691
+
459
692
  return nonNullRank;
460
693
  }
461
694
 
@@ -471,6 +704,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
471
704
  */
472
705
  static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits)
473
706
  {
707
+ HUF_CElt* const ct = CTable + 1;
474
708
  /* fill result into ctable (val, nbBits) */
475
709
  int n;
476
710
  U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0};
@@ -486,127 +720,373 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i
486
720
  min >>= 1;
487
721
  } }
488
722
  for (n=0; n<alphabetSize; n++)
489
- CTable[huffNode[n].byte].nbBits = huffNode[n].nbBits; /* push nbBits per symbol, symbol order */
723
+ HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */
490
724
  for (n=0; n<alphabetSize; n++)
491
- CTable[n].val = valPerRank[CTable[n].nbBits]++; /* assign value within rank, symbol order */
725
+ HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); /* assign value within rank, symbol order */
726
+ CTable[0] = maxNbBits;
492
727
  }
493
728
 
494
- size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
729
+ size_t
730
+ HUF_buildCTable_wksp(HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
731
+ void* workSpace, size_t wkspSize)
495
732
  {
496
- HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)workSpace;
733
+ HUF_buildCTable_wksp_tables* const wksp_tables =
734
+ (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
497
735
  nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
498
736
  nodeElt* const huffNode = huffNode0+1;
499
737
  int nonNullRank;
500
738
 
739
+ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables));
740
+
741
+ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1);
742
+
501
743
  /* safety checks */
502
- if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
503
744
  if (wkspSize < sizeof(HUF_buildCTable_wksp_tables))
504
- return ERROR(workSpace_tooSmall);
745
+ return ERROR(workSpace_tooSmall);
505
746
  if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
506
747
  if (maxSymbolValue > HUF_SYMBOLVALUE_MAX)
507
- return ERROR(maxSymbolValue_tooLarge);
748
+ return ERROR(maxSymbolValue_tooLarge);
508
749
  ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable));
509
750
 
510
751
  /* sort, decreasing order */
511
752
  HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition);
753
+ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1));
512
754
 
513
755
  /* build tree */
514
756
  nonNullRank = HUF_buildTree(huffNode, maxSymbolValue);
515
757
 
516
- /* enforce maxTableLog */
758
+ /* determine and enforce maxTableLog */
517
759
  maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
518
760
  if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */
519
761
 
520
- HUF_buildCTableFromTree(tree, huffNode, nonNullRank, maxSymbolValue, maxNbBits);
762
+ HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue, maxNbBits);
521
763
 
522
764
  return maxNbBits;
523
765
  }
524
766
 
525
767
  size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
526
768
  {
769
+ HUF_CElt const* ct = CTable + 1;
527
770
  size_t nbBits = 0;
528
771
  int s;
529
772
  for (s = 0; s <= (int)maxSymbolValue; ++s) {
530
- nbBits += CTable[s].nbBits * count[s];
773
+ nbBits += HUF_getNbBits(ct[s]) * count[s];
531
774
  }
532
775
  return nbBits >> 3;
533
776
  }
534
777
 
535
778
  int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
779
+ HUF_CElt const* ct = CTable + 1;
536
780
  int bad = 0;
537
781
  int s;
538
782
  for (s = 0; s <= (int)maxSymbolValue; ++s) {
539
- bad |= (count[s] != 0) & (CTable[s].nbBits == 0);
783
+ bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0);
540
784
  }
541
785
  return !bad;
542
786
  }
543
787
 
544
788
  size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
545
789
 
790
+ /** HUF_CStream_t:
791
+ * Huffman uses its own BIT_CStream_t implementation.
792
+ * There are three major differences from BIT_CStream_t:
793
+ * 1. HUF_addBits() takes a HUF_CElt (size_t) which is
794
+ * the pair (nbBits, value) in the format:
795
+ * format:
796
+ * - Bits [0, 4) = nbBits
797
+ * - Bits [4, 64 - nbBits) = 0
798
+ * - Bits [64 - nbBits, 64) = value
799
+ * 2. The bitContainer is built from the upper bits and
800
+ * right shifted. E.g. to add a new value of N bits
801
+ * you right shift the bitContainer by N, then or in
802
+ * the new value into the N upper bits.
803
+ * 3. The bitstream has two bit containers. You can add
804
+ * bits to the second container and merge them into
805
+ * the first container.
806
+ */
807
+
808
+ #define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8)
809
+
810
+ typedef struct {
811
+ size_t bitContainer[2];
812
+ size_t bitPos[2];
813
+
814
+ BYTE* startPtr;
815
+ BYTE* ptr;
816
+ BYTE* endPtr;
817
+ } HUF_CStream_t;
818
+
819
+ /**! HUF_initCStream():
820
+ * Initializes the bitstream.
821
+ * @returns 0 or an error code.
822
+ */
823
+ static size_t HUF_initCStream(HUF_CStream_t* bitC,
824
+ void* startPtr, size_t dstCapacity)
825
+ {
826
+ ZSTD_memset(bitC, 0, sizeof(*bitC));
827
+ bitC->startPtr = (BYTE*)startPtr;
828
+ bitC->ptr = bitC->startPtr;
829
+ bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer[0]);
830
+ if (dstCapacity <= sizeof(bitC->bitContainer[0])) return ERROR(dstSize_tooSmall);
831
+ return 0;
832
+ }
833
+
834
+ /*! HUF_addBits():
835
+ * Adds the symbol stored in HUF_CElt elt to the bitstream.
836
+ *
837
+ * @param elt The element we're adding. This is a (nbBits, value) pair.
838
+ * See the HUF_CStream_t docs for the format.
839
+ * @param idx Insert into the bitstream at this idx.
840
+ * @param kFast This is a template parameter. If the bitstream is guaranteed
841
+ * to have at least 4 unused bits after this call it may be 1,
842
+ * otherwise it must be 0. HUF_addBits() is faster when fast is set.
843
+ */
844
+ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int idx, int kFast)
845
+ {
846
+ assert(idx <= 1);
847
+ assert(HUF_getNbBits(elt) <= HUF_TABLELOG_ABSOLUTEMAX);
848
+ /* This is efficient on x86-64 with BMI2 because shrx
849
+ * only reads the low 6 bits of the register. The compiler
850
+ * knows this and elides the mask. When fast is set,
851
+ * every operation can use the same value loaded from elt.
852
+ */
853
+ bitC->bitContainer[idx] >>= HUF_getNbBits(elt);
854
+ bitC->bitContainer[idx] |= kFast ? HUF_getValueFast(elt) : HUF_getValue(elt);
855
+ /* We only read the low 8 bits of bitC->bitPos[idx] so it
856
+ * doesn't matter that the high bits have noise from the value.
857
+ */
858
+ bitC->bitPos[idx] += HUF_getNbBitsFast(elt);
859
+ assert((bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER);
860
+ /* The last 4-bits of elt are dirty if fast is set,
861
+ * so we must not be overwriting bits that have already been
862
+ * inserted into the bit container.
863
+ */
864
+ #if DEBUGLEVEL >= 1
865
+ {
866
+ size_t const nbBits = HUF_getNbBits(elt);
867
+ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1;
868
+ (void)dirtyBits;
869
+ /* Middle bits are 0. */
870
+ assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0);
871
+ /* We didn't overwrite any bits in the bit container. */
872
+ assert(!kFast || (bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER);
873
+ (void)dirtyBits;
874
+ }
875
+ #endif
876
+ }
877
+
878
+ FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC)
879
+ {
880
+ bitC->bitContainer[1] = 0;
881
+ bitC->bitPos[1] = 0;
882
+ }
883
+
884
+ /*! HUF_mergeIndex1() :
885
+ * Merges the bit container @ index 1 into the bit container @ index 0
886
+ * and zeros the bit container @ index 1.
887
+ */
888
+ FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC)
889
+ {
890
+ assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER);
891
+ bitC->bitContainer[0] >>= (bitC->bitPos[1] & 0xFF);
892
+ bitC->bitContainer[0] |= bitC->bitContainer[1];
893
+ bitC->bitPos[0] += bitC->bitPos[1];
894
+ assert((bitC->bitPos[0] & 0xFF) <= HUF_BITS_IN_CONTAINER);
895
+ }
896
+
897
+ /*! HUF_flushBits() :
898
+ * Flushes the bits in the bit container @ index 0.
899
+ *
900
+ * @post bitPos will be < 8.
901
+ * @param kFast If kFast is set then we must know a-priori that
902
+ * the bit container will not overflow.
903
+ */
904
+ FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast)
905
+ {
906
+ /* The upper bits of bitPos are noisy, so we must mask by 0xFF. */
907
+ size_t const nbBits = bitC->bitPos[0] & 0xFF;
908
+ size_t const nbBytes = nbBits >> 3;
909
+ /* The top nbBits bits of bitContainer are the ones we need. */
910
+ size_t const bitContainer = bitC->bitContainer[0] >> (HUF_BITS_IN_CONTAINER - nbBits);
911
+ /* Mask bitPos to account for the bytes we consumed. */
912
+ bitC->bitPos[0] &= 7;
913
+ assert(nbBits > 0);
914
+ assert(nbBits <= sizeof(bitC->bitContainer[0]) * 8);
915
+ assert(bitC->ptr <= bitC->endPtr);
916
+ MEM_writeLEST(bitC->ptr, bitContainer);
917
+ bitC->ptr += nbBytes;
918
+ assert(!kFast || bitC->ptr <= bitC->endPtr);
919
+ if (!kFast && bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
920
+ /* bitContainer doesn't need to be modified because the leftover
921
+ * bits are already the top bitPos bits. And we don't care about
922
+ * noise in the lower values.
923
+ */
924
+ }
925
+
926
+ /*! HUF_endMark()
927
+ * @returns The Huffman stream end mark: A 1-bit value = 1.
928
+ */
929
+ static HUF_CElt HUF_endMark(void)
930
+ {
931
+ HUF_CElt endMark;
932
+ HUF_setNbBits(&endMark, 1);
933
+ HUF_setValue(&endMark, 1);
934
+ return endMark;
935
+ }
936
+
937
+ /*! HUF_closeCStream() :
938
+ * @return Size of CStream, in bytes,
939
+ * or 0 if it could not fit into dstBuffer */
940
+ static size_t HUF_closeCStream(HUF_CStream_t* bitC)
941
+ {
942
+ HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0);
943
+ HUF_flushBits(bitC, /* kFast */ 0);
944
+ {
945
+ size_t const nbBits = bitC->bitPos[0] & 0xFF;
946
+ if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
947
+ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0);
948
+ }
949
+ }
950
+
546
951
  FORCE_INLINE_TEMPLATE void
547
- HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
952
+ HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable, int idx, int fast)
548
953
  {
549
- BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits);
954
+ HUF_addBits(bitCPtr, CTable[symbol], idx, fast);
550
955
  }
551
956
 
552
- #define HUF_FLUSHBITS(s) BIT_flushBits(s)
957
+ FORCE_INLINE_TEMPLATE void
958
+ HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC,
959
+ const BYTE* ip, size_t srcSize,
960
+ const HUF_CElt* ct,
961
+ int kUnroll, int kFastFlush, int kLastFast)
962
+ {
963
+ /* Join to kUnroll */
964
+ int n = (int)srcSize;
965
+ int rem = n % kUnroll;
966
+ if (rem > 0) {
967
+ for (; rem > 0; --rem) {
968
+ HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0);
969
+ }
970
+ HUF_flushBits(bitC, kFastFlush);
971
+ }
972
+ assert(n % kUnroll == 0);
553
973
 
554
- #define HUF_FLUSHBITS_1(stream) \
555
- if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream)
974
+ /* Join to 2 * kUnroll */
975
+ if (n % (2 * kUnroll)) {
976
+ int u;
977
+ for (u = 1; u < kUnroll; ++u) {
978
+ HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1);
979
+ }
980
+ HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast);
981
+ HUF_flushBits(bitC, kFastFlush);
982
+ n -= kUnroll;
983
+ }
984
+ assert(n % (2 * kUnroll) == 0);
985
+
986
+ for (; n>0; n-= 2 * kUnroll) {
987
+ /* Encode kUnroll symbols into the bitstream @ index 0. */
988
+ int u;
989
+ for (u = 1; u < kUnroll; ++u) {
990
+ HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ 1);
991
+ }
992
+ HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast */ kLastFast);
993
+ HUF_flushBits(bitC, kFastFlush);
994
+ /* Encode kUnroll symbols into the bitstream @ index 1.
995
+ * This allows us to start filling the bit container
996
+ * without any data dependencies.
997
+ */
998
+ HUF_zeroIndex1(bitC);
999
+ for (u = 1; u < kUnroll; ++u) {
1000
+ HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /* fast */ 1);
1001
+ }
1002
+ HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1, /* fast */ kLastFast);
1003
+ /* Merge bitstream @ index 1 into the bitstream @ index 0 */
1004
+ HUF_mergeIndex1(bitC);
1005
+ HUF_flushBits(bitC, kFastFlush);
1006
+ }
1007
+ assert(n == 0);
1008
+
1009
+ }
1010
+
1011
+ /**
1012
+ * Returns a tight upper bound on the output space needed by Huffman
1013
+ * with 8 bytes buffer to handle over-writes. If the output is at least
1014
+ * this large we don't need to do bounds checks during Huffman encoding.
1015
+ */
1016
+ static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog)
1017
+ {
1018
+ return ((srcSize * tableLog) >> 3) + 8;
1019
+ }
556
1020
 
557
- #define HUF_FLUSHBITS_2(stream) \
558
- if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream)
559
1021
 
560
1022
  FORCE_INLINE_TEMPLATE size_t
561
1023
  HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize,
562
1024
  const void* src, size_t srcSize,
563
1025
  const HUF_CElt* CTable)
564
1026
  {
1027
+ U32 const tableLog = (U32)CTable[0];
1028
+ HUF_CElt const* ct = CTable + 1;
565
1029
  const BYTE* ip = (const BYTE*) src;
566
1030
  BYTE* const ostart = (BYTE*)dst;
567
1031
  BYTE* const oend = ostart + dstSize;
568
1032
  BYTE* op = ostart;
569
- size_t n;
570
- BIT_CStream_t bitC;
1033
+ HUF_CStream_t bitC;
571
1034
 
572
1035
  /* init */
573
1036
  if (dstSize < 8) return 0; /* not enough space to compress */
574
- { size_t const initErr = BIT_initCStream(&bitC, op, (size_t)(oend-op));
1037
+ { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op));
575
1038
  if (HUF_isError(initErr)) return 0; }
576
1039
 
577
- n = srcSize & ~3; /* join to mod 4 */
578
- switch (srcSize & 3)
579
- {
580
- case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable);
581
- HUF_FLUSHBITS_2(&bitC);
582
- /* fall-through */
583
- case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable);
584
- HUF_FLUSHBITS_1(&bitC);
585
- /* fall-through */
586
- case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable);
587
- HUF_FLUSHBITS(&bitC);
588
- /* fall-through */
589
- case 0 : /* fall-through */
590
- default: break;
591
- }
592
-
593
- for (; n>0; n-=4) { /* note : n&3==0 at this stage */
594
- HUF_encodeSymbol(&bitC, ip[n- 1], CTable);
595
- HUF_FLUSHBITS_1(&bitC);
596
- HUF_encodeSymbol(&bitC, ip[n- 2], CTable);
597
- HUF_FLUSHBITS_2(&bitC);
598
- HUF_encodeSymbol(&bitC, ip[n- 3], CTable);
599
- HUF_FLUSHBITS_1(&bitC);
600
- HUF_encodeSymbol(&bitC, ip[n- 4], CTable);
601
- HUF_FLUSHBITS(&bitC);
1040
+ if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11)
1041
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ MEM_32bits() ? 2 : 4, /* kFast */ 0, /* kLastFast */ 0);
1042
+ else {
1043
+ if (MEM_32bits()) {
1044
+ switch (tableLog) {
1045
+ case 11:
1046
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0);
1047
+ break;
1048
+ case 10: ZSTD_FALLTHROUGH;
1049
+ case 9: ZSTD_FALLTHROUGH;
1050
+ case 8:
1051
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1);
1052
+ break;
1053
+ case 7: ZSTD_FALLTHROUGH;
1054
+ default:
1055
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1);
1056
+ break;
1057
+ }
1058
+ } else {
1059
+ switch (tableLog) {
1060
+ case 11:
1061
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0);
1062
+ break;
1063
+ case 10:
1064
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1);
1065
+ break;
1066
+ case 9:
1067
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0);
1068
+ break;
1069
+ case 8:
1070
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0);
1071
+ break;
1072
+ case 7:
1073
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0);
1074
+ break;
1075
+ case 6: ZSTD_FALLTHROUGH;
1076
+ default:
1077
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1);
1078
+ break;
1079
+ }
1080
+ }
602
1081
  }
1082
+ assert(bitC.ptr <= bitC.endPtr);
603
1083
 
604
- return BIT_closeCStream(&bitC);
1084
+ return HUF_closeCStream(&bitC);
605
1085
  }
606
1086
 
607
1087
  #if DYNAMIC_BMI2
608
1088
 
609
- static TARGET_ATTRIBUTE("bmi2") size_t
1089
+ static BMI2_TARGET_ATTRIBUTE size_t
610
1090
  HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize,
611
1091
  const void* src, size_t srcSize,
612
1092
  const HUF_CElt* CTable)
@@ -625,9 +1105,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize,
625
1105
  static size_t
626
1106
  HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
627
1107
  const void* src, size_t srcSize,
628
- const HUF_CElt* CTable, const int bmi2)
1108
+ const HUF_CElt* CTable, const int flags)
629
1109
  {
630
- if (bmi2) {
1110
+ if (flags & HUF_flags_bmi2) {
631
1111
  return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable);
632
1112
  }
633
1113
  return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable);
@@ -638,24 +1118,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
638
1118
  static size_t
639
1119
  HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
640
1120
  const void* src, size_t srcSize,
641
- const HUF_CElt* CTable, const int bmi2)
1121
+ const HUF_CElt* CTable, const int flags)
642
1122
  {
643
- (void)bmi2;
1123
+ (void)flags;
644
1124
  return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
645
1125
  }
646
1126
 
647
1127
  #endif
648
1128
 
649
- size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
1129
+ size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
650
1130
  {
651
- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
1131
+ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
652
1132
  }
653
1133
 
654
-
655
1134
  static size_t
656
1135
  HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
657
1136
  const void* src, size_t srcSize,
658
- const HUF_CElt* CTable, int bmi2)
1137
+ const HUF_CElt* CTable, int flags)
659
1138
  {
660
1139
  size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */
661
1140
  const BYTE* ip = (const BYTE*) src;
@@ -669,27 +1148,24 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
669
1148
  op += 6; /* jumpTable */
670
1149
 
671
1150
  assert(op <= oend);
672
- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
673
- if (cSize==0) return 0;
674
- assert(cSize <= 65535);
1151
+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
1152
+ if (cSize == 0 || cSize > 65535) return 0;
675
1153
  MEM_writeLE16(ostart, (U16)cSize);
676
1154
  op += cSize;
677
1155
  }
678
1156
 
679
1157
  ip += segmentSize;
680
1158
  assert(op <= oend);
681
- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
682
- if (cSize==0) return 0;
683
- assert(cSize <= 65535);
1159
+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
1160
+ if (cSize == 0 || cSize > 65535) return 0;
684
1161
  MEM_writeLE16(ostart+2, (U16)cSize);
685
1162
  op += cSize;
686
1163
  }
687
1164
 
688
1165
  ip += segmentSize;
689
1166
  assert(op <= oend);
690
- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
691
- if (cSize==0) return 0;
692
- assert(cSize <= 65535);
1167
+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
1168
+ if (cSize == 0 || cSize > 65535) return 0;
693
1169
  MEM_writeLE16(ostart+4, (U16)cSize);
694
1170
  op += cSize;
695
1171
  }
@@ -697,17 +1173,17 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
697
1173
  ip += segmentSize;
698
1174
  assert(op <= oend);
699
1175
  assert(ip <= iend);
700
- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) );
701
- if (cSize==0) return 0;
1176
+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) );
1177
+ if (cSize == 0 || cSize > 65535) return 0;
702
1178
  op += cSize;
703
1179
  }
704
1180
 
705
1181
  return (size_t)(op-ostart);
706
1182
  }
707
1183
 
708
- size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
1184
+ size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
709
1185
  {
710
- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
1186
+ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
711
1187
  }
712
1188
 
713
1189
  typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
@@ -715,11 +1191,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
715
1191
  static size_t HUF_compressCTable_internal(
716
1192
  BYTE* const ostart, BYTE* op, BYTE* const oend,
717
1193
  const void* src, size_t srcSize,
718
- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2)
1194
+ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags)
719
1195
  {
720
1196
  size_t const cSize = (nbStreams==HUF_singleStream) ?
721
- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) :
722
- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2);
1197
+ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) :
1198
+ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags);
723
1199
  if (HUF_isError(cSize)) { return cSize; }
724
1200
  if (cSize==0) { return 0; } /* uncompressible */
725
1201
  op += cSize;
@@ -731,31 +1207,111 @@ static size_t HUF_compressCTable_internal(
731
1207
 
732
1208
  typedef struct {
733
1209
  unsigned count[HUF_SYMBOLVALUE_MAX + 1];
734
- HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1];
735
- HUF_buildCTable_wksp_tables buildCTable_wksp;
1210
+ HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)];
1211
+ union {
1212
+ HUF_buildCTable_wksp_tables buildCTable_wksp;
1213
+ HUF_WriteCTableWksp writeCTable_wksp;
1214
+ U32 hist_wksp[HIST_WKSP_SIZE_U32];
1215
+ } wksps;
736
1216
  } HUF_compress_tables_t;
737
1217
 
1218
+ #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096
1219
+ #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */
1220
+
1221
+ unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue)
1222
+ {
1223
+ unsigned cardinality = 0;
1224
+ unsigned i;
1225
+
1226
+ for (i = 0; i < maxSymbolValue + 1; i++) {
1227
+ if (count[i] != 0) cardinality += 1;
1228
+ }
1229
+
1230
+ return cardinality;
1231
+ }
1232
+
1233
+ unsigned HUF_minTableLog(unsigned symbolCardinality)
1234
+ {
1235
+ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1;
1236
+ return minBitsSymbols;
1237
+ }
1238
+
1239
+ unsigned HUF_optimalTableLog(
1240
+ unsigned maxTableLog,
1241
+ size_t srcSize,
1242
+ unsigned maxSymbolValue,
1243
+ void* workSpace, size_t wkspSize,
1244
+ HUF_CElt* table,
1245
+ const unsigned* count,
1246
+ int flags)
1247
+ {
1248
+ assert(srcSize > 1); /* Not supported, RLE should be used instead */
1249
+ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables));
1250
+
1251
+ if (!(flags & HUF_flags_optimalDepth)) {
1252
+ /* cheap evaluation, based on FSE */
1253
+ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
1254
+ }
1255
+
1256
+ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp);
1257
+ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp);
1258
+ size_t maxBits, hSize, newSize;
1259
+ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue);
1260
+ const unsigned minTableLog = HUF_minTableLog(symbolCardinality);
1261
+ size_t optSize = ((size_t) ~0) - 1;
1262
+ unsigned optLog = maxTableLog, optLogGuess;
1263
+
1264
+ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize);
1265
+
1266
+ /* Search until size increases */
1267
+ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) {
1268
+ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess);
1269
+ maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize);
1270
+ if (ERR_isError(maxBits)) continue;
1271
+
1272
+ if (maxBits < optLogGuess && optLogGuess > minTableLog) break;
1273
+
1274
+ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize);
1275
+
1276
+ if (ERR_isError(hSize)) continue;
1277
+
1278
+ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize;
1279
+
1280
+ if (newSize > optSize + 1) {
1281
+ break;
1282
+ }
1283
+
1284
+ if (newSize < optSize) {
1285
+ optSize = newSize;
1286
+ optLog = optLogGuess;
1287
+ }
1288
+ }
1289
+ assert(optLog <= HUF_TABLELOG_MAX);
1290
+ return optLog;
1291
+ }
1292
+ }
1293
+
738
1294
  /* HUF_compress_internal() :
739
- * `workSpace` must a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
1295
+ * `workSpace_align4` must be aligned on 4-bytes boundaries,
1296
+ * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */
740
1297
  static size_t
741
1298
  HUF_compress_internal (void* dst, size_t dstSize,
742
1299
  const void* src, size_t srcSize,
743
1300
  unsigned maxSymbolValue, unsigned huffLog,
744
1301
  HUF_nbStreams_e nbStreams,
745
1302
  void* workSpace, size_t wkspSize,
746
- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
747
- const int bmi2)
1303
+ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags)
748
1304
  {
749
- HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace;
1305
+ HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t));
750
1306
  BYTE* const ostart = (BYTE*)dst;
751
1307
  BYTE* const oend = ostart + dstSize;
752
1308
  BYTE* op = ostart;
753
1309
 
754
- HUF_STATIC_ASSERT(sizeof(*table) <= HUF_WORKSPACE_SIZE);
1310
+ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize);
1311
+ HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE);
755
1312
 
756
1313
  /* checks & inits */
757
- if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
758
- if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall);
1314
+ if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall);
759
1315
  if (!srcSize) return 0; /* Uncompressed */
760
1316
  if (!dstSize) return 0; /* cannot fit anything within dst budget */
761
1317
  if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */
@@ -765,17 +1321,34 @@ HUF_compress_internal (void* dst, size_t dstSize,
765
1321
  if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT;
766
1322
 
767
1323
  /* Heuristic : If old table is valid, use it for small inputs */
768
- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) {
1324
+ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) {
769
1325
  return HUF_compressCTable_internal(ostart, op, oend,
770
1326
  src, srcSize,
771
- nbStreams, oldHufTable, bmi2);
1327
+ nbStreams, oldHufTable, flags);
1328
+ }
1329
+
1330
+ /* If uncompressible data is suspected, do a smaller sampling first */
1331
+ DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2);
1332
+ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
1333
+ size_t largestTotal = 0;
1334
+ DEBUGLOG(5, "input suspected incompressible : sampling to check");
1335
+ { unsigned maxSymbolValueBegin = maxSymbolValue;
1336
+ CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
1337
+ largestTotal += largestBegin;
1338
+ }
1339
+ { unsigned maxSymbolValueEnd = maxSymbolValue;
1340
+ CHECK_V_F(largestEnd, HIST_count_simple (table->count, &maxSymbolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
1341
+ largestTotal += largestEnd;
1342
+ }
1343
+ if (largestTotal <= ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >> 7)+4) return 0; /* heuristic : probably not compressible enough */
772
1344
  }
773
1345
 
774
1346
  /* Scan input and build symbol stats */
775
- { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace, wkspSize) );
1347
+ { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.hist_wksp)) );
776
1348
  if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */
777
1349
  if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */
778
1350
  }
1351
+ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1));
779
1352
 
780
1353
  /* Check validity of previous table */
781
1354
  if ( repeat
@@ -784,26 +1357,31 @@ HUF_compress_internal (void* dst, size_t dstSize,
784
1357
  *repeat = HUF_repeat_none;
785
1358
  }
786
1359
  /* Heuristic : use existing table for small inputs */
787
- if (preferRepeat && repeat && *repeat != HUF_repeat_none) {
1360
+ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) {
788
1361
  return HUF_compressCTable_internal(ostart, op, oend,
789
1362
  src, srcSize,
790
- nbStreams, oldHufTable, bmi2);
1363
+ nbStreams, oldHufTable, flags);
791
1364
  }
792
1365
 
793
1366
  /* Build Huffman Tree */
794
- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
1367
+ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags);
795
1368
  { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count,
796
1369
  maxSymbolValue, huffLog,
797
- &table->buildCTable_wksp, sizeof(table->buildCTable_wksp));
1370
+ &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp));
798
1371
  CHECK_F(maxBits);
799
1372
  huffLog = (U32)maxBits;
800
- /* Zero unused symbols in CTable, so we can check it for validity */
801
- ZSTD_memset(table->CTable + (maxSymbolValue + 1), 0,
802
- sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt)));
1373
+ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1));
1374
+ }
1375
+ /* Zero unused symbols in CTable, so we can check it for validity */
1376
+ {
1377
+ size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue);
1378
+ size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt);
1379
+ ZSTD_memset(table->CTable + ctableSize, 0, unusedSize);
803
1380
  }
804
1381
 
805
1382
  /* Write table description header */
806
- { CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, table->CTable, maxSymbolValue, huffLog) );
1383
+ { CHECK_V_F(hSize, HUF_writeCTable_wksp(op, dstSize, table->CTable, maxSymbolValue, huffLog,
1384
+ &table->wksps.writeCTable_wksp, sizeof(table->wksps.writeCTable_wksp)) );
807
1385
  /* Check if using previous huffman table is beneficial */
808
1386
  if (repeat && *repeat != HUF_repeat_none) {
809
1387
  size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, table->count, maxSymbolValue);
@@ -811,7 +1389,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
811
1389
  if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
812
1390
  return HUF_compressCTable_internal(ostart, op, oend,
813
1391
  src, srcSize,
814
- nbStreams, oldHufTable, bmi2);
1392
+ nbStreams, oldHufTable, flags);
815
1393
  } }
816
1394
 
817
1395
  /* Use the new huffman table */
@@ -823,91 +1401,35 @@ HUF_compress_internal (void* dst, size_t dstSize,
823
1401
  }
824
1402
  return HUF_compressCTable_internal(ostart, op, oend,
825
1403
  src, srcSize,
826
- nbStreams, table->CTable, bmi2);
827
- }
828
-
829
-
830
- size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
831
- const void* src, size_t srcSize,
832
- unsigned maxSymbolValue, unsigned huffLog,
833
- void* workSpace, size_t wkspSize)
834
- {
835
- return HUF_compress_internal(dst, dstSize, src, srcSize,
836
- maxSymbolValue, huffLog, HUF_singleStream,
837
- workSpace, wkspSize,
838
- NULL, NULL, 0, 0 /*bmi2*/);
1404
+ nbStreams, table->CTable, flags);
839
1405
  }
840
1406
 
841
1407
  size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
842
1408
  const void* src, size_t srcSize,
843
1409
  unsigned maxSymbolValue, unsigned huffLog,
844
1410
  void* workSpace, size_t wkspSize,
845
- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
1411
+ HUF_CElt* hufTable, HUF_repeat* repeat, int flags)
846
1412
  {
1413
+ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize);
847
1414
  return HUF_compress_internal(dst, dstSize, src, srcSize,
848
1415
  maxSymbolValue, huffLog, HUF_singleStream,
849
1416
  workSpace, wkspSize, hufTable,
850
- repeat, preferRepeat, bmi2);
851
- }
852
-
853
- /* HUF_compress4X_repeat():
854
- * compress input using 4 streams.
855
- * provide workspace to generate compression tables */
856
- size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
857
- const void* src, size_t srcSize,
858
- unsigned maxSymbolValue, unsigned huffLog,
859
- void* workSpace, size_t wkspSize)
860
- {
861
- return HUF_compress_internal(dst, dstSize, src, srcSize,
862
- maxSymbolValue, huffLog, HUF_fourStreams,
863
- workSpace, wkspSize,
864
- NULL, NULL, 0, 0 /*bmi2*/);
1417
+ repeat, flags);
865
1418
  }
866
1419
 
867
1420
  /* HUF_compress4X_repeat():
868
1421
  * compress input using 4 streams.
1422
+ * consider skipping quickly
869
1423
  * re-use an existing huffman compression table */
870
1424
  size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
871
1425
  const void* src, size_t srcSize,
872
1426
  unsigned maxSymbolValue, unsigned huffLog,
873
1427
  void* workSpace, size_t wkspSize,
874
- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
1428
+ HUF_CElt* hufTable, HUF_repeat* repeat, int flags)
875
1429
  {
1430
+ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize);
876
1431
  return HUF_compress_internal(dst, dstSize, src, srcSize,
877
1432
  maxSymbolValue, huffLog, HUF_fourStreams,
878
1433
  workSpace, wkspSize,
879
- hufTable, repeat, preferRepeat, bmi2);
1434
+ hufTable, repeat, flags);
880
1435
  }
881
-
882
- #ifndef ZSTD_NO_UNUSED_FUNCTIONS
883
- /** HUF_buildCTable() :
884
- * @return : maxNbBits
885
- * Note : count is used before tree is written, so they can safely overlap
886
- */
887
- size_t HUF_buildCTable (HUF_CElt* tree, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits)
888
- {
889
- HUF_buildCTable_wksp_tables workspace;
890
- return HUF_buildCTable_wksp(tree, count, maxSymbolValue, maxNbBits, &workspace, sizeof(workspace));
891
- }
892
-
893
- size_t HUF_compress1X (void* dst, size_t dstSize,
894
- const void* src, size_t srcSize,
895
- unsigned maxSymbolValue, unsigned huffLog)
896
- {
897
- unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
898
- return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
899
- }
900
-
901
- size_t HUF_compress2 (void* dst, size_t dstSize,
902
- const void* src, size_t srcSize,
903
- unsigned maxSymbolValue, unsigned huffLog)
904
- {
905
- unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
906
- return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
907
- }
908
-
909
- size_t HUF_compress (void* dst, size_t maxDstSize, const void* src, size_t srcSize)
910
- {
911
- return HUF_compress2(dst, maxDstSize, src, srcSize, 255, HUF_TABLELOG_DEFAULT);
912
- }
913
- #endif