zstd-ruby 1.4.5.0 → 1.5.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/README.md +78 -5
  4. data/Rakefile +8 -2
  5. data/ext/zstdruby/common.h +15 -0
  6. data/ext/zstdruby/extconf.rb +3 -2
  7. data/ext/zstdruby/libzstd/common/allocations.h +55 -0
  8. data/ext/zstdruby/libzstd/common/bits.h +200 -0
  9. data/ext/zstdruby/libzstd/common/bitstream.h +45 -62
  10. data/ext/zstdruby/libzstd/common/compiler.h +205 -22
  11. data/ext/zstdruby/libzstd/common/cpu.h +1 -3
  12. data/ext/zstdruby/libzstd/common/debug.c +1 -1
  13. data/ext/zstdruby/libzstd/common/debug.h +12 -19
  14. data/ext/zstdruby/libzstd/common/entropy_common.c +172 -48
  15. data/ext/zstdruby/libzstd/common/error_private.c +10 -2
  16. data/ext/zstdruby/libzstd/common/error_private.h +82 -3
  17. data/ext/zstdruby/libzstd/common/fse.h +37 -86
  18. data/ext/zstdruby/libzstd/common/fse_decompress.c +117 -92
  19. data/ext/zstdruby/libzstd/common/huf.h +99 -166
  20. data/ext/zstdruby/libzstd/common/mem.h +124 -142
  21. data/ext/zstdruby/libzstd/common/pool.c +54 -27
  22. data/ext/zstdruby/libzstd/common/pool.h +10 -4
  23. data/ext/zstdruby/libzstd/common/portability_macros.h +156 -0
  24. data/ext/zstdruby/libzstd/common/threading.c +74 -19
  25. data/ext/zstdruby/libzstd/common/threading.h +5 -10
  26. data/ext/zstdruby/libzstd/common/xxhash.c +7 -847
  27. data/ext/zstdruby/libzstd/common/xxhash.h +5568 -167
  28. data/ext/zstdruby/libzstd/common/zstd_common.c +2 -37
  29. data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
  30. data/ext/zstdruby/libzstd/common/zstd_internal.h +132 -187
  31. data/ext/zstdruby/libzstd/common/zstd_trace.h +163 -0
  32. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  33. data/ext/zstdruby/libzstd/compress/fse_compress.c +83 -157
  34. data/ext/zstdruby/libzstd/compress/hist.c +27 -29
  35. data/ext/zstdruby/libzstd/compress/hist.h +2 -2
  36. data/ext/zstdruby/libzstd/compress/huf_compress.c +916 -279
  37. data/ext/zstdruby/libzstd/compress/zstd_compress.c +3773 -1019
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +610 -203
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +119 -42
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +16 -6
  41. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +42 -19
  42. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +1 -1
  43. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +49 -317
  44. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +1 -1
  45. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +320 -103
  46. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +388 -151
  47. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +3 -2
  48. data/ext/zstdruby/libzstd/compress/zstd_fast.c +729 -265
  49. data/ext/zstdruby/libzstd/compress/zstd_fast.h +3 -2
  50. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1270 -251
  51. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +61 -1
  52. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +324 -219
  53. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +9 -2
  54. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +106 -0
  55. data/ext/zstdruby/libzstd/compress/zstd_opt.c +481 -209
  56. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  57. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +181 -457
  58. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +34 -113
  59. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +1199 -565
  60. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +576 -0
  61. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +12 -12
  62. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +2 -2
  63. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +627 -157
  64. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +1086 -326
  65. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +19 -5
  66. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +62 -13
  67. data/ext/zstdruby/libzstd/dictBuilder/cover.c +73 -52
  68. data/ext/zstdruby/libzstd/dictBuilder/cover.h +7 -6
  69. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
  70. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +44 -35
  71. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +103 -111
  72. data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +203 -34
  73. data/ext/zstdruby/libzstd/zstd.h +1217 -287
  74. data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +28 -8
  75. data/ext/zstdruby/main.c +20 -0
  76. data/ext/zstdruby/skippable_frame.c +63 -0
  77. data/ext/zstdruby/streaming_compress.c +177 -0
  78. data/ext/zstdruby/streaming_compress.h +5 -0
  79. data/ext/zstdruby/streaming_decompress.c +123 -0
  80. data/ext/zstdruby/zstdruby.c +114 -32
  81. data/lib/zstd-ruby/version.rb +1 -1
  82. data/lib/zstd-ruby.rb +0 -1
  83. data/zstd-ruby.gemspec +1 -1
  84. metadata +19 -36
  85. data/.travis.yml +0 -14
  86. data/ext/zstdruby/libzstd/.gitignore +0 -3
  87. data/ext/zstdruby/libzstd/BUCK +0 -234
  88. data/ext/zstdruby/libzstd/Makefile +0 -354
  89. data/ext/zstdruby/libzstd/README.md +0 -179
  90. data/ext/zstdruby/libzstd/deprecated/zbuff.h +0 -214
  91. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +0 -26
  92. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +0 -147
  93. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +0 -75
  94. data/ext/zstdruby/libzstd/dll/example/Makefile +0 -48
  95. data/ext/zstdruby/libzstd/dll/example/README.md +0 -69
  96. data/ext/zstdruby/libzstd/dll/example/build_package.bat +0 -20
  97. data/ext/zstdruby/libzstd/dll/example/fullbench-dll.sln +0 -25
  98. data/ext/zstdruby/libzstd/dll/example/fullbench-dll.vcxproj +0 -181
  99. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +0 -415
  100. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +0 -2158
  101. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +0 -94
  102. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +0 -3518
  103. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +0 -93
  104. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +0 -3160
  105. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +0 -93
  106. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +0 -3647
  107. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +0 -142
  108. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +0 -4050
  109. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +0 -162
  110. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +0 -4154
  111. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +0 -172
  112. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +0 -4541
  113. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +0 -187
  114. data/ext/zstdruby/libzstd/libzstd.pc.in +0 -15
  115. data/ext/zstdruby/zstdruby.h +0 -6
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,18 +14,18 @@
14
14
  /*-*******************************************************
15
15
  * Dependencies
16
16
  *********************************************************/
17
- #include <string.h> /* memcpy, memmove, memset */
17
+ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
18
18
  #include "../common/compiler.h" /* prefetch */
19
19
  #include "../common/cpu.h" /* bmi2 */
20
20
  #include "../common/mem.h" /* low level memory routines */
21
21
  #define FSE_STATIC_LINKING_ONLY
22
22
  #include "../common/fse.h"
23
- #define HUF_STATIC_LINKING_ONLY
24
23
  #include "../common/huf.h"
25
24
  #include "../common/zstd_internal.h"
26
25
  #include "zstd_decompress_internal.h" /* ZSTD_DCtx */
27
26
  #include "zstd_ddict.h" /* ZSTD_DDictDictContent */
28
27
  #include "zstd_decompress_block.h"
28
+ #include "../common/bits.h" /* ZSTD_highbit32 */
29
29
 
30
30
  /*_*******************************************************
31
31
  * Macros
@@ -44,7 +44,7 @@
44
44
  /*_*******************************************************
45
45
  * Memory operations
46
46
  **********************************************************/
47
- static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
47
+ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }
48
48
 
49
49
 
50
50
  /*-*************************************************************
@@ -69,15 +69,56 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
69
69
  }
70
70
  }
71
71
 
72
+ /* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */
73
+ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
74
+ const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
75
+ {
76
+ if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH)
77
+ {
78
+ /* room for litbuffer to fit without read faulting */
79
+ dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH;
80
+ dctx->litBufferEnd = dctx->litBuffer + litSize;
81
+ dctx->litBufferLocation = ZSTD_in_dst;
82
+ }
83
+ else if (litSize > ZSTD_LITBUFFEREXTRASIZE)
84
+ {
85
+ /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
86
+ if (splitImmediately) {
87
+ /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
88
+ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
89
+ dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
90
+ }
91
+ else {
92
+ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
93
+ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
94
+ dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
95
+ }
96
+ dctx->litBufferLocation = ZSTD_split;
97
+ }
98
+ else
99
+ {
100
+ /* fits entirely within litExtraBuffer, so no split is necessary */
101
+ dctx->litBuffer = dctx->litExtraBuffer;
102
+ dctx->litBufferEnd = dctx->litBuffer + litSize;
103
+ dctx->litBufferLocation = ZSTD_not_in_dst;
104
+ }
105
+ }
72
106
 
73
107
  /* Hidden declaration for fullbench */
74
108
  size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
75
- const void* src, size_t srcSize);
109
+ const void* src, size_t srcSize,
110
+ void* dst, size_t dstCapacity, const streaming_operation streaming);
76
111
  /*! ZSTD_decodeLiteralsBlock() :
112
+ * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
113
+ * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current
114
+ * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being
115
+ * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write.
116
+ *
77
117
  * @return : nb of bytes read from src (< srcSize )
78
118
  * note : symbol not declared but exposed for fullbench */
79
119
  size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
80
- const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */
120
+ const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */
121
+ void* dst, size_t dstCapacity, const streaming_operation streaming)
81
122
  {
82
123
  DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
83
124
  RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");
@@ -90,15 +131,19 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
90
131
  case set_repeat:
91
132
  DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
92
133
  RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
93
- /* fall-through */
134
+ ZSTD_FALLTHROUGH;
94
135
 
95
136
  case set_compressed:
96
- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
137
+ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3");
97
138
  { size_t lhSize, litSize, litCSize;
98
139
  U32 singleStream=0;
99
140
  U32 const lhlCode = (istart[0] >> 2) & 3;
100
141
  U32 const lhc = MEM_readLE32(istart);
101
142
  size_t hufSuccess;
143
+ size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
144
+ int const flags = 0
145
+ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)
146
+ | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
102
147
  switch(lhlCode)
103
148
  {
104
149
  case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -121,8 +166,15 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
121
166
  litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
122
167
  break;
123
168
  }
169
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
124
170
  RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
171
+ if (!singleStream)
172
+ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
173
+ "Not enough literals (%zu) for the 4-streams mode (min %u)",
174
+ litSize, MIN_LITERALS_FOR_4_STREAMS);
125
175
  RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
176
+ RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
177
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);
126
178
 
127
179
  /* prefetch huffman table if cold */
128
180
  if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
@@ -131,13 +183,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
131
183
 
132
184
  if (litEncType==set_repeat) {
133
185
  if (singleStream) {
134
- hufSuccess = HUF_decompress1X_usingDTable_bmi2(
186
+ hufSuccess = HUF_decompress1X_usingDTable(
135
187
  dctx->litBuffer, litSize, istart+lhSize, litCSize,
136
- dctx->HUFptr, dctx->bmi2);
188
+ dctx->HUFptr, flags);
137
189
  } else {
138
- hufSuccess = HUF_decompress4X_usingDTable_bmi2(
190
+ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);
191
+ hufSuccess = HUF_decompress4X_usingDTable(
139
192
  dctx->litBuffer, litSize, istart+lhSize, litCSize,
140
- dctx->HUFptr, dctx->bmi2);
193
+ dctx->HUFptr, flags);
141
194
  }
142
195
  } else {
143
196
  if (singleStream) {
@@ -145,20 +198,27 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
145
198
  hufSuccess = HUF_decompress1X_DCtx_wksp(
146
199
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
147
200
  istart+lhSize, litCSize, dctx->workspace,
148
- sizeof(dctx->workspace));
201
+ sizeof(dctx->workspace), flags);
149
202
  #else
150
- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
203
+ hufSuccess = HUF_decompress1X1_DCtx_wksp(
151
204
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
152
205
  istart+lhSize, litCSize, dctx->workspace,
153
- sizeof(dctx->workspace), dctx->bmi2);
206
+ sizeof(dctx->workspace), flags);
154
207
  #endif
155
208
  } else {
156
- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
209
+ hufSuccess = HUF_decompress4X_hufOnly_wksp(
157
210
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
158
211
  istart+lhSize, litCSize, dctx->workspace,
159
- sizeof(dctx->workspace), dctx->bmi2);
212
+ sizeof(dctx->workspace), flags);
160
213
  }
161
214
  }
215
+ if (dctx->litBufferLocation == ZSTD_split)
216
+ {
217
+ ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
218
+ ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);
219
+ dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
220
+ dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
221
+ }
162
222
 
163
223
  RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");
164
224
 
@@ -166,13 +226,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
166
226
  dctx->litSize = litSize;
167
227
  dctx->litEntropy = 1;
168
228
  if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
169
- memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
170
229
  return litCSize + lhSize;
171
230
  }
172
231
 
173
232
  case set_basic:
174
233
  { size_t litSize, lhSize;
175
234
  U32 const lhlCode = ((istart[0]) >> 2) & 3;
235
+ size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
176
236
  switch(lhlCode)
177
237
  {
178
238
  case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -185,27 +245,41 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
185
245
  break;
186
246
  case 3:
187
247
  lhSize = 3;
248
+ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3");
188
249
  litSize = MEM_readLE24(istart) >> 4;
189
250
  break;
190
251
  }
191
252
 
253
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
254
+ RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
255
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
192
256
  if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */
193
257
  RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
194
- memcpy(dctx->litBuffer, istart+lhSize, litSize);
258
+ if (dctx->litBufferLocation == ZSTD_split)
259
+ {
260
+ ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE);
261
+ ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
262
+ }
263
+ else
264
+ {
265
+ ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize);
266
+ }
195
267
  dctx->litPtr = dctx->litBuffer;
196
268
  dctx->litSize = litSize;
197
- memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
198
269
  return lhSize+litSize;
199
270
  }
200
271
  /* direct reference into compressed stream */
201
272
  dctx->litPtr = istart+lhSize;
202
273
  dctx->litSize = litSize;
274
+ dctx->litBufferEnd = dctx->litPtr + litSize;
275
+ dctx->litBufferLocation = ZSTD_not_in_dst;
203
276
  return lhSize+litSize;
204
277
  }
205
278
 
206
279
  case set_rle:
207
280
  { U32 const lhlCode = ((istart[0]) >> 2) & 3;
208
281
  size_t litSize, lhSize;
282
+ size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
209
283
  switch(lhlCode)
210
284
  {
211
285
  case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -214,16 +288,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
214
288
  break;
215
289
  case 1:
216
290
  lhSize = 2;
291
+ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");
217
292
  litSize = MEM_readLE16(istart) >> 4;
218
293
  break;
219
294
  case 3:
220
295
  lhSize = 3;
296
+ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");
221
297
  litSize = MEM_readLE24(istart) >> 4;
222
- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
223
298
  break;
224
299
  }
300
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
225
301
  RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
226
- memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
302
+ RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
303
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
304
+ if (dctx->litBufferLocation == ZSTD_split)
305
+ {
306
+ ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE);
307
+ ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE);
308
+ }
309
+ else
310
+ {
311
+ ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize);
312
+ }
227
313
  dctx->litPtr = dctx->litBuffer;
228
314
  dctx->litSize = litSize;
229
315
  return lhSize+1;
@@ -236,7 +322,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
236
322
 
237
323
  /* Default FSE distribution tables.
238
324
  * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
239
- * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions
325
+ * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
240
326
  * They were generated programmatically with following method :
241
327
  * - start from default distributions, present in /lib/common/zstd_internal.h
242
328
  * - generate tables normally, using ZSTD_buildFSETable()
@@ -343,7 +429,7 @@ static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
343
429
  }; /* ML_defaultDTable */
344
430
 
345
431
 
346
- static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits)
432
+ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits)
347
433
  {
348
434
  void* ptr = dt;
349
435
  ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
@@ -355,7 +441,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
355
441
  cell->nbBits = 0;
356
442
  cell->nextState = 0;
357
443
  assert(nbAddBits < 255);
358
- cell->nbAdditionalBits = (BYTE)nbAddBits;
444
+ cell->nbAdditionalBits = nbAddBits;
359
445
  cell->baseValue = baseValue;
360
446
  }
361
447
 
@@ -364,23 +450,26 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
364
450
  * generate FSE decoding table for one symbol (ll, ml or off)
365
451
  * cannot fail if input is valid =>
366
452
  * all inputs are presumed validated at this stage */
367
- void
368
- ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
453
+ FORCE_INLINE_TEMPLATE
454
+ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
369
455
  const short* normalizedCounter, unsigned maxSymbolValue,
370
- const U32* baseValue, const U32* nbAdditionalBits,
371
- unsigned tableLog)
456
+ const U32* baseValue, const U8* nbAdditionalBits,
457
+ unsigned tableLog, void* wksp, size_t wkspSize)
372
458
  {
373
459
  ZSTD_seqSymbol* const tableDecode = dt+1;
374
- U16 symbolNext[MaxSeq+1];
375
-
376
460
  U32 const maxSV1 = maxSymbolValue + 1;
377
461
  U32 const tableSize = 1 << tableLog;
378
- U32 highThreshold = tableSize-1;
462
+
463
+ U16* symbolNext = (U16*)wksp;
464
+ BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
465
+ U32 highThreshold = tableSize - 1;
466
+
379
467
 
380
468
  /* Sanity Checks */
381
469
  assert(maxSymbolValue <= MaxSeq);
382
470
  assert(tableLog <= MaxFSELog);
383
-
471
+ assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
472
+ (void)wkspSize;
384
473
  /* Init, lay down lowprob symbols */
385
474
  { ZSTD_seqSymbol_header DTableH;
386
475
  DTableH.tableLog = tableLog;
@@ -396,34 +485,128 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
396
485
  assert(normalizedCounter[s]>=0);
397
486
  symbolNext[s] = (U16)normalizedCounter[s];
398
487
  } } }
399
- memcpy(dt, &DTableH, sizeof(DTableH));
488
+ ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
400
489
  }
401
490
 
402
491
  /* Spread symbols */
403
- { U32 const tableMask = tableSize-1;
492
+ assert(tableSize <= 512);
493
+ /* Specialized symbol spreading for the case when there are
494
+ * no low probability (-1 count) symbols. When compressing
495
+ * small blocks we avoid low probability symbols to hit this
496
+ * case, since header decoding speed matters more.
497
+ */
498
+ if (highThreshold == tableSize - 1) {
499
+ size_t const tableMask = tableSize-1;
500
+ size_t const step = FSE_TABLESTEP(tableSize);
501
+ /* First lay down the symbols in order.
502
+ * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
503
+ * misses since small blocks generally have small table logs, so nearly
504
+ * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
505
+ * our buffer to handle the over-write.
506
+ */
507
+ {
508
+ U64 const add = 0x0101010101010101ull;
509
+ size_t pos = 0;
510
+ U64 sv = 0;
511
+ U32 s;
512
+ for (s=0; s<maxSV1; ++s, sv += add) {
513
+ int i;
514
+ int const n = normalizedCounter[s];
515
+ MEM_write64(spread + pos, sv);
516
+ for (i = 8; i < n; i += 8) {
517
+ MEM_write64(spread + pos + i, sv);
518
+ }
519
+ assert(n>=0);
520
+ pos += (size_t)n;
521
+ }
522
+ }
523
+ /* Now we spread those positions across the table.
524
+ * The benefit of doing it in two stages is that we avoid the
525
+ * variable size inner loop, which caused lots of branch misses.
526
+ * Now we can run through all the positions without any branch misses.
527
+ * We unroll the loop twice, since that is what empirically worked best.
528
+ */
529
+ {
530
+ size_t position = 0;
531
+ size_t s;
532
+ size_t const unroll = 2;
533
+ assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
534
+ for (s = 0; s < (size_t)tableSize; s += unroll) {
535
+ size_t u;
536
+ for (u = 0; u < unroll; ++u) {
537
+ size_t const uPosition = (position + (u * step)) & tableMask;
538
+ tableDecode[uPosition].baseValue = spread[s + u];
539
+ }
540
+ position = (position + (unroll * step)) & tableMask;
541
+ }
542
+ assert(position == 0);
543
+ }
544
+ } else {
545
+ U32 const tableMask = tableSize-1;
404
546
  U32 const step = FSE_TABLESTEP(tableSize);
405
547
  U32 s, position = 0;
406
548
  for (s=0; s<maxSV1; s++) {
407
549
  int i;
408
- for (i=0; i<normalizedCounter[s]; i++) {
550
+ int const n = normalizedCounter[s];
551
+ for (i=0; i<n; i++) {
409
552
  tableDecode[position].baseValue = s;
410
553
  position = (position + step) & tableMask;
411
- while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */
554
+ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */
412
555
  } }
413
556
  assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
414
557
  }
415
558
 
416
559
  /* Build Decoding table */
417
- { U32 u;
560
+ {
561
+ U32 u;
418
562
  for (u=0; u<tableSize; u++) {
419
563
  U32 const symbol = tableDecode[u].baseValue;
420
564
  U32 const nextState = symbolNext[symbol]++;
421
- tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
565
+ tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
422
566
  tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
423
567
  assert(nbAdditionalBits[symbol] < 255);
424
- tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
568
+ tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
425
569
  tableDecode[u].baseValue = baseValue[symbol];
426
- } }
570
+ }
571
+ }
572
+ }
573
+
574
+ /* Avoids the FORCE_INLINE of the _body() function. */
575
+ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
576
+ const short* normalizedCounter, unsigned maxSymbolValue,
577
+ const U32* baseValue, const U8* nbAdditionalBits,
578
+ unsigned tableLog, void* wksp, size_t wkspSize)
579
+ {
580
+ ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
581
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
582
+ }
583
+
584
+ #if DYNAMIC_BMI2
585
+ BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
586
+ const short* normalizedCounter, unsigned maxSymbolValue,
587
+ const U32* baseValue, const U8* nbAdditionalBits,
588
+ unsigned tableLog, void* wksp, size_t wkspSize)
589
+ {
590
+ ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
591
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
592
+ }
593
+ #endif
594
+
595
+ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
596
+ const short* normalizedCounter, unsigned maxSymbolValue,
597
+ const U32* baseValue, const U8* nbAdditionalBits,
598
+ unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
599
+ {
600
+ #if DYNAMIC_BMI2
601
+ if (bmi2) {
602
+ ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue,
603
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
604
+ return;
605
+ }
606
+ #endif
607
+ (void)bmi2;
608
+ ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue,
609
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
427
610
  }
428
611
 
429
612
 
@@ -433,9 +616,10 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
433
616
  static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
434
617
  symbolEncodingType_e type, unsigned max, U32 maxLog,
435
618
  const void* src, size_t srcSize,
436
- const U32* baseValue, const U32* nbAdditionalBits,
619
+ const U32* baseValue, const U8* nbAdditionalBits,
437
620
  const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
438
- int ddictIsCold, int nbSeq)
621
+ int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
622
+ int bmi2)
439
623
  {
440
624
  switch(type)
441
625
  {
@@ -444,7 +628,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
444
628
  RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
445
629
  { U32 const symbol = *(const BYTE*)src;
446
630
  U32 const baseline = baseValue[symbol];
447
- U32 const nbBits = nbAdditionalBits[symbol];
631
+ U8 const nbBits = nbAdditionalBits[symbol];
448
632
  ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
449
633
  }
450
634
  *DTablePtr = DTableSpace;
@@ -467,7 +651,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
467
651
  size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
468
652
  RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
469
653
  RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
470
- ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog);
654
+ ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2);
471
655
  *DTablePtr = DTableSpace;
472
656
  return headerSize;
473
657
  }
@@ -480,7 +664,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
480
664
  size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
481
665
  const void* src, size_t srcSize)
482
666
  {
483
- const BYTE* const istart = (const BYTE* const)src;
667
+ const BYTE* const istart = (const BYTE*)src;
484
668
  const BYTE* const iend = istart + srcSize;
485
669
  const BYTE* ip = istart;
486
670
  int nbSeq;
@@ -499,7 +683,8 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
499
683
  if (nbSeq > 0x7F) {
500
684
  if (nbSeq == 0xFF) {
501
685
  RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
502
- nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
686
+ nbSeq = MEM_readLE16(ip) + LONGNBSEQ;
687
+ ip+=2;
503
688
  } else {
504
689
  RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
505
690
  nbSeq = ((nbSeq-0x80)<<8) + *ip++;
@@ -520,7 +705,9 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
520
705
  ip, iend-ip,
521
706
  LL_base, LL_bits,
522
707
  LL_defaultDTable, dctx->fseEntropy,
523
- dctx->ddictIsCold, nbSeq);
708
+ dctx->ddictIsCold, nbSeq,
709
+ dctx->workspace, sizeof(dctx->workspace),
710
+ ZSTD_DCtx_get_bmi2(dctx));
524
711
  RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
525
712
  ip += llhSize;
526
713
  }
@@ -530,7 +717,9 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
530
717
  ip, iend-ip,
531
718
  OF_base, OF_bits,
532
719
  OF_defaultDTable, dctx->fseEntropy,
533
- dctx->ddictIsCold, nbSeq);
720
+ dctx->ddictIsCold, nbSeq,
721
+ dctx->workspace, sizeof(dctx->workspace),
722
+ ZSTD_DCtx_get_bmi2(dctx));
534
723
  RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
535
724
  ip += ofhSize;
536
725
  }
@@ -540,7 +729,9 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
540
729
  ip, iend-ip,
541
730
  ML_base, ML_bits,
542
731
  ML_defaultDTable, dctx->fseEntropy,
543
- dctx->ddictIsCold, nbSeq);
732
+ dctx->ddictIsCold, nbSeq,
733
+ dctx->workspace, sizeof(dctx->workspace),
734
+ ZSTD_DCtx_get_bmi2(dctx));
544
735
  RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
545
736
  ip += mlhSize;
546
737
  }
@@ -554,7 +745,6 @@ typedef struct {
554
745
  size_t litLength;
555
746
  size_t matchLength;
556
747
  size_t offset;
557
- const BYTE* match;
558
748
  } seq_t;
559
749
 
560
750
  typedef struct {
@@ -568,9 +758,6 @@ typedef struct {
568
758
  ZSTD_fseState stateOffb;
569
759
  ZSTD_fseState stateML;
570
760
  size_t prevOffset[ZSTD_REP_NUM];
571
- const BYTE* prefixStart;
572
- const BYTE* dictEnd;
573
- size_t pos;
574
761
  } seqState_t;
575
762
 
576
763
  /*! ZSTD_overlapCopy8() :
@@ -613,7 +800,7 @@ HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
613
800
  * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
614
801
  * The src buffer must be before the dst buffer.
615
802
  */
616
- static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
803
+ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
617
804
  ptrdiff_t const diff = op - ip;
618
805
  BYTE* const oend = op + length;
619
806
 
@@ -629,6 +816,7 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
629
816
  /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
630
817
  assert(length >= 8);
631
818
  ZSTD_overlapCopy8(&op, &ip, diff);
819
+ length -= 8;
632
820
  assert(op - ip >= 8);
633
821
  assert(op <= oend);
634
822
  }
@@ -643,8 +831,31 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
643
831
  assert(oend > oend_w);
644
832
  ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
645
833
  ip += oend_w - op;
646
- op = oend_w;
834
+ op += oend_w - op;
835
+ }
836
+ /* Handle the leftovers. */
837
+ while (op < oend) *op++ = *ip++;
838
+ }
839
+
840
+ /* ZSTD_safecopyDstBeforeSrc():
841
+ * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
842
+ * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
843
+ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) {
844
+ ptrdiff_t const diff = op - ip;
845
+ BYTE* const oend = op + length;
846
+
847
+ if (length < 8 || diff > -8) {
848
+ /* Handle short lengths, close overlaps, and dst not before src. */
849
+ while (op < oend) *op++ = *ip++;
850
+ return;
647
851
  }
852
+
853
+ if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) {
854
+ ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap);
855
+ ip += oend - WILDCOPY_OVERLENGTH - op;
856
+ op += oend - WILDCOPY_OVERLENGTH - op;
857
+ }
858
+
648
859
  /* Handle the leftovers. */
649
860
  while (op < oend) *op++ = *ip++;
650
861
  }
@@ -659,9 +870,9 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
659
870
  */
660
871
  FORCE_NOINLINE
661
872
  size_t ZSTD_execSequenceEnd(BYTE* op,
662
- BYTE* const oend, seq_t sequence,
663
- const BYTE** litPtr, const BYTE* const litLimit,
664
- const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
873
+ BYTE* const oend, seq_t sequence,
874
+ const BYTE** litPtr, const BYTE* const litLimit,
875
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
665
876
  {
666
877
  BYTE* const oLitEnd = op + sequence.litLength;
667
878
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
@@ -684,27 +895,76 @@ size_t ZSTD_execSequenceEnd(BYTE* op,
684
895
  if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
685
896
  /* offset beyond prefix */
686
897
  RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
687
- match = dictEnd - (prefixStart-match);
898
+ match = dictEnd - (prefixStart - match);
688
899
  if (match + sequence.matchLength <= dictEnd) {
689
- memmove(oLitEnd, match, sequence.matchLength);
900
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
690
901
  return sequenceLength;
691
902
  }
692
903
  /* span extDict & currentPrefixSegment */
693
904
  { size_t const length1 = dictEnd - match;
694
- memmove(oLitEnd, match, length1);
695
- op = oLitEnd + length1;
696
- sequence.matchLength -= length1;
697
- match = prefixStart;
698
- } }
905
+ ZSTD_memmove(oLitEnd, match, length1);
906
+ op = oLitEnd + length1;
907
+ sequence.matchLength -= length1;
908
+ match = prefixStart;
909
+ }
910
+ }
911
+ ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
912
+ return sequenceLength;
913
+ }
914
+
915
+ /* ZSTD_execSequenceEndSplitLitBuffer():
916
+ * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case.
917
+ */
918
+ FORCE_NOINLINE
919
+ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
920
+ BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
921
+ const BYTE** litPtr, const BYTE* const litLimit,
922
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
923
+ {
924
+ BYTE* const oLitEnd = op + sequence.litLength;
925
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
926
+ const BYTE* const iLitEnd = *litPtr + sequence.litLength;
927
+ const BYTE* match = oLitEnd - sequence.offset;
928
+
929
+
930
+ /* bounds checks : careful of address space overflow in 32-bit mode */
931
+ RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
932
+ RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
933
+ assert(op < op + sequenceLength);
934
+ assert(oLitEnd < op + sequenceLength);
935
+
936
+ /* copy literals */
937
+ RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer");
938
+ ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength);
939
+ op = oLitEnd;
940
+ *litPtr = iLitEnd;
941
+
942
+ /* copy Match */
943
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
944
+ /* offset beyond prefix */
945
+ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
946
+ match = dictEnd - (prefixStart - match);
947
+ if (match + sequence.matchLength <= dictEnd) {
948
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
949
+ return sequenceLength;
950
+ }
951
+ /* span extDict & currentPrefixSegment */
952
+ { size_t const length1 = dictEnd - match;
953
+ ZSTD_memmove(oLitEnd, match, length1);
954
+ op = oLitEnd + length1;
955
+ sequence.matchLength -= length1;
956
+ match = prefixStart;
957
+ }
958
+ }
699
959
  ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
700
960
  return sequenceLength;
701
961
  }
702
962
 
703
963
  HINT_INLINE
704
964
  size_t ZSTD_execSequence(BYTE* op,
705
- BYTE* const oend, seq_t sequence,
706
- const BYTE** litPtr, const BYTE* const litLimit,
707
- const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
965
+ BYTE* const oend, seq_t sequence,
966
+ const BYTE** litPtr, const BYTE* const litLimit,
967
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
708
968
  {
709
969
  BYTE* const oLitEnd = op + sequence.litLength;
710
970
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
@@ -715,6 +975,103 @@ size_t ZSTD_execSequence(BYTE* op,
715
975
 
716
976
  assert(op != NULL /* Precondition */);
717
977
  assert(oend_w < oend /* No underflow */);
978
+
979
+ #if defined(__aarch64__)
980
+ /* prefetch sequence starting from match that will be used for copy later */
981
+ PREFETCH_L1(match);
982
+ #endif
983
+ /* Handle edge cases in a slow path:
984
+ * - Read beyond end of literals
985
+ * - Match end is within WILDCOPY_OVERLIMIT of oend
986
+ * - 32-bit mode and the match length overflows
987
+ */
988
+ if (UNLIKELY(
989
+ iLitEnd > litLimit ||
990
+ oMatchEnd > oend_w ||
991
+ (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
992
+ return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
993
+
994
+ /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
995
+ assert(op <= oLitEnd /* No overflow */);
996
+ assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
997
+ assert(oMatchEnd <= oend /* No underflow */);
998
+ assert(iLitEnd <= litLimit /* Literal length is in bounds */);
999
+ assert(oLitEnd <= oend_w /* Can wildcopy literals */);
1000
+ assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
1001
+
1002
+ /* Copy Literals:
1003
+ * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
1004
+ * We likely don't need the full 32-byte wildcopy.
1005
+ */
1006
+ assert(WILDCOPY_OVERLENGTH >= 16);
1007
+ ZSTD_copy16(op, (*litPtr));
1008
+ if (UNLIKELY(sequence.litLength > 16)) {
1009
+ ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);
1010
+ }
1011
+ op = oLitEnd;
1012
+ *litPtr = iLitEnd; /* update for next sequence */
1013
+
1014
+ /* Copy Match */
1015
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
1016
+ /* offset beyond prefix -> go into extDict */
1017
+ RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
1018
+ match = dictEnd + (match - prefixStart);
1019
+ if (match + sequence.matchLength <= dictEnd) {
1020
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
1021
+ return sequenceLength;
1022
+ }
1023
+ /* span extDict & currentPrefixSegment */
1024
+ { size_t const length1 = dictEnd - match;
1025
+ ZSTD_memmove(oLitEnd, match, length1);
1026
+ op = oLitEnd + length1;
1027
+ sequence.matchLength -= length1;
1028
+ match = prefixStart;
1029
+ }
1030
+ }
1031
+ /* Match within prefix of 1 or more bytes */
1032
+ assert(op <= oMatchEnd);
1033
+ assert(oMatchEnd <= oend_w);
1034
+ assert(match >= prefixStart);
1035
+ assert(sequence.matchLength >= 1);
1036
+
1037
+ /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
1038
+ * without overlap checking.
1039
+ */
1040
+ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
1041
+ /* We bet on a full wildcopy for matches, since we expect matches to be
1042
+ * longer than literals (in general). In silesia, ~10% of matches are longer
1043
+ * than 16 bytes.
1044
+ */
1045
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
1046
+ return sequenceLength;
1047
+ }
1048
+ assert(sequence.offset < WILDCOPY_VECLEN);
1049
+
1050
+ /* Copy 8 bytes and spread the offset to be >= 8. */
1051
+ ZSTD_overlapCopy8(&op, &match, sequence.offset);
1052
+
1053
+ /* If the match length is > 8 bytes, then continue with the wildcopy. */
1054
+ if (sequence.matchLength > 8) {
1055
+ assert(op < oMatchEnd);
1056
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);
1057
+ }
1058
+ return sequenceLength;
1059
+ }
1060
+
1061
+ HINT_INLINE
1062
+ size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,
1063
+ BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
1064
+ const BYTE** litPtr, const BYTE* const litLimit,
1065
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
1066
+ {
1067
+ BYTE* const oLitEnd = op + sequence.litLength;
1068
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
1069
+ BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
1070
+ const BYTE* const iLitEnd = *litPtr + sequence.litLength;
1071
+ const BYTE* match = oLitEnd - sequence.offset;
1072
+
1073
+ assert(op != NULL /* Precondition */);
1074
+ assert(oend_w < oend /* No underflow */);
718
1075
  /* Handle edge cases in a slow path:
719
1076
  * - Read beyond end of literals
720
1077
  * - Match end is within WILDCOPY_OVERLIMIT of oend
@@ -724,7 +1081,7 @@ size_t ZSTD_execSequence(BYTE* op,
724
1081
  iLitEnd > litLimit ||
725
1082
  oMatchEnd > oend_w ||
726
1083
  (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
727
- return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
1084
+ return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
728
1085
 
729
1086
  /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
730
1087
  assert(op <= oLitEnd /* No overflow */);
@@ -752,12 +1109,12 @@ size_t ZSTD_execSequence(BYTE* op,
752
1109
  RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
753
1110
  match = dictEnd + (match - prefixStart);
754
1111
  if (match + sequence.matchLength <= dictEnd) {
755
- memmove(oLitEnd, match, sequence.matchLength);
1112
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
756
1113
  return sequenceLength;
757
1114
  }
758
1115
  /* span extDict & currentPrefixSegment */
759
1116
  { size_t const length1 = dictEnd - match;
760
- memmove(oLitEnd, match, length1);
1117
+ ZSTD_memmove(oLitEnd, match, length1);
761
1118
  op = oLitEnd + length1;
762
1119
  sequence.matchLength -= length1;
763
1120
  match = prefixStart;
@@ -792,6 +1149,7 @@ size_t ZSTD_execSequence(BYTE* op,
792
1149
  return sequenceLength;
793
1150
  }
794
1151
 
1152
+
795
1153
  static void
796
1154
  ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
797
1155
  {
@@ -805,24 +1163,14 @@ ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqS
805
1163
  }
806
1164
 
807
1165
  FORCE_INLINE_TEMPLATE void
808
- ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
1166
+ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits)
809
1167
  {
810
- ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state];
811
- U32 const nbBits = DInfo.nbBits;
812
1168
  size_t const lowBits = BIT_readBits(bitD, nbBits);
813
- DStatePtr->state = DInfo.nextState + lowBits;
814
- }
815
-
816
- FORCE_INLINE_TEMPLATE void
817
- ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo)
818
- {
819
- U32 const nbBits = DInfo.nbBits;
820
- size_t const lowBits = BIT_readBits(bitD, nbBits);
821
- DStatePtr->state = DInfo.nextState + lowBits;
1169
+ DStatePtr->state = nextState + lowBits;
822
1170
  }
823
1171
 
824
1172
  /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
825
- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
1173
+ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32
826
1174
  * bits before reloading. This value is the maximum number of bytes we read
827
1175
  * after reloading when we are decoding long offsets.
828
1176
  */
@@ -832,123 +1180,125 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD
832
1180
  : 0)
833
1181
 
834
1182
  typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
835
- typedef enum { ZSTD_p_noPrefetch=0, ZSTD_p_prefetch=1 } ZSTD_prefetch_e;
836
1183
 
837
1184
  FORCE_INLINE_TEMPLATE seq_t
838
- ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const ZSTD_prefetch_e prefetch)
1185
+ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
839
1186
  {
840
1187
  seq_t seq;
841
- ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state];
842
- ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state];
843
- ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state];
844
- U32 const llBase = llDInfo.baseValue;
845
- U32 const mlBase = mlDInfo.baseValue;
846
- U32 const ofBase = ofDInfo.baseValue;
847
- BYTE const llBits = llDInfo.nbAdditionalBits;
848
- BYTE const mlBits = mlDInfo.nbAdditionalBits;
849
- BYTE const ofBits = ofDInfo.nbAdditionalBits;
850
- BYTE const totalBits = llBits+mlBits+ofBits;
851
-
852
- /* sequence */
853
- { size_t offset;
854
- if (ofBits > 1) {
855
- ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
856
- ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
857
- assert(ofBits <= MaxOff);
858
- if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
859
- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
860
- offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
861
- BIT_reloadDStream(&seqState->DStream);
862
- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
863
- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
864
- } else {
865
- offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
866
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
867
- }
868
- seqState->prevOffset[2] = seqState->prevOffset[1];
869
- seqState->prevOffset[1] = seqState->prevOffset[0];
870
- seqState->prevOffset[0] = offset;
871
- } else {
872
- U32 const ll0 = (llBase == 0);
873
- if (LIKELY((ofBits == 0))) {
874
- if (LIKELY(!ll0))
875
- offset = seqState->prevOffset[0];
876
- else {
877
- offset = seqState->prevOffset[1];
878
- seqState->prevOffset[1] = seqState->prevOffset[0];
879
- seqState->prevOffset[0] = offset;
880
- }
881
- } else {
882
- offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
883
- { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
884
- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
885
- if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
886
- seqState->prevOffset[1] = seqState->prevOffset[0];
887
- seqState->prevOffset[0] = offset = temp;
888
- } } }
889
- seq.offset = offset;
890
- }
891
-
892
- seq.matchLength = mlBase;
893
- if (mlBits > 0)
894
- seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
895
-
896
- if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
897
- BIT_reloadDStream(&seqState->DStream);
898
- if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
899
- BIT_reloadDStream(&seqState->DStream);
900
- /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
901
- ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
902
-
903
- seq.litLength = llBase;
904
- if (llBits > 0)
905
- seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
906
-
907
- if (MEM_32bits())
908
- BIT_reloadDStream(&seqState->DStream);
909
-
910
- DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
911
- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
912
-
913
- if (prefetch == ZSTD_p_prefetch) {
914
- size_t const pos = seqState->pos + seq.litLength;
915
- const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
916
- seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
917
- * No consequence though : no memory access will occur, offset is only used for prefetching */
918
- seqState->pos = pos + seq.matchLength;
919
- }
920
-
921
- /* ANS state update
922
- * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo().
923
- * clang-9.2.0 does 7% worse with ZSTD_updateFseState().
924
- * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the
925
- * better option, so it is the default for other compilers. But, if you
926
- * measure that it is worse, please put up a pull request.
1188
+ /*
1189
+ * ZSTD_seqSymbol is a structure with a total of 64 bits wide. So it can be
1190
+ * loaded in one operation and extracted its fields by simply shifting or
1191
+ * bit-extracting on aarch64.
1192
+ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh
1193
+ * operations that cause performance drop. This can be avoided by using this
1194
+ * ZSTD_memcpy hack.
927
1195
  */
928
- {
929
- #if defined(__GNUC__) && !defined(__clang__)
930
- const int kUseUpdateFseState = 1;
1196
+ #if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
1197
+ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
1198
+ ZSTD_seqSymbol* const llDInfo = &llDInfoS;
1199
+ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
1200
+ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS;
1201
+ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
1202
+ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
1203
+ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
931
1204
  #else
932
- const int kUseUpdateFseState = 0;
1205
+ const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
1206
+ const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
1207
+ const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
933
1208
  #endif
934
- if (kUseUpdateFseState) {
935
- ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
936
- ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
937
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
938
- ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
939
- } else {
940
- ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */
941
- ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */
942
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
943
- ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */
1209
+ seq.matchLength = mlDInfo->baseValue;
1210
+ seq.litLength = llDInfo->baseValue;
1211
+ { U32 const ofBase = ofDInfo->baseValue;
1212
+ BYTE const llBits = llDInfo->nbAdditionalBits;
1213
+ BYTE const mlBits = mlDInfo->nbAdditionalBits;
1214
+ BYTE const ofBits = ofDInfo->nbAdditionalBits;
1215
+ BYTE const totalBits = llBits+mlBits+ofBits;
1216
+
1217
+ U16 const llNext = llDInfo->nextState;
1218
+ U16 const mlNext = mlDInfo->nextState;
1219
+ U16 const ofNext = ofDInfo->nextState;
1220
+ U32 const llnbBits = llDInfo->nbBits;
1221
+ U32 const mlnbBits = mlDInfo->nbBits;
1222
+ U32 const ofnbBits = ofDInfo->nbBits;
1223
+
1224
+ assert(llBits <= MaxLLBits);
1225
+ assert(mlBits <= MaxMLBits);
1226
+ assert(ofBits <= MaxOff);
1227
+ /*
1228
+ * As gcc has better branch and block analyzers, sometimes it is only
1229
+ * valuable to mark likeliness for clang, it gives around 3-4% of
1230
+ * performance.
1231
+ */
1232
+
1233
+ /* sequence */
1234
+ { size_t offset;
1235
+ if (ofBits > 1) {
1236
+ ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
1237
+ ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
1238
+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
1239
+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
1240
+ if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
1241
+ /* Always read extra bits, this keeps the logic simple,
1242
+ * avoids branches, and avoids accidentally reading 0 bits.
1243
+ */
1244
+ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
1245
+ offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
1246
+ BIT_reloadDStream(&seqState->DStream);
1247
+ offset += BIT_readBitsFast(&seqState->DStream, extraBits);
1248
+ } else {
1249
+ offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
1250
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
1251
+ }
1252
+ seqState->prevOffset[2] = seqState->prevOffset[1];
1253
+ seqState->prevOffset[1] = seqState->prevOffset[0];
1254
+ seqState->prevOffset[0] = offset;
1255
+ } else {
1256
+ U32 const ll0 = (llDInfo->baseValue == 0);
1257
+ if (LIKELY((ofBits == 0))) {
1258
+ offset = seqState->prevOffset[ll0];
1259
+ seqState->prevOffset[1] = seqState->prevOffset[!ll0];
1260
+ seqState->prevOffset[0] = offset;
1261
+ } else {
1262
+ offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
1263
+ { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
1264
+ temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
1265
+ if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
1266
+ seqState->prevOffset[1] = seqState->prevOffset[0];
1267
+ seqState->prevOffset[0] = offset = temp;
1268
+ } } }
1269
+ seq.offset = offset;
944
1270
  }
1271
+
1272
+ if (mlBits > 0)
1273
+ seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
1274
+
1275
+ if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
1276
+ BIT_reloadDStream(&seqState->DStream);
1277
+ if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
1278
+ BIT_reloadDStream(&seqState->DStream);
1279
+ /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
1280
+ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
1281
+
1282
+ if (llBits > 0)
1283
+ seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
1284
+
1285
+ if (MEM_32bits())
1286
+ BIT_reloadDStream(&seqState->DStream);
1287
+
1288
+ DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
1289
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
1290
+
1291
+ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */
1292
+ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */
1293
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
1294
+ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */
945
1295
  }
946
1296
 
947
1297
  return seq;
948
1298
  }
949
1299
 
950
1300
  #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
951
- static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
1301
+ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
952
1302
  {
953
1303
  size_t const windowSize = dctx->fParams.windowSize;
954
1304
  /* No dictionary used. */
@@ -969,6 +1319,7 @@ MEM_STATIC void ZSTD_assertValidSequence(
969
1319
  seq_t const seq,
970
1320
  BYTE const* prefixStart, BYTE const* virtualStart)
971
1321
  {
1322
+ #if DEBUGLEVEL >= 1
972
1323
  size_t const windowSize = dctx->fParams.windowSize;
973
1324
  size_t const sequenceSize = seq.litLength + seq.matchLength;
974
1325
  BYTE const* const oLitEnd = op + seq.litLength;
@@ -986,13 +1337,18 @@ MEM_STATIC void ZSTD_assertValidSequence(
986
1337
  /* Offset must be within our window. */
987
1338
  assert(seq.offset <= windowSize);
988
1339
  }
1340
+ #else
1341
+ (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
1342
+ #endif
989
1343
  }
990
1344
  #endif
991
1345
 
992
1346
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1347
+
1348
+
993
1349
  FORCE_INLINE_TEMPLATE size_t
994
1350
  DONT_VECTORIZE
995
- ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1351
+ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
996
1352
  void* dst, size_t maxDstSize,
997
1353
  const void* seqStart, size_t seqSize, int nbSeq,
998
1354
  const ZSTD_longOffset_e isLongOffset,
@@ -1000,21 +1356,20 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1000
1356
  {
1001
1357
  const BYTE* ip = (const BYTE*)seqStart;
1002
1358
  const BYTE* const iend = ip + seqSize;
1003
- BYTE* const ostart = (BYTE* const)dst;
1359
+ BYTE* const ostart = (BYTE*)dst;
1004
1360
  BYTE* const oend = ostart + maxDstSize;
1005
1361
  BYTE* op = ostart;
1006
1362
  const BYTE* litPtr = dctx->litPtr;
1007
- const BYTE* const litEnd = litPtr + dctx->litSize;
1363
+ const BYTE* litBufferEnd = dctx->litBufferEnd;
1008
1364
  const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
1009
1365
  const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
1010
1366
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
1011
- DEBUGLOG(5, "ZSTD_decompressSequences_body");
1367
+ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer");
1012
1368
  (void)frame;
1013
1369
 
1014
1370
  /* Regen sequences */
1015
1371
  if (nbSeq) {
1016
1372
  seqState_t seqState;
1017
- size_t error = 0;
1018
1373
  dctx->fseEntropy = 1;
1019
1374
  { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
1020
1375
  RETURN_ERROR_IF(
@@ -1030,70 +1385,255 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1030
1385
  BIT_DStream_endOfBuffer < BIT_DStream_completed &&
1031
1386
  BIT_DStream_completed < BIT_DStream_overflow);
1032
1387
 
1388
+ /* decompress without overrunning litPtr begins */
1389
+ {
1390
+ seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1391
+ /* Align the decompression loop to 32 + 16 bytes.
1392
+ *
1393
+ * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
1394
+ * speed swings based on the alignment of the decompression loop. This
1395
+ * performance swing is caused by parts of the decompression loop falling
1396
+ * out of the DSB. The entire decompression loop should fit in the DSB,
1397
+ * when it can't we get much worse performance. You can measure if you've
1398
+ * hit the good case or the bad case with this perf command for some
1399
+ * compressed file test.zst:
1400
+ *
1401
+ * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
1402
+ * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
1403
+ *
1404
+ * If you see most cycles served out of the MITE you've hit the bad case.
1405
+ * If you see most cycles served out of the DSB you've hit the good case.
1406
+ * If it is pretty even then you may be in an okay case.
1407
+ *
1408
+ * This issue has been reproduced on the following CPUs:
1409
+ * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
1410
+ * Use Instruments->Counters to get DSB/MITE cycles.
1411
+ * I never got performance swings, but I was able to
1412
+ * go from the good case of mostly DSB to half of the
1413
+ * cycles served from MITE.
1414
+ * - Coffeelake: Intel i9-9900k
1415
+ * - Coffeelake: Intel i7-9700k
1416
+ *
1417
+ * I haven't been able to reproduce the instability or DSB misses on any
1418
+ * of the following CPUS:
1419
+ * - Haswell
1420
+ * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH
1421
+ * - Skylake
1422
+ *
1423
+ * Alignment is done for each of the three major decompression loops:
1424
+ * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer
1425
+ * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer
1426
+ * - ZSTD_decompressSequences_body
1427
+ * Alignment choices are made to minimize large swings on bad cases and influence on performance
1428
+ * from changes external to this code, rather than to overoptimize on the current commit.
1429
+ *
1430
+ * If you are seeing performance stability this script can help test.
1431
+ * It tests on 4 commits in zstd where I saw performance change.
1432
+ *
1433
+ * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
1434
+ */
1033
1435
  #if defined(__GNUC__) && defined(__x86_64__)
1034
- /* Align the decompression loop to 32 + 16 bytes.
1035
- *
1036
- * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
1037
- * speed swings based on the alignment of the decompression loop. This
1038
- * performance swing is caused by parts of the decompression loop falling
1039
- * out of the DSB. The entire decompression loop should fit in the DSB,
1040
- * when it can't we get much worse performance. You can measure if you've
1041
- * hit the good case or the bad case with this perf command for some
1042
- * compressed file test.zst:
1043
- *
1044
- * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
1045
- * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
1046
- *
1047
- * If you see most cycles served out of the MITE you've hit the bad case.
1048
- * If you see most cycles served out of the DSB you've hit the good case.
1049
- * If it is pretty even then you may be in an okay case.
1050
- *
1051
- * I've been able to reproduce this issue on the following CPUs:
1052
- * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
1053
- * Use Instruments->Counters to get DSB/MITE cycles.
1054
- * I never got performance swings, but I was able to
1055
- * go from the good case of mostly DSB to half of the
1056
- * cycles served from MITE.
1057
- * - Coffeelake: Intel i9-9900k
1058
- *
1059
- * I haven't been able to reproduce the instability or DSB misses on any
1060
- * of the following CPUS:
1061
- * - Haswell
1062
- * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH
1063
- * - Skylake
1064
- *
1065
- * If you are seeing performance stability this script can help test.
1066
- * It tests on 4 commits in zstd where I saw performance change.
1067
- *
1068
- * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
1069
- */
1070
- __asm__(".p2align 5");
1071
- __asm__("nop");
1072
- __asm__(".p2align 4");
1436
+ __asm__(".p2align 6");
1437
+ # if __GNUC__ >= 7
1438
+ /* good for gcc-7, gcc-9, and gcc-11 */
1439
+ __asm__("nop");
1440
+ __asm__(".p2align 5");
1441
+ __asm__("nop");
1442
+ __asm__(".p2align 4");
1443
+ # if __GNUC__ == 8 || __GNUC__ == 10
1444
+ /* good for gcc-8 and gcc-10 */
1445
+ __asm__("nop");
1446
+ __asm__(".p2align 3");
1447
+ # endif
1448
+ # endif
1449
+ #endif
1450
+
1451
+ /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */
1452
+ for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) {
1453
+ size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
1454
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1455
+ assert(!ZSTD_isError(oneSeqSize));
1456
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
1457
+ #endif
1458
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
1459
+ return oneSeqSize;
1460
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
1461
+ op += oneSeqSize;
1462
+ if (UNLIKELY(!--nbSeq))
1463
+ break;
1464
+ BIT_reloadDStream(&(seqState.DStream));
1465
+ sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1466
+ }
1467
+
1468
+ /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */
1469
+ if (nbSeq > 0) {
1470
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
1471
+ if (leftoverLit)
1472
+ {
1473
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
1474
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
1475
+ sequence.litLength -= leftoverLit;
1476
+ op += leftoverLit;
1477
+ }
1478
+ litPtr = dctx->litExtraBuffer;
1479
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
1480
+ dctx->litBufferLocation = ZSTD_not_in_dst;
1481
+ {
1482
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
1483
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1484
+ assert(!ZSTD_isError(oneSeqSize));
1485
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
1073
1486
  #endif
1487
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
1488
+ return oneSeqSize;
1489
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
1490
+ op += oneSeqSize;
1491
+ if (--nbSeq)
1492
+ BIT_reloadDStream(&(seqState.DStream));
1493
+ }
1494
+ }
1495
+ }
1496
+
1497
+ if (nbSeq > 0) /* there is remaining lit from extra buffer */
1498
+ {
1499
+
1500
+ #if defined(__GNUC__) && defined(__x86_64__)
1501
+ __asm__(".p2align 6");
1502
+ __asm__("nop");
1503
+ # if __GNUC__ != 7
1504
+ /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */
1505
+ __asm__(".p2align 4");
1506
+ __asm__("nop");
1507
+ __asm__(".p2align 3");
1508
+ # elif __GNUC__ >= 11
1509
+ __asm__(".p2align 3");
1510
+ # else
1511
+ __asm__(".p2align 5");
1512
+ __asm__("nop");
1513
+ __asm__(".p2align 3");
1514
+ # endif
1515
+ #endif
1516
+
1517
+ for (; ; ) {
1518
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1519
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
1520
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1521
+ assert(!ZSTD_isError(oneSeqSize));
1522
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
1523
+ #endif
1524
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
1525
+ return oneSeqSize;
1526
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
1527
+ op += oneSeqSize;
1528
+ if (UNLIKELY(!--nbSeq))
1529
+ break;
1530
+ BIT_reloadDStream(&(seqState.DStream));
1531
+ }
1532
+ }
1533
+
1534
+ /* check if reached exact end */
1535
+ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);
1536
+ RETURN_ERROR_IF(nbSeq, corruption_detected, "");
1537
+ RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
1538
+ /* save reps for next block */
1539
+ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
1540
+ }
1541
+
1542
+ /* last literal segment */
1543
+ if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
1544
+ {
1545
+ size_t const lastLLSize = litBufferEnd - litPtr;
1546
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
1547
+ if (op != NULL) {
1548
+ ZSTD_memmove(op, litPtr, lastLLSize);
1549
+ op += lastLLSize;
1550
+ }
1551
+ litPtr = dctx->litExtraBuffer;
1552
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
1553
+ dctx->litBufferLocation = ZSTD_not_in_dst;
1554
+ }
1555
+ { size_t const lastLLSize = litBufferEnd - litPtr;
1556
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
1557
+ if (op != NULL) {
1558
+ ZSTD_memcpy(op, litPtr, lastLLSize);
1559
+ op += lastLLSize;
1560
+ }
1561
+ }
1562
+
1563
+ return op-ostart;
1564
+ }
1565
+
1566
+ FORCE_INLINE_TEMPLATE size_t
1567
+ DONT_VECTORIZE
1568
+ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
1569
+ void* dst, size_t maxDstSize,
1570
+ const void* seqStart, size_t seqSize, int nbSeq,
1571
+ const ZSTD_longOffset_e isLongOffset,
1572
+ const int frame)
1573
+ {
1574
+ const BYTE* ip = (const BYTE*)seqStart;
1575
+ const BYTE* const iend = ip + seqSize;
1576
+ BYTE* const ostart = (BYTE*)dst;
1577
+ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer;
1578
+ BYTE* op = ostart;
1579
+ const BYTE* litPtr = dctx->litPtr;
1580
+ const BYTE* const litEnd = litPtr + dctx->litSize;
1581
+ const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
1582
+ const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
1583
+ const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
1584
+ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);
1585
+ (void)frame;
1586
+
1587
+ /* Regen sequences */
1588
+ if (nbSeq) {
1589
+ seqState_t seqState;
1590
+ dctx->fseEntropy = 1;
1591
+ { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
1592
+ RETURN_ERROR_IF(
1593
+ ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)),
1594
+ corruption_detected, "");
1595
+ ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
1596
+ ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
1597
+ ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
1598
+ assert(dst != NULL);
1599
+
1600
+ ZSTD_STATIC_ASSERT(
1601
+ BIT_DStream_unfinished < BIT_DStream_completed &&
1602
+ BIT_DStream_endOfBuffer < BIT_DStream_completed &&
1603
+ BIT_DStream_completed < BIT_DStream_overflow);
1604
+
1605
+ #if defined(__GNUC__) && defined(__x86_64__)
1606
+ __asm__(".p2align 6");
1607
+ __asm__("nop");
1608
+ # if __GNUC__ >= 7
1609
+ __asm__(".p2align 5");
1610
+ __asm__("nop");
1611
+ __asm__(".p2align 3");
1612
+ # else
1613
+ __asm__(".p2align 4");
1614
+ __asm__("nop");
1615
+ __asm__(".p2align 3");
1616
+ # endif
1617
+ #endif
1618
+
1074
1619
  for ( ; ; ) {
1075
- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_noPrefetch);
1620
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1076
1621
  size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
1077
1622
  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1078
1623
  assert(!ZSTD_isError(oneSeqSize));
1079
1624
  if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
1080
1625
  #endif
1626
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
1627
+ return oneSeqSize;
1081
1628
  DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
1629
+ op += oneSeqSize;
1630
+ if (UNLIKELY(!--nbSeq))
1631
+ break;
1082
1632
  BIT_reloadDStream(&(seqState.DStream));
1083
- /* gcc and clang both don't like early returns in this loop.
1084
- * gcc doesn't like early breaks either.
1085
- * Instead save an error and report it at the end.
1086
- * When there is an error, don't increment op, so we don't
1087
- * overwrite.
1088
- */
1089
- if (UNLIKELY(ZSTD_isError(oneSeqSize))) error = oneSeqSize;
1090
- else op += oneSeqSize;
1091
- if (UNLIKELY(!--nbSeq)) break;
1092
1633
  }
1093
1634
 
1094
1635
  /* check if reached exact end */
1095
1636
  DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
1096
- if (ZSTD_isError(error)) return error;
1097
1637
  RETURN_ERROR_IF(nbSeq, corruption_detected, "");
1098
1638
  RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
1099
1639
  /* save reps for next block */
@@ -1104,7 +1644,7 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1104
1644
  { size_t const lastLLSize = litEnd - litPtr;
1105
1645
  RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
1106
1646
  if (op != NULL) {
1107
- memcpy(op, litPtr, lastLLSize);
1647
+ ZSTD_memcpy(op, litPtr, lastLLSize);
1108
1648
  op += lastLLSize;
1109
1649
  }
1110
1650
  }
@@ -1121,9 +1661,37 @@ ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
1121
1661
  {
1122
1662
  return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1123
1663
  }
1664
+
1665
+ static size_t
1666
+ ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx,
1667
+ void* dst, size_t maxDstSize,
1668
+ const void* seqStart, size_t seqSize, int nbSeq,
1669
+ const ZSTD_longOffset_e isLongOffset,
1670
+ const int frame)
1671
+ {
1672
+ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1673
+ }
1124
1674
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
1125
1675
 
1126
1676
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
1677
+
1678
+ FORCE_INLINE_TEMPLATE size_t
1679
+ ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
1680
+ const BYTE* const prefixStart, const BYTE* const dictEnd)
1681
+ {
1682
+ prefetchPos += sequence.litLength;
1683
+ { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;
1684
+ const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
1685
+ * No consequence though : memory address is only used for prefetching, not for dereferencing */
1686
+ PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
1687
+ }
1688
+ return prefetchPos + sequence.matchLength;
1689
+ }
1690
+
1691
+ /* This decoding function employs prefetching
1692
+ * to reduce latency impact of cache misses.
1693
+ * It's generally employed when block contains a significant portion of long-distance matches
1694
+ * or when coupled with a "cold" dictionary */
1127
1695
  FORCE_INLINE_TEMPLATE size_t
1128
1696
  ZSTD_decompressSequencesLong_body(
1129
1697
  ZSTD_DCtx* dctx,
@@ -1134,11 +1702,11 @@ ZSTD_decompressSequencesLong_body(
1134
1702
  {
1135
1703
  const BYTE* ip = (const BYTE*)seqStart;
1136
1704
  const BYTE* const iend = ip + seqSize;
1137
- BYTE* const ostart = (BYTE* const)dst;
1138
- BYTE* const oend = ostart + maxDstSize;
1705
+ BYTE* const ostart = (BYTE*)dst;
1706
+ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize;
1139
1707
  BYTE* op = ostart;
1140
1708
  const BYTE* litPtr = dctx->litPtr;
1141
- const BYTE* const litEnd = litPtr + dctx->litSize;
1709
+ const BYTE* litBufferEnd = dctx->litBufferEnd;
1142
1710
  const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
1143
1711
  const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
1144
1712
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
@@ -1146,18 +1714,17 @@ ZSTD_decompressSequencesLong_body(
1146
1714
 
1147
1715
  /* Regen sequences */
1148
1716
  if (nbSeq) {
1149
- #define STORED_SEQS 4
1717
+ #define STORED_SEQS 8
1150
1718
  #define STORED_SEQS_MASK (STORED_SEQS-1)
1151
- #define ADVANCED_SEQS 4
1719
+ #define ADVANCED_SEQS STORED_SEQS
1152
1720
  seq_t sequences[STORED_SEQS];
1153
1721
  int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
1154
1722
  seqState_t seqState;
1155
1723
  int seqNb;
1724
+ size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */
1725
+
1156
1726
  dctx->fseEntropy = 1;
1157
1727
  { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
1158
- seqState.prefixStart = prefixStart;
1159
- seqState.pos = (size_t)(op-prefixStart);
1160
- seqState.dictEnd = dictEnd;
1161
1728
  assert(dst != NULL);
1162
1729
  assert(iend >= ip);
1163
1730
  RETURN_ERROR_IF(
@@ -1169,36 +1736,100 @@ ZSTD_decompressSequencesLong_body(
1169
1736
 
1170
1737
  /* prepare in advance */
1171
1738
  for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
1172
- sequences[seqNb] = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
1173
- PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
1739
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1740
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
1741
+ sequences[seqNb] = sequence;
1174
1742
  }
1175
1743
  RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");
1176
1744
 
1177
- /* decode and decompress */
1178
- for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
1179
- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
1180
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
1745
+ /* decompress without stomping litBuffer */
1746
+ for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) {
1747
+ seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1748
+ size_t oneSeqSize;
1749
+
1750
+ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd)
1751
+ {
1752
+ /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */
1753
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
1754
+ if (leftoverLit)
1755
+ {
1756
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
1757
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
1758
+ sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit;
1759
+ op += leftoverLit;
1760
+ }
1761
+ litPtr = dctx->litExtraBuffer;
1762
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
1763
+ dctx->litBufferLocation = ZSTD_not_in_dst;
1764
+ oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
1181
1765
  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1182
- assert(!ZSTD_isError(oneSeqSize));
1183
- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
1766
+ assert(!ZSTD_isError(oneSeqSize));
1767
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
1184
1768
  #endif
1185
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1186
- PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
1187
- sequences[seqNb & STORED_SEQS_MASK] = sequence;
1188
- op += oneSeqSize;
1769
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1770
+
1771
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
1772
+ sequences[seqNb & STORED_SEQS_MASK] = sequence;
1773
+ op += oneSeqSize;
1774
+ }
1775
+ else
1776
+ {
1777
+ /* lit buffer is either wholly contained in first or second split, or not split at all*/
1778
+ oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
1779
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
1780
+ ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
1781
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1782
+ assert(!ZSTD_isError(oneSeqSize));
1783
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
1784
+ #endif
1785
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1786
+
1787
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
1788
+ sequences[seqNb & STORED_SEQS_MASK] = sequence;
1789
+ op += oneSeqSize;
1790
+ }
1189
1791
  }
1190
1792
  RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");
1191
1793
 
1192
1794
  /* finish queue */
1193
1795
  seqNb -= seqAdvance;
1194
1796
  for ( ; seqNb<nbSeq ; seqNb++) {
1195
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
1797
+ seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]);
1798
+ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd)
1799
+ {
1800
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
1801
+ if (leftoverLit)
1802
+ {
1803
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
1804
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
1805
+ sequence->litLength -= leftoverLit;
1806
+ op += leftoverLit;
1807
+ }
1808
+ litPtr = dctx->litExtraBuffer;
1809
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
1810
+ dctx->litBufferLocation = ZSTD_not_in_dst;
1811
+ {
1812
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
1196
1813
  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1197
- assert(!ZSTD_isError(oneSeqSize));
1198
- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
1814
+ assert(!ZSTD_isError(oneSeqSize));
1815
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
1199
1816
  #endif
1200
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1201
- op += oneSeqSize;
1817
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1818
+ op += oneSeqSize;
1819
+ }
1820
+ }
1821
+ else
1822
+ {
1823
+ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
1824
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
1825
+ ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
1826
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
1827
+ assert(!ZSTD_isError(oneSeqSize));
1828
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
1829
+ #endif
1830
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
1831
+ op += oneSeqSize;
1832
+ }
1202
1833
  }
1203
1834
 
1204
1835
  /* save reps for next block */
@@ -1206,10 +1837,21 @@ ZSTD_decompressSequencesLong_body(
1206
1837
  }
1207
1838
 
1208
1839
  /* last literal segment */
1209
- { size_t const lastLLSize = litEnd - litPtr;
1840
+ if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */
1841
+ {
1842
+ size_t const lastLLSize = litBufferEnd - litPtr;
1843
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
1844
+ if (op != NULL) {
1845
+ ZSTD_memmove(op, litPtr, lastLLSize);
1846
+ op += lastLLSize;
1847
+ }
1848
+ litPtr = dctx->litExtraBuffer;
1849
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
1850
+ }
1851
+ { size_t const lastLLSize = litBufferEnd - litPtr;
1210
1852
  RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
1211
1853
  if (op != NULL) {
1212
- memcpy(op, litPtr, lastLLSize);
1854
+ ZSTD_memmove(op, litPtr, lastLLSize);
1213
1855
  op += lastLLSize;
1214
1856
  }
1215
1857
  }
@@ -1233,7 +1875,7 @@ ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
1233
1875
  #if DYNAMIC_BMI2
1234
1876
 
1235
1877
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1236
- static TARGET_ATTRIBUTE("bmi2") size_t
1878
+ static BMI2_TARGET_ATTRIBUTE size_t
1237
1879
  DONT_VECTORIZE
1238
1880
  ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
1239
1881
  void* dst, size_t maxDstSize,
@@ -1243,10 +1885,20 @@ ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
1243
1885
  {
1244
1886
  return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1245
1887
  }
1888
+ static BMI2_TARGET_ATTRIBUTE size_t
1889
+ DONT_VECTORIZE
1890
+ ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx,
1891
+ void* dst, size_t maxDstSize,
1892
+ const void* seqStart, size_t seqSize, int nbSeq,
1893
+ const ZSTD_longOffset_e isLongOffset,
1894
+ const int frame)
1895
+ {
1896
+ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1897
+ }
1246
1898
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
1247
1899
 
1248
1900
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
1249
- static TARGET_ATTRIBUTE("bmi2") size_t
1901
+ static BMI2_TARGET_ATTRIBUTE size_t
1250
1902
  ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
1251
1903
  void* dst, size_t maxDstSize,
1252
1904
  const void* seqStart, size_t seqSize, int nbSeq,
@@ -1275,11 +1927,25 @@ ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
1275
1927
  {
1276
1928
  DEBUGLOG(5, "ZSTD_decompressSequences");
1277
1929
  #if DYNAMIC_BMI2
1278
- if (dctx->bmi2) {
1930
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
1279
1931
  return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1280
1932
  }
1281
1933
  #endif
1282
- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1934
+ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1935
+ }
1936
+ static size_t
1937
+ ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
1938
+ const void* seqStart, size_t seqSize, int nbSeq,
1939
+ const ZSTD_longOffset_e isLongOffset,
1940
+ const int frame)
1941
+ {
1942
+ DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
1943
+ #if DYNAMIC_BMI2
1944
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
1945
+ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1946
+ }
1947
+ #endif
1948
+ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1283
1949
  }
1284
1950
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
1285
1951
 
@@ -1299,7 +1965,7 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
1299
1965
  {
1300
1966
  DEBUGLOG(5, "ZSTD_decompressSequencesLong");
1301
1967
  #if DYNAMIC_BMI2
1302
- if (dctx->bmi2) {
1968
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
1303
1969
  return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
1304
1970
  }
1305
1971
  #endif
@@ -1308,55 +1974,101 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
1308
1974
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
1309
1975
 
1310
1976
 
1977
+ /**
1978
+ * @returns The total size of the history referenceable by zstd, including
1979
+ * both the prefix and the extDict. At @p op any offset larger than this
1980
+ * is invalid.
1981
+ */
1982
+ static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart)
1983
+ {
1984
+ return (size_t)(op - virtualStart);
1985
+ }
1311
1986
 
1312
- #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
1313
- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
1314
- /* ZSTD_getLongOffsetsShare() :
1987
+ typedef struct {
1988
+ unsigned longOffsetShare;
1989
+ unsigned maxNbAdditionalBits;
1990
+ } ZSTD_OffsetInfo;
1991
+
1992
+ /* ZSTD_getOffsetInfo() :
1315
1993
  * condition : offTable must be valid
1316
1994
  * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
1317
- * compared to maximum possible of (1<<OffFSELog) */
1318
- static unsigned
1319
- ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
1995
+ * compared to maximum possible of (1<<OffFSELog),
1996
+ * as well as the maximum number additional bits required.
1997
+ */
1998
+ static ZSTD_OffsetInfo
1999
+ ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq)
1320
2000
  {
1321
- const void* ptr = offTable;
1322
- U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
1323
- const ZSTD_seqSymbol* table = offTable + 1;
1324
- U32 const max = 1 << tableLog;
1325
- U32 u, total = 0;
1326
- DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
1327
-
1328
- assert(max <= (1 << OffFSELog)); /* max not too large */
1329
- for (u=0; u<max; u++) {
1330
- if (table[u].nbAdditionalBits > 22) total += 1;
2001
+ ZSTD_OffsetInfo info = {0, 0};
2002
+ /* If nbSeq == 0, then the offTable is uninitialized, but we have
2003
+ * no sequences, so both values should be 0.
2004
+ */
2005
+ if (nbSeq != 0) {
2006
+ const void* ptr = offTable;
2007
+ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
2008
+ const ZSTD_seqSymbol* table = offTable + 1;
2009
+ U32 const max = 1 << tableLog;
2010
+ U32 u;
2011
+ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
2012
+
2013
+ assert(max <= (1 << OffFSELog)); /* max not too large */
2014
+ for (u=0; u<max; u++) {
2015
+ info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits);
2016
+ if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1;
2017
+ }
2018
+
2019
+ assert(tableLog <= OffFSELog);
2020
+ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */
1331
2021
  }
1332
2022
 
1333
- assert(tableLog <= OffFSELog);
1334
- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */
2023
+ return info;
2024
+ }
1335
2025
 
1336
- return total;
2026
+ /**
2027
+ * @returns The maximum offset we can decode in one read of our bitstream, without
2028
+ * reloading more bits in the middle of the offset bits read. Any offsets larger
2029
+ * than this must use the long offset decoder.
2030
+ */
2031
+ static size_t ZSTD_maxShortOffset(void)
2032
+ {
2033
+ if (MEM_64bits()) {
2034
+ /* We can decode any offset without reloading bits.
2035
+ * This might change if the max window size grows.
2036
+ */
2037
+ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
2038
+ return (size_t)-1;
2039
+ } else {
2040
+ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1.
2041
+ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits.
2042
+ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset.
2043
+ */
2044
+ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;
2045
+ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM;
2046
+ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN);
2047
+ return maxOffset;
2048
+ }
1337
2049
  }
1338
- #endif
1339
2050
 
1340
2051
  size_t
1341
2052
  ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1342
2053
  void* dst, size_t dstCapacity,
1343
- const void* src, size_t srcSize, const int frame)
2054
+ const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
1344
2055
  { /* blockType == blockCompressed */
1345
2056
  const BYTE* ip = (const BYTE*)src;
1346
- /* isLongOffset must be true if there are long offsets.
1347
- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
1348
- * We don't expect that to be the case in 64-bit mode.
1349
- * In block mode, window size is not known, so we have to be conservative.
1350
- * (note: but it could be evaluated from current-lowLimit)
1351
- */
1352
- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
1353
2057
  DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
1354
2058
 
1355
- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
2059
+ /* Note : the wording of the specification
2060
+ * allows compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX.
2061
+ * This generally does not happen, as it makes little sense,
2062
+ * since an uncompressed block would feature same size and have no decompression cost.
2063
+ * Also, note that decoder from reference libzstd before < v1.5.4
2064
+ * would consider this edge case as an error.
2065
+ * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX
2066
+ * for broader compatibility with the deployed ecosystem of zstd decoders */
2067
+ RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
1356
2068
 
1357
2069
  /* Decode literals section */
1358
- { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
1359
- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
2070
+ { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
2071
+ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize);
1360
2072
  if (ZSTD_isError(litCSize)) return litCSize;
1361
2073
  ip += litCSize;
1362
2074
  srcSize -= litCSize;
@@ -1364,6 +2076,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1364
2076
 
1365
2077
  /* Build Decoding Tables */
1366
2078
  {
2079
+ /* Compute the maximum block size, which must also work when !frame and fParams are unset.
2080
+ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
2081
+ */
2082
+ size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX));
2083
+ size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart);
2084
+ /* isLongOffset must be true if there are long offsets.
2085
+ * Offsets are long if they are larger than ZSTD_maxShortOffset().
2086
+ * We don't expect that to be the case in 64-bit mode.
2087
+ *
2088
+ * We check here to see if our history is large enough to allow long offsets.
2089
+ * If it isn't, then we can't possible have (valid) long offsets. If the offset
2090
+ * is invalid, then it is okay to read it incorrectly.
2091
+ *
2092
+ * If isLongOffsets is true, then we will later check our decoding table to see
2093
+ * if it is even possible to generate long offsets.
2094
+ */
2095
+ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset()));
1367
2096
  /* These macros control at build-time which decompressor implementation
1368
2097
  * we use. If neither is defined, we do some inspection and dispatch at
1369
2098
  * runtime.
@@ -1371,6 +2100,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1371
2100
  #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
1372
2101
  !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
1373
2102
  int usePrefetchDecoder = dctx->ddictIsCold;
2103
+ #else
2104
+ /* Set to 1 to avoid computing offset info if we don't need to.
2105
+ * Otherwise this value is ignored.
2106
+ */
2107
+ int usePrefetchDecoder = 1;
1374
2108
  #endif
1375
2109
  int nbSeq;
1376
2110
  size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
@@ -1378,40 +2112,57 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
1378
2112
  ip += seqHSize;
1379
2113
  srcSize -= seqHSize;
1380
2114
 
1381
- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
2115
+ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
2116
+ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall,
2117
+ "invalid dst");
1382
2118
 
1383
- #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
1384
- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
1385
- if ( !usePrefetchDecoder
1386
- && (!frame || (dctx->fParams.windowSize > (1<<24)))
1387
- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */
1388
- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
1389
- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
1390
- usePrefetchDecoder = (shareLongOffsets >= minShare);
2119
+ /* If we could potentially have long offsets, or we might want to use the prefetch decoder,
2120
+ * compute information about the share of long offsets, and the maximum nbAdditionalBits.
2121
+ * NOTE: could probably use a larger nbSeq limit
2122
+ */
2123
+ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {
2124
+ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq);
2125
+ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) {
2126
+ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small
2127
+ * enough, then we know it is impossible to have too long an offset in this block, so we can
2128
+ * use the regular offset decoder.
2129
+ */
2130
+ isLongOffset = ZSTD_lo_isRegularOffset;
2131
+ }
2132
+ if (!usePrefetchDecoder) {
2133
+ U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
2134
+ usePrefetchDecoder = (info.longOffsetShare >= minShare);
2135
+ }
1391
2136
  }
1392
- #endif
1393
2137
 
1394
2138
  dctx->ddictIsCold = 0;
1395
2139
 
1396
2140
  #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
1397
2141
  !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
1398
- if (usePrefetchDecoder)
2142
+ if (usePrefetchDecoder) {
2143
+ #else
2144
+ (void)usePrefetchDecoder;
2145
+ {
1399
2146
  #endif
1400
2147
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
1401
2148
  return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
1402
2149
  #endif
2150
+ }
1403
2151
 
1404
2152
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1405
2153
  /* else */
1406
- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
2154
+ if (dctx->litBufferLocation == ZSTD_split)
2155
+ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
2156
+ else
2157
+ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
1407
2158
  #endif
1408
2159
  }
1409
2160
  }
1410
2161
 
1411
2162
 
1412
- void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
2163
+ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
1413
2164
  {
1414
- if (dst != dctx->previousDstEnd) { /* not contiguous */
2165
+ if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */
1415
2166
  dctx->dictEnd = dctx->previousDstEnd;
1416
2167
  dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
1417
2168
  dctx->prefixStart = dst;
@@ -1420,13 +2171,22 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
1420
2171
  }
1421
2172
 
1422
2173
 
1423
- size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
1424
- void* dst, size_t dstCapacity,
1425
- const void* src, size_t srcSize)
2174
+ size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
2175
+ void* dst, size_t dstCapacity,
2176
+ const void* src, size_t srcSize)
1426
2177
  {
1427
2178
  size_t dSize;
1428
- ZSTD_checkContinuity(dctx, dst);
1429
- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0);
2179
+ ZSTD_checkContinuity(dctx, dst, dstCapacity);
2180
+ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming);
1430
2181
  dctx->previousDstEnd = (char*)dst + dSize;
1431
2182
  return dSize;
1432
2183
  }
2184
+
2185
+
2186
+ /* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */
2187
+ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
2188
+ void* dst, size_t dstCapacity,
2189
+ const void* src, size_t srcSize)
2190
+ {
2191
+ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize);
2192
+ }