zstdlib 0.8.0-x64-mingw32 → 0.9.0-x64-mingw32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (128)
  1. checksums.yaml +4 -4
  2. data/CHANGES.md +10 -0
  3. data/README.md +7 -1
  4. data/Rakefile +38 -8
  5. data/ext/{zstdlib → zstdlib_c}/extconf.rb +10 -5
  6. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.2/zstdlib.c +2 -2
  7. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.3/zstdlib.c +2 -2
  8. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.4/zstdlib.c +2 -2
  9. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.5/zstdlib.c +2 -2
  10. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.6/zstdlib.c +2 -2
  11. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.7/zstdlib.c +2 -2
  12. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-3.0/zstdlib.c +2 -2
  13. data/ext/zstdlib_c/ruby/zlib-3.1/zstdlib.c +5076 -0
  14. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/adler32.c +0 -0
  15. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/compress.c +0 -0
  16. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/crc32.c +0 -0
  17. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/crc32.h +0 -0
  18. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/deflate.c +0 -0
  19. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/deflate.h +0 -0
  20. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/gzclose.c +0 -0
  21. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/gzguts.h +0 -0
  22. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/gzlib.c +0 -0
  23. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/gzread.c +0 -0
  24. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/gzwrite.c +0 -0
  25. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/infback.c +0 -0
  26. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inffast.c +0 -0
  27. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inffast.h +0 -0
  28. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inffixed.h +0 -0
  29. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inflate.c +0 -0
  30. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inflate.h +0 -0
  31. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inftrees.c +0 -0
  32. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/inftrees.h +0 -0
  33. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/trees.c +0 -0
  34. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/trees.h +0 -0
  35. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/uncompr.c +0 -0
  36. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/zconf.h +0 -0
  37. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/zlib.h +0 -0
  38. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/zutil.c +0 -0
  39. data/ext/{zstdlib → zstdlib_c}/zlib-1.2.11/zutil.h +0 -0
  40. data/ext/{zstdlib → zstdlib_c}/zlib.mk +0 -0
  41. data/ext/{zstdlib → zstdlib_c}/zlibwrapper/zlibwrapper.c +1 -5
  42. data/ext/{zstdlib → zstdlib_c}/zlibwrapper.mk +0 -0
  43. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/bitstream.h +24 -9
  44. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/compiler.h +89 -43
  45. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/cpu.h +0 -0
  46. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/debug.c +0 -0
  47. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/debug.h +0 -0
  48. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/entropy_common.c +11 -5
  49. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/error_private.c +0 -0
  50. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/error_private.h +79 -0
  51. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/fse.h +2 -1
  52. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/fse_decompress.c +1 -1
  53. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/huf.h +24 -22
  54. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/mem.h +18 -0
  55. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/pool.c +11 -6
  56. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/pool.h +2 -2
  57. data/ext/zstdlib_c/zstd-1.5.2/lib/common/portability_macros.h +137 -0
  58. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/threading.c +0 -0
  59. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/threading.h +0 -0
  60. data/ext/zstdlib_c/zstd-1.5.2/lib/common/xxhash.c +24 -0
  61. data/ext/zstdlib_c/zstd-1.5.2/lib/common/xxhash.h +5686 -0
  62. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/zstd_common.c +0 -0
  63. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/zstd_deps.h +0 -0
  64. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/zstd_internal.h +95 -92
  65. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/common/zstd_trace.h +12 -3
  66. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/clevels.h +134 -0
  67. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/fse_compress.c +63 -27
  68. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/hist.c +0 -0
  69. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/hist.h +0 -0
  70. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/huf_compress.c +537 -104
  71. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress.c +307 -373
  72. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_internal.h +174 -83
  73. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_literals.c +4 -3
  74. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_literals.h +3 -1
  75. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_sequences.c +15 -14
  76. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_sequences.h +0 -0
  77. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_superblock.c +4 -3
  78. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_superblock.h +0 -0
  79. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_cwksp.h +41 -27
  80. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_double_fast.c +295 -120
  81. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_double_fast.h +0 -0
  82. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_fast.c +309 -130
  83. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_fast.h +0 -0
  84. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_lazy.c +482 -562
  85. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_lazy.h +0 -0
  86. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_ldm.c +9 -7
  87. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_ldm.h +1 -1
  88. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_ldm_geartab.h +4 -1
  89. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_opt.c +249 -148
  90. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_opt.h +0 -0
  91. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstdmt_compress.c +76 -38
  92. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/compress/zstdmt_compress.h +4 -1
  93. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/huf_decompress.c +727 -189
  94. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/huf_decompress_amd64.S +585 -0
  95. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_ddict.c +0 -0
  96. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_ddict.h +0 -0
  97. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress.c +85 -22
  98. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress_block.c +744 -220
  99. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress_block.h +8 -2
  100. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress_internal.h +34 -3
  101. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/zdict.h +4 -4
  102. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/zstd.h +179 -136
  103. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/lib/zstd_errors.h +0 -0
  104. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzclose.c +0 -0
  105. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzcompatibility.h +0 -0
  106. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzguts.h +0 -0
  107. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzlib.c +0 -0
  108. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzread.c +0 -0
  109. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzwrite.c +0 -0
  110. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/zstd_zlibwrapper.c +7 -0
  111. data/ext/{zstdlib/zstd-1.5.0 → zstdlib_c/zstd-1.5.2}/zlibWrapper/zstd_zlibwrapper.h +0 -0
  112. data/ext/zstdlib_c/zstd.mk +15 -0
  113. data/lib/2.4/zstdlib_c.so +0 -0
  114. data/lib/2.5/zstdlib_c.so +0 -0
  115. data/lib/2.6/zstdlib_c.so +0 -0
  116. data/lib/2.7/zstdlib_c.so +0 -0
  117. data/lib/3.0/zstdlib_c.so +0 -0
  118. data/lib/zstdlib.rb +2 -2
  119. metadata +124 -121
  120. data/ext/zstdlib/zstd-1.5.0/lib/common/xxhash.c +0 -824
  121. data/ext/zstdlib/zstd-1.5.0/lib/common/xxhash.h +0 -285
  122. data/ext/zstdlib/zstd.mk +0 -14
  123. data/lib/2.2/zstdlib.so +0 -0
  124. data/lib/2.3/zstdlib.so +0 -0
  125. data/lib/2.4/zstdlib.so +0 -0
  126. data/lib/2.5/zstdlib.so +0 -0
  127. data/lib/2.6/zstdlib.so +0 -0
  128. data/lib/2.7/zstdlib.so +0 -0
@@ -69,15 +69,56 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
  }
  }

+ /* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */
+ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
+ const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
+ {
+ if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH)
+ {
+ /* room for litbuffer to fit without read faulting */
+ dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH;
+ dctx->litBufferEnd = dctx->litBuffer + litSize;
+ dctx->litBufferLocation = ZSTD_in_dst;
+ }
+ else if (litSize > ZSTD_LITBUFFEREXTRASIZE)
+ {
+ /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
+ if (splitImmediately) {
+ /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
+ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
+ dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
+ }
+ else {
+ /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */
+ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
+ dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
+ }
+ dctx->litBufferLocation = ZSTD_split;
+ }
+ else
+ {
+ /* fits entirely within litExtraBuffer, so no split is necessary */
+ dctx->litBuffer = dctx->litExtraBuffer;
+ dctx->litBufferEnd = dctx->litBuffer + litSize;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ }
+ }

  /* Hidden declaration for fullbench */
  size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
- const void* src, size_t srcSize);
+ const void* src, size_t srcSize,
+ void* dst, size_t dstCapacity, const streaming_operation streaming);
  /*! ZSTD_decodeLiteralsBlock() :
+ * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
+ * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current
+ * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being
+ * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write.
+ *
  * @return : nb of bytes read from src (< srcSize )
  * note : symbol not declared but exposed for fullbench */
  size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
- const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */
+ const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */
+ void* dst, size_t dstCapacity, const streaming_operation streaming)
  {
  DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
  RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");
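The headline change in the hunk above is the new ZSTD_allocateLiteralsBuffer(), which picks one of three homes for decoded literals: the spare tail of dst, a split between the end of dst and the small litExtraBuffer, or litExtraBuffer alone. The standalone C sketch below mirrors only the shape of that decision tree; the constants, type, and function names are invented for illustration and are not the library's real definitions.

#include <stddef.h>
#include <stdio.h>

/* Toy stand-ins for the zstd-internal constants referenced above
 * (illustrative values only, not the real ones). */
#define BLOCKSIZE_MAX  (128 * 1024)
#define OVERLENGTH     32
#define LIT_EXTRA_SIZE (64 * 1024)

typedef enum { LIT_IN_DST, LIT_SPLIT, LIT_NOT_IN_DST } lit_location;

/* Mirrors the decision tree of ZSTD_allocateLiteralsBuffer():
 * 1) enough spare room after the block inside dst -> literals live in dst;
 * 2) too big for the small extra buffer           -> split dst/extra;
 * 3) otherwise                                    -> extra buffer only. */
static lit_location choose_lit_location(size_t dstCapacity, size_t litSize,
                                        int streaming)
{
    if (!streaming
        && dstCapacity > BLOCKSIZE_MAX + OVERLENGTH + litSize + OVERLENGTH)
        return LIT_IN_DST;
    if (litSize > LIT_EXTRA_SIZE)
        return LIT_SPLIT;
    return LIT_NOT_IN_DST;
}

int main(void)
{
    printf("%d\n", choose_lit_location(4u << 20, 100u << 10, 0));   /* 0: in dst */
    printf("%d\n", choose_lit_location(128u << 10, 100u << 10, 0)); /* 1: split */
    printf("%d\n", choose_lit_location(128u << 10, 1u << 10, 1));   /* 2: extra */
    return 0;
}

The dst-resident placements exist so that, whenever the caller's output buffer already has room past the current block, the decoder can skip the fixed intermediate buffer and the extra copy out of it.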
@@ -90,7 +131,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  case set_repeat:
  DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
  RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
- /* fall-through */
+ ZSTD_FALLTHROUGH;

  case set_compressed:
  RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
@@ -99,6 +140,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  U32 const lhlCode = (istart[0] >> 2) & 3;
  U32 const lhc = MEM_readLE32(istart);
  size_t hufSuccess;
+ size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
  switch(lhlCode)
  {
  case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -121,8 +163,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
  break;
  }
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
  RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
  RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
+ RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);

  /* prefetch huffman table if cold */
  if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
@@ -133,11 +178,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  if (singleStream) {
  hufSuccess = HUF_decompress1X_usingDTable_bmi2(
  dctx->litBuffer, litSize, istart+lhSize, litCSize,
- dctx->HUFptr, dctx->bmi2);
+ dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
  } else {
  hufSuccess = HUF_decompress4X_usingDTable_bmi2(
  dctx->litBuffer, litSize, istart+lhSize, litCSize,
- dctx->HUFptr, dctx->bmi2);
+ dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
  }
  } else {
  if (singleStream) {
@@ -150,15 +195,22 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
  istart+lhSize, litCSize, dctx->workspace,
- sizeof(dctx->workspace), dctx->bmi2);
+ sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
  #endif
  } else {
  hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
  istart+lhSize, litCSize, dctx->workspace,
- sizeof(dctx->workspace), dctx->bmi2);
+ sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
  }
  }
+ if (dctx->litBufferLocation == ZSTD_split)
+ {
+ ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
+ ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);
+ dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
+ dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
+ }

  RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");

@@ -166,13 +218,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  dctx->litSize = litSize;
  dctx->litEntropy = 1;
  if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
- ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
  return litCSize + lhSize;
  }

  case set_basic:
  { size_t litSize, lhSize;
  U32 const lhlCode = ((istart[0]) >> 2) & 3;
+ size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
  switch(lhlCode)
  {
  case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -189,23 +241,36 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  break;
  }

+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+ RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
  if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */
  RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
- ZSTD_memcpy(dctx->litBuffer, istart+lhSize, litSize);
+ if (dctx->litBufferLocation == ZSTD_split)
+ {
+ ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE);
+ ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
+ }
+ else
+ {
+ ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize);
+ }
  dctx->litPtr = dctx->litBuffer;
  dctx->litSize = litSize;
- ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
  return lhSize+litSize;
  }
  /* direct reference into compressed stream */
  dctx->litPtr = istart+lhSize;
  dctx->litSize = litSize;
+ dctx->litBufferEnd = dctx->litPtr + litSize;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
  return lhSize+litSize;
  }

  case set_rle:
  { U32 const lhlCode = ((istart[0]) >> 2) & 3;
  size_t litSize, lhSize;
+ size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
  switch(lhlCode)
  {
  case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -222,8 +287,19 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
  break;
  }
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
  RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
- ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
+ RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
+ if (dctx->litBufferLocation == ZSTD_split)
+ {
+ ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE);
+ ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE);
+ }
+ else
+ {
+ ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize);
+ }
  dctx->litPtr = dctx->litBuffer;
  dctx->litSize = litSize;
  return lhSize+1;
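For RLE literals, the hunk above now fills the two halves of a split buffer separately. Below is a hedged sketch of just that split, with a toy size standing in for ZSTD_LITBUFFEREXTRASIZE and invented names throughout.

#include <stdio.h>
#include <string.h>

enum { EXTRA_SIZE = 8 };  /* toy stand-in for ZSTD_LITBUFFEREXTRASIZE */

/* When the RLE run exceeds the small extra buffer, the head is memset
 * into litBuffer and the EXTRA_SIZE-byte tail into litExtraBuffer,
 * matching the branch structure of the set_rle case above. */
static void rle_fill_split(char* litBuffer, char* litExtraBuffer,
                           char byte, size_t litSize)
{
    if (litSize > EXTRA_SIZE) {
        memset(litBuffer, byte, litSize - EXTRA_SIZE);
        memset(litExtraBuffer, byte, EXTRA_SIZE);
    } else {
        memset(litBuffer, byte, litSize);  /* fits in a single buffer */
    }
}

int main(void)
{
    char lit[32] = {0};
    char extra[EXTRA_SIZE + 1] = {0};
    rle_fill_split(lit, extra, 'x', 20);
    printf("%s | %s\n", lit, extra);  /* 12 x's | 8 x's */
    return 0;
}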
@@ -343,7 +419,7 @@ static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
  }; /* ML_defaultDTable */


- static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits)
+ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits)
  {
  void* ptr = dt;
  ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
@@ -355,7 +431,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
  cell->nbBits = 0;
  cell->nextState = 0;
  assert(nbAddBits < 255);
- cell->nbAdditionalBits = (BYTE)nbAddBits;
+ cell->nbAdditionalBits = nbAddBits;
  cell->baseValue = baseValue;
  }

@@ -367,7 +443,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
  FORCE_INLINE_TEMPLATE
  void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
  const short* normalizedCounter, unsigned maxSymbolValue,
- const U32* baseValue, const U32* nbAdditionalBits,
+ const U32* baseValue, const U8* nbAdditionalBits,
  unsigned tableLog, void* wksp, size_t wkspSize)
  {
  ZSTD_seqSymbol* const tableDecode = dt+1;
@@ -478,7 +554,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
  tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
  tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
  assert(nbAdditionalBits[symbol] < 255);
- tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
+ tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
  tableDecode[u].baseValue = baseValue[symbol];
  }
  }
@@ -487,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
  /* Avoids the FORCE_INLINE of the _body() function. */
  static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
  const short* normalizedCounter, unsigned maxSymbolValue,
- const U32* baseValue, const U32* nbAdditionalBits,
+ const U32* baseValue, const U8* nbAdditionalBits,
  unsigned tableLog, void* wksp, size_t wkspSize)
  {
  ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
@@ -495,9 +571,9 @@ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
  }

  #if DYNAMIC_BMI2
- TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
+ BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
  const short* normalizedCounter, unsigned maxSymbolValue,
- const U32* baseValue, const U32* nbAdditionalBits,
+ const U32* baseValue, const U8* nbAdditionalBits,
  unsigned tableLog, void* wksp, size_t wkspSize)
  {
  ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
@@ -507,7 +583,7 @@ TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol

  void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
  const short* normalizedCounter, unsigned maxSymbolValue,
- const U32* baseValue, const U32* nbAdditionalBits,
+ const U32* baseValue, const U8* nbAdditionalBits,
  unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
  {
  #if DYNAMIC_BMI2
@@ -529,7 +605,7 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
  static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
  symbolEncodingType_e type, unsigned max, U32 maxLog,
  const void* src, size_t srcSize,
- const U32* baseValue, const U32* nbAdditionalBits,
+ const U32* baseValue, const U8* nbAdditionalBits,
  const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
  int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
  int bmi2)
@@ -541,7 +617,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
  RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
  { U32 const symbol = *(const BYTE*)src;
  U32 const baseline = baseValue[symbol];
- U32 const nbBits = nbAdditionalBits[symbol];
+ U8 const nbBits = nbAdditionalBits[symbol];
  ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
  }
  *DTablePtr = DTableSpace;
@@ -620,7 +696,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
  LL_defaultDTable, dctx->fseEntropy,
  dctx->ddictIsCold, nbSeq,
  dctx->workspace, sizeof(dctx->workspace),
- dctx->bmi2);
+ ZSTD_DCtx_get_bmi2(dctx));
  RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
  ip += llhSize;
  }
@@ -632,7 +708,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
  OF_defaultDTable, dctx->fseEntropy,
  dctx->ddictIsCold, nbSeq,
  dctx->workspace, sizeof(dctx->workspace),
- dctx->bmi2);
+ ZSTD_DCtx_get_bmi2(dctx));
  RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
  ip += ofhSize;
  }
@@ -644,7 +720,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
  ML_defaultDTable, dctx->fseEntropy,
  dctx->ddictIsCold, nbSeq,
  dctx->workspace, sizeof(dctx->workspace),
- dctx->bmi2);
+ ZSTD_DCtx_get_bmi2(dctx));
  RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
  ip += mlhSize;
  }
@@ -713,7 +789,7 @@ HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
  * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
  * The src buffer must be before the dst buffer.
  */
- static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
+ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
  ptrdiff_t const diff = op - ip;
  BYTE* const oend = op + length;

@@ -729,6 +805,7 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
  /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
  assert(length >= 8);
  ZSTD_overlapCopy8(&op, &ip, diff);
+ length -= 8;
  assert(op - ip >= 8);
  assert(op <= oend);
  }
@@ -743,12 +820,35 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
  assert(oend > oend_w);
  ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
  ip += oend_w - op;
- op = oend_w;
+ op += oend_w - op;
  }
  /* Handle the leftovers. */
  while (op < oend) *op++ = *ip++;
  }

+ /* ZSTD_safecopyDstBeforeSrc():
+ * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
+ * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
+ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) {
+ ptrdiff_t const diff = op - ip;
+ BYTE* const oend = op + length;
+
+ if (length < 8 || diff > -8) {
+ /* Handle short lengths, close overlaps, and dst not before src. */
+ while (op < oend) *op++ = *ip++;
+ return;
+ }
+
+ if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) {
+ ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap);
+ ip += oend - WILDCOPY_OVERLENGTH - op;
+ op += oend - WILDCOPY_OVERLENGTH - op;
+ }
+
+ /* Handle the leftovers. */
+ while (op < oend) *op++ = *ip++;
+ }
+
  /* ZSTD_execSequenceEnd():
  * This version handles cases that are near the end of the output buffer. It requires
  * more careful checks to make sure there is no overflow. By separating out these hard
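ZSTD_safecopyDstBeforeSrc(), added above, leans on the fact that a plain forward copy is always safe when dst starts before an overlapping src: each byte is read before the write cursor reaches it. A minimal illustration follows; it is toy code, not the library's implementation, which additionally takes a ZSTD_wildcopy fast path when the distance exceeds WILDCOPY_VECLEN.

#include <stdio.h>
#include <stddef.h>

/* Forward byte-by-byte copy: safe for overlapping regions iff dst < src. */
static void copy_dst_before_src(char* dst, const char* src, size_t n)
{
    size_t i;
    for (i = 0; i < n; i++)
        dst[i] = src[i];
}

int main(void)
{
    char buf[16] = "..abcdefgh";
    copy_dst_before_src(buf, buf + 2, 8);  /* overlapping, dst before src */
    buf[8] = '\0';
    printf("%s\n", buf);  /* prints "abcdefgh" */
    return 0;
}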
@@ -759,9 +859,9 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
  */
  FORCE_NOINLINE
  size_t ZSTD_execSequenceEnd(BYTE* op,
- BYTE* const oend, seq_t sequence,
- const BYTE** litPtr, const BYTE* const litLimit,
- const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+ BYTE* const oend, seq_t sequence,
+ const BYTE** litPtr, const BYTE* const litLimit,
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
  {
  BYTE* const oLitEnd = op + sequence.litLength;
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
@@ -784,27 +884,76 @@ size_t ZSTD_execSequenceEnd(BYTE* op,
  if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
  /* offset beyond prefix */
  RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
- match = dictEnd - (prefixStart-match);
+ match = dictEnd - (prefixStart - match);
  if (match + sequence.matchLength <= dictEnd) {
  ZSTD_memmove(oLitEnd, match, sequence.matchLength);
  return sequenceLength;
  }
  /* span extDict & currentPrefixSegment */
  { size_t const length1 = dictEnd - match;
- ZSTD_memmove(oLitEnd, match, length1);
- op = oLitEnd + length1;
- sequence.matchLength -= length1;
- match = prefixStart;
- } }
+ ZSTD_memmove(oLitEnd, match, length1);
+ op = oLitEnd + length1;
+ sequence.matchLength -= length1;
+ match = prefixStart;
+ }
+ }
+ ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
+ return sequenceLength;
+ }
+
+ /* ZSTD_execSequenceEndSplitLitBuffer():
+ * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case.
+ */
+ FORCE_NOINLINE
+ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
+ BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
+ const BYTE** litPtr, const BYTE* const litLimit,
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+ {
+ BYTE* const oLitEnd = op + sequence.litLength;
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+ const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+ const BYTE* match = oLitEnd - sequence.offset;
+
+
+ /* bounds checks : careful of address space overflow in 32-bit mode */
+ RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
+ RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
+ assert(op < op + sequenceLength);
+ assert(oLitEnd < op + sequenceLength);
+
+ /* copy literals */
+ RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer");
+ ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength);
+ op = oLitEnd;
+ *litPtr = iLitEnd;
+
+ /* copy Match */
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+ /* offset beyond prefix */
+ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
+ match = dictEnd - (prefixStart - match);
+ if (match + sequence.matchLength <= dictEnd) {
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
+ return sequenceLength;
+ }
+ /* span extDict & currentPrefixSegment */
+ { size_t const length1 = dictEnd - match;
+ ZSTD_memmove(oLitEnd, match, length1);
+ op = oLitEnd + length1;
+ sequence.matchLength -= length1;
+ match = prefixStart;
+ }
+ }
  ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
  return sequenceLength;
  }

  HINT_INLINE
  size_t ZSTD_execSequence(BYTE* op,
- BYTE* const oend, seq_t sequence,
- const BYTE** litPtr, const BYTE* const litLimit,
- const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+ BYTE* const oend, seq_t sequence,
+ const BYTE** litPtr, const BYTE* const litLimit,
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
  {
  BYTE* const oLitEnd = op + sequence.litLength;
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
@@ -813,6 +962,98 @@ size_t ZSTD_execSequence(BYTE* op,
  const BYTE* const iLitEnd = *litPtr + sequence.litLength;
  const BYTE* match = oLitEnd - sequence.offset;

+ assert(op != NULL /* Precondition */);
+ assert(oend_w < oend /* No underflow */);
+ /* Handle edge cases in a slow path:
+ * - Read beyond end of literals
+ * - Match end is within WILDCOPY_OVERLIMIT of oend
+ * - 32-bit mode and the match length overflows
+ */
+ if (UNLIKELY(
+ iLitEnd > litLimit ||
+ oMatchEnd > oend_w ||
+ (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
+ return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
+
+ /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
+ assert(op <= oLitEnd /* No overflow */);
+ assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
+ assert(oMatchEnd <= oend /* No underflow */);
+ assert(iLitEnd <= litLimit /* Literal length is in bounds */);
+ assert(oLitEnd <= oend_w /* Can wildcopy literals */);
+ assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
+
+ /* Copy Literals:
+ * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
+ * We likely don't need the full 32-byte wildcopy.
+ */
+ assert(WILDCOPY_OVERLENGTH >= 16);
+ ZSTD_copy16(op, (*litPtr));
+ if (UNLIKELY(sequence.litLength > 16)) {
+ ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);
+ }
+ op = oLitEnd;
+ *litPtr = iLitEnd; /* update for next sequence */
+
+ /* Copy Match */
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+ /* offset beyond prefix -> go into extDict */
+ RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
+ match = dictEnd + (match - prefixStart);
+ if (match + sequence.matchLength <= dictEnd) {
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
+ return sequenceLength;
+ }
+ /* span extDict & currentPrefixSegment */
+ { size_t const length1 = dictEnd - match;
+ ZSTD_memmove(oLitEnd, match, length1);
+ op = oLitEnd + length1;
+ sequence.matchLength -= length1;
+ match = prefixStart;
+ }
+ }
+ /* Match within prefix of 1 or more bytes */
+ assert(op <= oMatchEnd);
+ assert(oMatchEnd <= oend_w);
+ assert(match >= prefixStart);
+ assert(sequence.matchLength >= 1);
+
+ /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
+ * without overlap checking.
+ */
+ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
+ /* We bet on a full wildcopy for matches, since we expect matches to be
+ * longer than literals (in general). In silesia, ~10% of matches are longer
+ * than 16 bytes.
+ */
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
+ return sequenceLength;
+ }
+ assert(sequence.offset < WILDCOPY_VECLEN);
+
+ /* Copy 8 bytes and spread the offset to be >= 8. */
+ ZSTD_overlapCopy8(&op, &match, sequence.offset);
+
+ /* If the match length is > 8 bytes, then continue with the wildcopy. */
+ if (sequence.matchLength > 8) {
+ assert(op < oMatchEnd);
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);
+ }
+ return sequenceLength;
+ }
+
+ HINT_INLINE
+ size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,
+ BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
+ const BYTE** litPtr, const BYTE* const litLimit,
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+ {
+ BYTE* const oLitEnd = op + sequence.litLength;
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+ BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
+ const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+ const BYTE* match = oLitEnd - sequence.offset;
+
  assert(op != NULL /* Precondition */);
  assert(oend_w < oend /* No underflow */);
  /* Handle edge cases in a slow path:
@@ -824,7 +1065,7 @@ size_t ZSTD_execSequence(BYTE* op,
  iLitEnd > litLimit ||
  oMatchEnd > oend_w ||
  (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
- return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
+ return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);

  /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
  assert(op <= oLitEnd /* No overflow */);
@@ -892,6 +1133,7 @@ size_t ZSTD_execSequence(BYTE* op,
  return sequenceLength;
  }

+
  static void
  ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
  {
@@ -905,20 +1147,10 @@ ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqS
  }

  FORCE_INLINE_TEMPLATE void
- ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
- {
- ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state];
- U32 const nbBits = DInfo.nbBits;
- size_t const lowBits = BIT_readBits(bitD, nbBits);
- DStatePtr->state = DInfo.nextState + lowBits;
- }
-
- FORCE_INLINE_TEMPLATE void
- ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo)
+ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits)
  {
- U32 const nbBits = DInfo.nbBits;
  size_t const lowBits = BIT_readBits(bitD, nbBits);
- DStatePtr->state = DInfo.nextState + lowBits;
+ DStatePtr->state = nextState + lowBits;
  }

  /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
@@ -937,102 +1169,100 @@ FORCE_INLINE_TEMPLATE seq_t
  ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
  {
  seq_t seq;
- ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state];
- ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state];
- ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state];
- U32 const llBase = llDInfo.baseValue;
- U32 const mlBase = mlDInfo.baseValue;
- U32 const ofBase = ofDInfo.baseValue;
- BYTE const llBits = llDInfo.nbAdditionalBits;
- BYTE const mlBits = mlDInfo.nbAdditionalBits;
- BYTE const ofBits = ofDInfo.nbAdditionalBits;
- BYTE const totalBits = llBits+mlBits+ofBits;
-
- /* sequence */
- { size_t offset;
- if (ofBits > 1) {
- ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
- ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
- assert(ofBits <= MaxOff);
- if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
- offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
- BIT_reloadDStream(&seqState->DStream);
- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
- } else {
- offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
- }
- seqState->prevOffset[2] = seqState->prevOffset[1];
- seqState->prevOffset[1] = seqState->prevOffset[0];
- seqState->prevOffset[0] = offset;
- } else {
- U32 const ll0 = (llBase == 0);
- if (LIKELY((ofBits == 0))) {
- if (LIKELY(!ll0))
- offset = seqState->prevOffset[0];
- else {
- offset = seqState->prevOffset[1];
- seqState->prevOffset[1] = seqState->prevOffset[0];
- seqState->prevOffset[0] = offset;
+ const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
+ const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
+ const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
+ seq.matchLength = mlDInfo->baseValue;
+ seq.litLength = llDInfo->baseValue;
+ { U32 const ofBase = ofDInfo->baseValue;
+ BYTE const llBits = llDInfo->nbAdditionalBits;
+ BYTE const mlBits = mlDInfo->nbAdditionalBits;
+ BYTE const ofBits = ofDInfo->nbAdditionalBits;
+ BYTE const totalBits = llBits+mlBits+ofBits;
+
+ U16 const llNext = llDInfo->nextState;
+ U16 const mlNext = mlDInfo->nextState;
+ U16 const ofNext = ofDInfo->nextState;
+ U32 const llnbBits = llDInfo->nbBits;
+ U32 const mlnbBits = mlDInfo->nbBits;
+ U32 const ofnbBits = ofDInfo->nbBits;
+ /*
+ * As gcc has better branch and block analyzers, sometimes it is only
+ * valuable to mark likelyness for clang, it gives around 3-4% of
+ * performance.
+ */
+
+ /* sequence */
+ { size_t offset;
+ #if defined(__clang__)
+ if (LIKELY(ofBits > 1)) {
+ #else
+ if (ofBits > 1) {
+ #endif
+ ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
+ ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
+ assert(ofBits <= MaxOff);
+ if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
+ U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
+ offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
+ BIT_reloadDStream(&seqState->DStream);
+ if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+ assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
+ } else {
+ offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
  }
+ seqState->prevOffset[2] = seqState->prevOffset[1];
+ seqState->prevOffset[1] = seqState->prevOffset[0];
+ seqState->prevOffset[0] = offset;
  } else {
- offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
- { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
- if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
- seqState->prevOffset[1] = seqState->prevOffset[0];
- seqState->prevOffset[0] = offset = temp;
- } } }
- seq.offset = offset;
- }
-
- seq.matchLength = mlBase;
- if (mlBits > 0)
- seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
-
- if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
- BIT_reloadDStream(&seqState->DStream);
- if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
- BIT_reloadDStream(&seqState->DStream);
- /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
- ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
-
- seq.litLength = llBase;
- if (llBits > 0)
- seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
-
- if (MEM_32bits())
- BIT_reloadDStream(&seqState->DStream);
-
- DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
-
- /* ANS state update
- * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo().
- * clang-9.2.0 does 7% worse with ZSTD_updateFseState().
- * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the
- * better option, so it is the default for other compilers. But, if you
- * measure that it is worse, please put up a pull request.
- */
- {
- #if defined(__GNUC__) && !defined(__clang__)
- const int kUseUpdateFseState = 1;
- #else
- const int kUseUpdateFseState = 0;
- #endif
- if (kUseUpdateFseState) {
- ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
- ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
- ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
- } else {
- ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */
- ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
- ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */
+ U32 const ll0 = (llDInfo->baseValue == 0);
+ if (LIKELY((ofBits == 0))) {
+ offset = seqState->prevOffset[ll0];
+ seqState->prevOffset[1] = seqState->prevOffset[!ll0];
+ seqState->prevOffset[0] = offset;
+ } else {
+ offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
+ { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+ temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
+ if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+ seqState->prevOffset[1] = seqState->prevOffset[0];
+ seqState->prevOffset[0] = offset = temp;
+ } } }
+ seq.offset = offset;
  }
+
+ #if defined(__clang__)
+ if (UNLIKELY(mlBits > 0))
+ #else
+ if (mlBits > 0)
+ #endif
+ seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
+
+ if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
+ BIT_reloadDStream(&seqState->DStream);
+ if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
+ BIT_reloadDStream(&seqState->DStream);
+ /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
+ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
+
+ #if defined(__clang__)
+ if (UNLIKELY(llBits > 0))
+ #else
+ if (llBits > 0)
+ #endif
+ seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
+
+ if (MEM_32bits())
+ BIT_reloadDStream(&seqState->DStream);
+
+ DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+
+ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */
+ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
+ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */
  }

  return seq;
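One subtle piece of the rewritten ZSTD_decodeSequence() is the branchless repeat-offset update for the ofBits == 0 case: indexing prevOffset[] with ll0 (set when the literal length is zero) replaces the old if/else swap shown on the removed lines. A small self-contained sketch of that trick, with invented names and toy values:

#include <assert.h>
#include <stddef.h>

/* ll0 == 1 selects the second repeat offset and promotes it to the
 * front, swapping the first two entries; ll0 == 0 leaves order intact.
 * Same data movement as the new prevOffset[ll0] lines above. */
static size_t pick_repeat_offset(size_t prevOffset[3], unsigned ll0)
{
    size_t const offset = prevOffset[ll0];
    prevOffset[1] = prevOffset[!ll0];
    prevOffset[0] = offset;
    return offset;
}

int main(void)
{
    size_t rep[3] = { 11, 22, 33 };
    assert(pick_repeat_offset(rep, 0) == 11);  /* no swap */
    assert(rep[0] == 11 && rep[1] == 22);
    assert(pick_repeat_offset(rep, 1) == 22);  /* first two swapped */
    assert(rep[0] == 22 && rep[1] == 11);
    return 0;
}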
@@ -1085,9 +1315,11 @@ MEM_STATIC void ZSTD_assertValidSequence(
1085
1315
  #endif
1086
1316
 
1087
1317
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1318
+
1319
+
1088
1320
  FORCE_INLINE_TEMPLATE size_t
1089
1321
  DONT_VECTORIZE
1090
- ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1322
+ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
1091
1323
  void* dst, size_t maxDstSize,
1092
1324
  const void* seqStart, size_t seqSize, int nbSeq,
1093
1325
  const ZSTD_longOffset_e isLongOffset,
@@ -1099,11 +1331,11 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1099
1331
  BYTE* const oend = ostart + maxDstSize;
1100
1332
  BYTE* op = ostart;
1101
1333
  const BYTE* litPtr = dctx->litPtr;
1102
- const BYTE* const litEnd = litPtr + dctx->litSize;
1334
+ const BYTE* litBufferEnd = dctx->litBufferEnd;
1103
1335
  const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
1104
1336
  const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
1105
1337
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
1106
- DEBUGLOG(5, "ZSTD_decompressSequences_body");
1338
+ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer");
1107
1339
  (void)frame;
1108
1340
 
1109
1341
  /* Regen sequences */
@@ -1124,55 +1356,237 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
1124
1356
  BIT_DStream_endOfBuffer < BIT_DStream_completed &&
1125
1357
  BIT_DStream_completed < BIT_DStream_overflow);
1126
1358
 
1359
+ /* decompress without overrunning litPtr begins */
1360
+ {
1361
+ seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1362
+ /* Align the decompression loop to 32 + 16 bytes.
1363
+ *
1364
+ * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
1365
+ * speed swings based on the alignment of the decompression loop. This
1366
+ * performance swing is caused by parts of the decompression loop falling
1367
+ * out of the DSB. The entire decompression loop should fit in the DSB,
1368
+ * when it can't we get much worse performance. You can measure if you've
1369
+ * hit the good case or the bad case with this perf command for some
1370
+ * compressed file test.zst:
1371
+ *
1372
+ * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
1373
+ * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
1374
+ *
1375
+ * If you see most cycles served out of the MITE you've hit the bad case.
1376
+ * If you see most cycles served out of the DSB you've hit the good case.
1377
+ * If it is pretty even then you may be in an okay case.
1378
+ *
1379
+ * This issue has been reproduced on the following CPUs:
1380
+ * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
1381
+ * Use Instruments->Counters to get DSB/MITE cycles.
1382
+ * I never got performance swings, but I was able to
1383
+ * go from the good case of mostly DSB to half of the
1384
+ * cycles served from MITE.
1385
+ * - Coffeelake: Intel i9-9900k
1386
+ * - Coffeelake: Intel i7-9700k
1387
+ *
1388
+ * I haven't been able to reproduce the instability or DSB misses on any
1389
+ * of the following CPUS:
1390
+ * - Haswell
1391
+ * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH
1392
+ * - Skylake
1393
+ *
1394
+ * Alignment is done for each of the three major decompression loops:
1395
+ * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer
1396
+ * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer
1397
+ * - ZSTD_decompressSequences_body
1398
+ * Alignment choices are made to minimize large swings on bad cases and influence on performance
1399
+ * from changes external to this code, rather than to overoptimize on the current commit.
1400
+ *
1401
+ * If you are seeing performance stability this script can help test.
1402
+ * It tests on 4 commits in zstd where I saw performance change.
1403
+ *
1404
+ * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
1405
+ */
1127
1406
  #if defined(__GNUC__) && defined(__x86_64__)
1128
- /* Align the decompression loop to 32 + 16 bytes.
1129
- *
1130
- * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
1131
- * speed swings based on the alignment of the decompression loop. This
1132
- * performance swing is caused by parts of the decompression loop falling
1133
- * out of the DSB. The entire decompression loop should fit in the DSB,
1134
- * when it can't we get much worse performance. You can measure if you've
1135
- * hit the good case or the bad case with this perf command for some
1136
- * compressed file test.zst:
1137
- *
1138
- * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
1139
- * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
1140
- *
1141
- * If you see most cycles served out of the MITE you've hit the bad case.
1142
- * If you see most cycles served out of the DSB you've hit the good case.
1143
- * If it is pretty even then you may be in an okay case.
1144
- *
1145
- * This issue has been reproduced on the following CPUs:
1146
- * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
1147
- * Use Instruments->Counters to get DSB/MITE cycles.
1148
- * I never got performance swings, but I was able to
1149
- * go from the good case of mostly DSB to half of the
1150
- * cycles served from MITE.
1151
- * - Coffeelake: Intel i9-9900k
1152
- * - Coffeelake: Intel i7-9700k
1153
- *
1154
- * I haven't been able to reproduce the instability or DSB misses on any
1155
- * of the following CPUS:
1156
- * - Haswell
1157
- * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH
1158
- * - Skylake
1159
- *
1160
- * If you are seeing performance stability this script can help test.
1161
- * It tests on 4 commits in zstd where I saw performance change.
1162
- *
1163
- * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
1164
- */
1165
- __asm__(".p2align 6");
1166
- __asm__("nop");
1167
- __asm__(".p2align 5");
1168
- __asm__("nop");
1169
- # if __GNUC__ >= 9
1170
- /* better for gcc-9 and gcc-10, worse for clang and gcc-8 */
1171
- __asm__(".p2align 3");
1407
+ __asm__(".p2align 6");
1408
+ # if __GNUC__ >= 7
1409
+ /* good for gcc-7, gcc-9, and gcc-11 */
1410
+ __asm__("nop");
1411
+ __asm__(".p2align 5");
1412
+ __asm__("nop");
1413
+ __asm__(".p2align 4");
1414
+ # if __GNUC__ == 8 || __GNUC__ == 10
1415
+ /* good for gcc-8 and gcc-10 */
1416
+ __asm__("nop");
1417
+ __asm__(".p2align 3");
1418
+ # endif
1419
+ # endif
1420
+ #endif
1421
+
1422
+ /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */
+ for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) {
+ size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+ return oneSeqSize;
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+ op += oneSeqSize;
+ if (UNLIKELY(!--nbSeq))
+ break;
+ BIT_reloadDStream(&(seqState.DStream));
+ sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ }
+
+ /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litBufferEnd */
+ if (nbSeq > 0) {
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+ if (leftoverLit)
+ {
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+ sequence.litLength -= leftoverLit;
+ op += leftoverLit;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ {
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+ return oneSeqSize;
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+ op += oneSeqSize;
+ if (--nbSeq)
+ BIT_reloadDStream(&(seqState.DStream));
+ }
+ }
+ }
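This hand-off is the heart of the split-literal-buffer scheme: once the literals staged inside dst are consumed, whatever is left is flushed in front of the write cursor and the cursor pair is retargeted at the side buffer. The same bookkeeping in isolation, as a sketch with hypothetical names (zstd's ZSTD_safecopyDstBeforeSrc is stood in for by plain memmove):

    #include <stddef.h>
    #include <string.h>

    typedef struct {
        const unsigned char* litPtr;       /* next literal byte to emit */
        const unsigned char* litBufferEnd; /* end of the active literal region */
        const unsigned char* extra;        /* side buffer outside dst */
        size_t extraSize;
    } LitCursor;

    static void switch_to_extra(LitCursor* c, unsigned char** op)
    {
        size_t const leftover = (size_t)(c->litBufferEnd - c->litPtr);
        if (leftover) {
            memmove(*op, c->litPtr, leftover); /* flush literals still staged in dst */
            *op += leftover;
        }
        c->litPtr = c->extra;                 /* literals now come from the side buffer */
        c->litBufferEnd = c->extra + c->extraSize;
    }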
+
+ if (nbSeq > 0) /* there is remaining lit from extra buffer */
+ {
+
+ #if defined(__GNUC__) && defined(__x86_64__)
+ __asm__(".p2align 6");
+ __asm__("nop");
+ # if __GNUC__ != 7
+ /* worse for gcc-7, better for gcc-8, gcc-9, gcc-10, and clang */
+ __asm__(".p2align 4");
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # elif __GNUC__ >= 11
+ __asm__(".p2align 3");
+ # else
+ __asm__(".p2align 5");
+ __asm__("nop");
+ __asm__(".p2align 3");
+ # endif
+ #endif
+
+ for (; ; ) {
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+ if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+ return oneSeqSize;
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+ op += oneSeqSize;
+ if (UNLIKELY(!--nbSeq))
+ break;
+ BIT_reloadDStream(&(seqState.DStream));
+ }
+ }
+
+ /* check if reached exact end */
+ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);
+ RETURN_ERROR_IF(nbSeq, corruption_detected, "");
+ RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
+ /* save reps for next block */
+ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+ }
+
+ /* last literal segment */
+ if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
+ {
+ size_t const lastLLSize = litBufferEnd - litPtr;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memmove(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ }
+ { size_t const lastLLSize = litBufferEnd - litPtr;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memcpy(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+ }
+
+ return op-ostart;
+ }
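Note the asymmetry in the two final copies above: the segment still sitting in dst is moved with ZSTD_memmove, because source and destination can overlap when literals were decoded into the tail of dst, while the copy out of litExtraBuffer may use ZSTD_memcpy since those buffers are distinct. A two-line reminder of why the distinction matters:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char buf[16] = "..literals";
        /* Overlapping ranges: defined behavior for memmove, undefined for memcpy. */
        memmove(buf, buf + 2, strlen(buf + 2) + 1);
        printf("%s\n", buf); /* prints "literals" */
        return 0;
    }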
+
+ FORCE_INLINE_TEMPLATE size_t
+ DONT_VECTORIZE
+ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+ {
+ const BYTE* ip = (const BYTE*)seqStart;
+ const BYTE* const iend = ip + seqSize;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer;
+ BYTE* op = ostart;
+ const BYTE* litPtr = dctx->litPtr;
+ const BYTE* const litEnd = litPtr + dctx->litSize;
+ const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
+ const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
+ const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
+ DEBUGLOG(5, "ZSTD_decompressSequences_body");
+ (void)frame;
+
+ /* Regen sequences */
+ if (nbSeq) {
+ seqState_t seqState;
+ dctx->fseEntropy = 1;
+ { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+ RETURN_ERROR_IF(
+ ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)),
+ corruption_detected, "");
+ ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+ ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+ ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+ assert(dst != NULL);
+
+ ZSTD_STATIC_ASSERT(
+ BIT_DStream_unfinished < BIT_DStream_completed &&
+ BIT_DStream_endOfBuffer < BIT_DStream_completed &&
+ BIT_DStream_completed < BIT_DStream_overflow);
+
+ #if defined(__GNUC__) && defined(__x86_64__)
+ __asm__(".p2align 6");
+ __asm__("nop");
+ # if __GNUC__ >= 7
+ __asm__(".p2align 5");
+ __asm__("nop");
+ __asm__(".p2align 3");
  # else
- __asm__(".p2align 4");
+ __asm__(".p2align 4");
+ __asm__("nop");
+ __asm__(".p2align 3");
  # endif
  #endif
+
  for ( ; ; ) {
  seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
  size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
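Worth flagging before the next hunk: in both function bodies above, oend is clamped to dctx->litBuffer whenever the decoder placed literals at the tail of dst, so regenerated sequences can never overwrite literals that have not yet been copied out. Reduced to its essentials (a sketch with illustrative names, not zstd's):

    #include <stddef.h>

    /* When literals are staged in the tail of dst, sequence output must
     * stop where the staging area begins, not at dst + maxDstSize. */
    static unsigned char* writable_end(unsigned char* ostart, size_t maxDstSize,
                                       int litInDst, unsigned char* litBuffer)
    {
        return litInDst ? litBuffer : ostart + maxDstSize;
    }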
@@ -1218,6 +1632,16 @@ ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
  {
  return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
+
+ static size_t
+ ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+ {
+ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
@@ -1250,10 +1674,10 @@ ZSTD_decompressSequencesLong_body(
  const BYTE* ip = (const BYTE*)seqStart;
  const BYTE* const iend = ip + seqSize;
  BYTE* const ostart = (BYTE*)dst;
- BYTE* const oend = ostart + maxDstSize;
+ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize;
  BYTE* op = ostart;
  const BYTE* litPtr = dctx->litPtr;
- const BYTE* const litEnd = litPtr + dctx->litSize;
+ const BYTE* litBufferEnd = dctx->litBufferEnd;
  const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
  const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
  const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
@@ -1289,32 +1713,94 @@ ZSTD_decompressSequencesLong_body(
  }
  RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");

- /* decode and decompress */
- for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+ /* decompress without stomping litBuffer */
+ for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) {
+ seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+ size_t oneSeqSize;
+
+ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd)
+ {
+ /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+ if (leftoverLit)
+ {
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+ sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit;
+ op += leftoverLit;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
- assert(!ZSTD_isError(oneSeqSize));
- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
  #endif
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;

- prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
- sequences[seqNb & STORED_SEQS_MASK] = sequence;
- op += oneSeqSize;
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+ sequences[seqNb & STORED_SEQS_MASK] = sequence;
+ op += oneSeqSize;
+ }
+ else
+ {
+ /* lit buffer is either wholly contained in first or second split, or not split at all */
+ oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
+ ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+
+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+ sequences[seqNb & STORED_SEQS_MASK] = sequence;
+ op += oneSeqSize;
+ }
  }
  RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");

  /* finish queue */
  seqNb -= seqAdvance;
  for ( ; seqNb<nbSeq ; seqNb++) {
- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+ seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]);
+ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd)
+ {
+ const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+ if (leftoverLit)
+ {
+ RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+ ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+ sequence->litLength -= leftoverLit;
+ op += leftoverLit;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ {
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
- assert(!ZSTD_isError(oneSeqSize));
- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
  #endif
- if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
- op += oneSeqSize;
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+ op += oneSeqSize;
+ }
+ }
+ else
+ {
+ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
+ ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+ op += oneSeqSize;
+ }
  }

  /* save reps for next block */
@@ -1322,10 +1808,21 @@ ZSTD_decompressSequencesLong_body(
  }

  /* last literal segment */
- { size_t const lastLLSize = litEnd - litPtr;
+ if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */
+ {
+ size_t const lastLLSize = litBufferEnd - litPtr;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ ZSTD_memmove(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+ litPtr = dctx->litExtraBuffer;
+ litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+ }
+ { size_t const lastLLSize = litBufferEnd - litPtr;
  RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
  if (op != NULL) {
- ZSTD_memcpy(op, litPtr, lastLLSize);
+ ZSTD_memmove(op, litPtr, lastLLSize);
  op += lastLLSize;
  }
  }
@@ -1349,7 +1846,7 @@ ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
  #if DYNAMIC_BMI2

  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
- static TARGET_ATTRIBUTE("bmi2") size_t
+ static BMI2_TARGET_ATTRIBUTE size_t
  DONT_VECTORIZE
  ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
@@ -1359,10 +1856,20 @@ ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
  {
  return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
+ static BMI2_TARGET_ATTRIBUTE size_t
+ DONT_VECTORIZE
+ ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+ {
+ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
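As before, DYNAMIC_BMI2 builds compile the shared FORCE_INLINE body into two outlined variants and pick one at run time; this release renames the attribute macro to BMI2_TARGET_ATTRIBUTE (defined in the updated compiler.h, where on GCC/Clang it expands to a target attribute along the lines sketched below) and adds the SplitLitBuffer pair. A compressed sketch of the idiom, with illustrative names rather than zstd's:

    /* The shared body is written once and instantiated twice. */
    static unsigned body(unsigned x) { return x / 3u; }

    static unsigned variant_default(unsigned x) { return body(x); }

    #if defined(__GNUC__) && defined(__x86_64__)
    __attribute__((target("bmi2")))   /* this copy may use BMI2 instructions */
    #endif
    static unsigned variant_bmi2(unsigned x) { return body(x); }

    unsigned dispatch(unsigned x, int cpu_has_bmi2)
    {
        /* cpu_has_bmi2 would come from a CPUID probe cached in the context. */
        return cpu_has_bmi2 ? variant_bmi2(x) : variant_default(x);
    }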
 
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
- static TARGET_ATTRIBUTE("bmi2") size_t
+ static BMI2_TARGET_ATTRIBUTE size_t
  ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
  void* dst, size_t maxDstSize,
  const void* seqStart, size_t seqSize, int nbSeq,
@@ -1391,11 +1898,25 @@ ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
  {
  DEBUGLOG(5, "ZSTD_decompressSequences");
  #if DYNAMIC_BMI2
- if (dctx->bmi2) {
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
  return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif
- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ }
+ static size_t
+ ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+ {
+ DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
+ #if DYNAMIC_BMI2
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
+ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ }
+ #endif
+ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

@@ -1415,7 +1936,7 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
  {
  DEBUGLOG(5, "ZSTD_decompressSequencesLong");
  #if DYNAMIC_BMI2
- if (dctx->bmi2) {
+ if (ZSTD_DCtx_get_bmi2(dctx)) {
  return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
  }
  #endif
@@ -1456,7 +1977,7 @@ ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
  size_t
  ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
  void* dst, size_t dstCapacity,
- const void* src, size_t srcSize, const int frame)
+ const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
  { /* blockType == blockCompressed */
  const BYTE* ip = (const BYTE*)src;
  /* isLongOffset must be true if there are long offsets.
@@ -1471,7 +1992,7 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
  RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");

  /* Decode literals section */
- { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
+ { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
  DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
  if (ZSTD_isError(litCSize)) return litCSize;
  ip += litCSize;
@@ -1519,7 +2040,10 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,

  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
  /* else */
- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+ if (dctx->litBufferLocation == ZSTD_split)
+ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+ else
+ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
  #endif
  }
  }
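This hunk is where the new path gets selected: blocks whose literals were staged across dst and litExtraBuffer take the SplitLitBuffer loop, all others keep the established loop. A sketch of the three-way location state and the dispatch it drives, using illustrative names alongside the ZSTD_not_in_dst / ZSTD_in_dst / ZSTD_split values visible in this diff:

    #include <stddef.h>

    typedef enum {
        lit_not_in_dst, /* literals sit in their own buffer */
        lit_in_dst,     /* literals were decoded into the tail of dst */
        lit_split       /* literals span dst and the extra buffer */
    } lit_location_e;

    typedef size_t (*decode_fn)(void* dctx);

    static size_t decode_classic(void* dctx) { (void)dctx; return 0; }
    static size_t decode_split(void* dctx)   { (void)dctx; return 0; }

    /* Only the split state needs the split-aware loop; in-dst placement is
     * already handled by clamping oend inside the classic loop. */
    static decode_fn pick_loop(lit_location_e loc)
    {
        return (loc == lit_split) ? decode_split : decode_classic;
    }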
@@ -1542,7 +2066,7 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
  {
  size_t dSize;
  ZSTD_checkContinuity(dctx, dst, dstCapacity);
- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0);
+ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming);
  dctx->previousDstEnd = (char*)dst + dSize;
  return dSize;
  }
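Block-level entry points now pass not_streaming so that ZSTD_decodeLiteralsBlock, which as of this version also receives dst and dstCapacity, can decide whether staging literals inside dst is safe. Reconstructed from the call sites in this diff (the actual typedef lives in zstd's private decompress headers, which this diff does not show), the flag is plausibly just:

    /* Sketch, inferred from usage above: a two-state flag telling the
     * literals decoder whether the caller is a streaming decompressor or
     * a one-shot block call, which affects where the literal buffer may
     * be placed. */
    typedef enum {
        not_streaming = 0,
        is_streaming = 1
    } streaming_operation;

A one-shot block call owns the whole output window for its duration, which is presumably what makes the in-dst literal staging introduced in this version viable; streaming callers give the decoder less freedom in buffer placement.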