zstdlib 0.7.0-x86-mingw32 → 0.10.0-x86-mingw32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. checksums.yaml +4 -4
  2. data/CHANGES.md +20 -0
  3. data/README.md +7 -1
  4. data/Rakefile +38 -8
  5. data/ext/{zstdlib → zstdlib_c}/extconf.rb +11 -6
  6. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.2/zstdlib.c +2 -2
  7. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.3/zstdlib.c +2 -2
  8. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.4/zstdlib.c +2 -2
  9. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.5/zstdlib.c +2 -2
  10. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.6/zstdlib.c +2 -2
  11. data/ext/{zstdlib → zstdlib_c}/ruby/zlib-2.7/zstdlib.c +2 -2
  12. data/ext/zstdlib_c/ruby/zlib-3.0/zstdlib.c +4994 -0
  13. data/ext/zstdlib_c/ruby/zlib-3.1/zstdlib.c +5076 -0
  14. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/adler32.c +0 -0
  15. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/compress.c +0 -0
  16. data/ext/zstdlib_c/zlib-1.2.12/crc32.c +1116 -0
  17. data/ext/zstdlib_c/zlib-1.2.12/crc32.h +9446 -0
  18. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/deflate.c +78 -30
  19. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/deflate.h +12 -15
  20. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/gzclose.c +0 -0
  21. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/gzguts.h +3 -2
  22. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/gzlib.c +5 -3
  23. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/gzread.c +5 -7
  24. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/gzwrite.c +25 -13
  25. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/infback.c +2 -1
  26. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/inffast.c +14 -14
  27. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/inffast.h +0 -0
  28. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/inffixed.h +0 -0
  29. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/inflate.c +39 -8
  30. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/inflate.h +3 -2
  31. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/inftrees.c +3 -3
  32. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/inftrees.h +0 -0
  33. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/trees.c +27 -48
  34. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/trees.h +0 -0
  35. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/uncompr.c +0 -0
  36. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/zconf.h +0 -0
  37. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/zlib.h +123 -100
  38. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/zutil.c +2 -2
  39. data/ext/{zstdlib/zlib-1.2.11 → zstdlib_c/zlib-1.2.12}/zutil.h +12 -9
  40. data/ext/{zstdlib → zstdlib_c}/zlib.mk +0 -0
  41. data/ext/{zstdlib → zstdlib_c}/zlibwrapper/zlibwrapper.c +1 -5
  42. data/ext/{zstdlib → zstdlib_c}/zlibwrapper.mk +0 -0
  43. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/bitstream.h +46 -22
  44. data/ext/zstdlib_c/zstd-1.5.2/lib/common/compiler.h +335 -0
  45. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/cpu.h +1 -3
  46. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/debug.c +1 -1
  47. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/debug.h +12 -19
  48. data/ext/zstdlib_c/zstd-1.5.2/lib/common/entropy_common.c +368 -0
  49. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/error_private.c +2 -1
  50. data/ext/zstdlib_c/zstd-1.5.2/lib/common/error_private.h +159 -0
  51. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/fse.h +41 -12
  52. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/fse_decompress.c +139 -22
  53. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/huf.h +47 -23
  54. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/mem.h +87 -98
  55. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/pool.c +34 -23
  56. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/pool.h +4 -4
  57. data/ext/zstdlib_c/zstd-1.5.2/lib/common/portability_macros.h +137 -0
  58. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/threading.c +6 -5
  59. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/threading.h +0 -0
  60. data/ext/zstdlib_c/zstd-1.5.2/lib/common/xxhash.c +24 -0
  61. data/ext/zstdlib_c/zstd-1.5.2/lib/common/xxhash.h +5686 -0
  62. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/zstd_common.c +10 -10
  63. data/ext/zstdlib_c/zstd-1.5.2/lib/common/zstd_deps.h +111 -0
  64. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/common/zstd_internal.h +191 -145
  65. data/ext/zstdlib_c/zstd-1.5.2/lib/common/zstd_trace.h +163 -0
  66. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/clevels.h +134 -0
  67. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/fse_compress.c +89 -46
  68. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/hist.c +27 -29
  69. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/hist.h +2 -2
  70. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/huf_compress.c +1370 -0
  71. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress.c +2917 -868
  72. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_internal.h +458 -125
  73. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_literals.c +12 -11
  74. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_literals.h +4 -2
  75. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_sequences.c +41 -18
  76. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_sequences.h +1 -1
  77. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_superblock.c +26 -298
  78. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_compress_superblock.h +1 -1
  79. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_cwksp.h +234 -83
  80. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_double_fast.c +313 -138
  81. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_double_fast.h +1 -1
  82. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_fast.c +329 -150
  83. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_fast.h +1 -1
  84. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_lazy.c +2104 -0
  85. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_lazy.h +125 -0
  86. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_ldm.c +321 -216
  87. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_ldm.h +9 -2
  88. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_ldm_geartab.h +106 -0
  89. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_opt.c +412 -166
  90. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstd_opt.h +1 -1
  91. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/compress/zstdmt_compress.c +169 -453
  92. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstdmt_compress.h +113 -0
  93. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/decompress/huf_decompress.c +1044 -403
  94. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/huf_decompress_amd64.S +585 -0
  95. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_ddict.c +9 -9
  96. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_ddict.h +2 -2
  97. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress.c +450 -105
  98. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress_block.c +913 -273
  99. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress_block.h +14 -5
  100. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/decompress/zstd_decompress_internal.h +59 -12
  101. data/ext/zstdlib_c/zstd-1.5.2/lib/zdict.h +452 -0
  102. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/lib/zstd.h +699 -214
  103. data/ext/{zstdlib/zstd-1.4.5/lib/common → zstdlib_c/zstd-1.5.2/lib}/zstd_errors.h +2 -1
  104. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzclose.c +0 -0
  105. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzcompatibility.h +1 -1
  106. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzguts.h +0 -0
  107. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzlib.c +0 -0
  108. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzread.c +0 -0
  109. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/zlibWrapper/gzwrite.c +0 -0
  110. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/zlibWrapper/zstd_zlibwrapper.c +133 -44
  111. data/ext/{zstdlib/zstd-1.4.5 → zstdlib_c/zstd-1.5.2}/zlibWrapper/zstd_zlibwrapper.h +1 -1
  112. data/ext/zstdlib_c/zstd.mk +15 -0
  113. data/lib/2.4/zstdlib_c.so +0 -0
  114. data/lib/2.5/zstdlib_c.so +0 -0
  115. data/lib/2.6/zstdlib_c.so +0 -0
  116. data/lib/2.7/zstdlib_c.so +0 -0
  117. data/lib/3.0/zstdlib_c.so +0 -0
  118. data/lib/3.1/zstdlib_c.so +0 -0
  119. data/lib/zstdlib.rb +2 -2
  120. metadata +125 -116
  121. data/ext/zstdlib/zlib-1.2.11/crc32.c +0 -442
  122. data/ext/zstdlib/zlib-1.2.11/crc32.h +0 -441
  123. data/ext/zstdlib/zstd-1.4.5/lib/common/compiler.h +0 -175
  124. data/ext/zstdlib/zstd-1.4.5/lib/common/entropy_common.c +0 -216
  125. data/ext/zstdlib/zstd-1.4.5/lib/common/error_private.h +0 -80
  126. data/ext/zstdlib/zstd-1.4.5/lib/common/xxhash.c +0 -864
  127. data/ext/zstdlib/zstd-1.4.5/lib/common/xxhash.h +0 -285
  128. data/ext/zstdlib/zstd-1.4.5/lib/compress/huf_compress.c +0 -798
  129. data/ext/zstdlib/zstd-1.4.5/lib/compress/zstd_lazy.c +0 -1138
  130. data/ext/zstdlib/zstd-1.4.5/lib/compress/zstd_lazy.h +0 -67
  131. data/ext/zstdlib/zstd-1.4.5/lib/compress/zstdmt_compress.h +0 -192
  132. data/ext/zstdlib/zstd.mk +0 -14
  133. data/lib/2.2/zstdlib.so +0 -0
  134. data/lib/2.3/zstdlib.so +0 -0
  135. data/lib/2.4/zstdlib.so +0 -0
  136. data/lib/2.5/zstdlib.so +0 -0
  137. data/lib/2.6/zstdlib.so +0 -0
  138. data/lib/2.7/zstdlib.so +0 -0
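
The source diff below is for data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/zstd_decompress_block.c (entry 98 above). Between the bundled zstd 1.4.5 and 1.5.2, this file gained the split literals buffer, a workspace-based ZSTD_buildFSETable() with optional BMI2 dispatch, and a rewritten ZSTD_decodeSequence().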
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,7 +14,7 @@
  /*-*******************************************************
  * Dependencies
  *********************************************************/
- #include <string.h> /* memcpy, memmove, memset */
+ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
  #include "../common/compiler.h" /* prefetch */
  #include "../common/cpu.h" /* bmi2 */
  #include "../common/mem.h" /* low level memory routines */
@@ -44,7 +44,7 @@
  /*_*******************************************************
  * Memory operations
  **********************************************************/
- static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
+ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }


  /*-*************************************************************
@@ -69,15 +69,56 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
  }
  }

+ /* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */
+ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
+ const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
+ {
+ if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH)
+ {
+ /* room for litbuffer to fit without read faulting */
+ dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH;
+ dctx->litBufferEnd = dctx->litBuffer + litSize;
+ dctx->litBufferLocation = ZSTD_in_dst;
+ }
+ else if (litSize > ZSTD_LITBUFFEREXTRASIZE)
+ {
+ /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
+ if (splitImmediately) {
+ /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
+ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
+ dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
+ }
+ else {
+ /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */
+ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
+ dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
+ }
+ dctx->litBufferLocation = ZSTD_split;
+ }
+ else
+ {
+ /* fits entirely within litExtraBuffer, so no split is necessary */
+ dctx->litBuffer = dctx->litExtraBuffer;
+ dctx->litBufferEnd = dctx->litBuffer + litSize;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
+ }
+ }

  /* Hidden declaration for fullbench */
  size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
- const void* src, size_t srcSize);
+ const void* src, size_t srcSize,
+ void* dst, size_t dstCapacity, const streaming_operation streaming);
  /*! ZSTD_decodeLiteralsBlock() :
+ * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
+ * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current
+ * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being
+ * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write.
+ *
  * @return : nb of bytes read from src (< srcSize )
  * note : symbol not declared but exposed for fullbench */
  size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
- const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */
+ const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */
+ void* dst, size_t dstCapacity, const streaming_operation streaming)
  {
  DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
  RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");
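
The ZSTD_allocateLiteralsBuffer() hunk above boils down to a three-way placement decision for the literals buffer. A minimal compilable sketch of just that decision, with illustrative stand-ins for zstd's internal constants and enum (not the shipped code):

/* choose_lit_location(): mirrors the three outcomes of the hunk above. */
#include <stddef.h>

#define BLOCKSIZE_MAX (128 * 1024)  /* stand-in for ZSTD_BLOCKSIZE_MAX */
#define OVERLENGTH    32            /* stand-in for WILDCOPY_OVERLENGTH */
#define LITEXTRA      (64 * 1024)   /* stand-in for ZSTD_LITBUFFEREXTRASIZE */

typedef enum { LIT_IN_DST, LIT_SPLIT, LIT_NOT_IN_DST } lit_location;

static lit_location choose_lit_location(size_t dstCapacity, size_t litSize, int streaming)
{
    /* whole literals buffer fits in dst, past where the block will be written */
    if (!streaming && dstCapacity > BLOCKSIZE_MAX + OVERLENGTH + litSize + OVERLENGTH)
        return LIT_IN_DST;
    /* too large for the side buffer: tail at end of dst, head in litExtraBuffer */
    if (litSize > LITEXTRA)
        return LIT_SPLIT;
    /* small enough to live entirely in the side buffer */
    return LIT_NOT_IN_DST;
}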
@@ -90,7 +131,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  case set_repeat:
  DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
  RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
- /* fall-through */
+ ZSTD_FALLTHROUGH;

  case set_compressed:
  RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
@@ -99,6 +140,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  U32 const lhlCode = (istart[0] >> 2) & 3;
  U32 const lhc = MEM_readLE32(istart);
  size_t hufSuccess;
+ size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
  switch(lhlCode)
  {
  case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -121,8 +163,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
  break;
  }
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
  RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
  RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
+ RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);

  /* prefetch huffman table if cold */
  if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
@@ -133,11 +178,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  if (singleStream) {
  hufSuccess = HUF_decompress1X_usingDTable_bmi2(
  dctx->litBuffer, litSize, istart+lhSize, litCSize,
- dctx->HUFptr, dctx->bmi2);
+ dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
  } else {
  hufSuccess = HUF_decompress4X_usingDTable_bmi2(
  dctx->litBuffer, litSize, istart+lhSize, litCSize,
- dctx->HUFptr, dctx->bmi2);
+ dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
  }
  } else {
  if (singleStream) {
@@ -150,15 +195,22 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
  istart+lhSize, litCSize, dctx->workspace,
- sizeof(dctx->workspace), dctx->bmi2);
+ sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
  #endif
  } else {
  hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
  dctx->entropy.hufTable, dctx->litBuffer, litSize,
  istart+lhSize, litCSize, dctx->workspace,
- sizeof(dctx->workspace), dctx->bmi2);
+ sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
  }
  }
+ if (dctx->litBufferLocation == ZSTD_split)
+ {
+ ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
+ ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);
+ dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
+ dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
+ }

  RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");

@@ -166,13 +218,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  dctx->litSize = litSize;
  dctx->litEntropy = 1;
  if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
- memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
  return litCSize + lhSize;
  }

  case set_basic:
  { size_t litSize, lhSize;
  U32 const lhlCode = ((istart[0]) >> 2) & 3;
+ size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
  switch(lhlCode)
  {
  case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -189,23 +241,36 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  break;
  }

+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+ RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
  if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */
  RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
- memcpy(dctx->litBuffer, istart+lhSize, litSize);
+ if (dctx->litBufferLocation == ZSTD_split)
+ {
+ ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE);
+ ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
+ }
+ else
+ {
+ ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize);
+ }
  dctx->litPtr = dctx->litBuffer;
  dctx->litSize = litSize;
- memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
  return lhSize+litSize;
  }
  /* direct reference into compressed stream */
  dctx->litPtr = istart+lhSize;
  dctx->litSize = litSize;
+ dctx->litBufferEnd = dctx->litPtr + litSize;
+ dctx->litBufferLocation = ZSTD_not_in_dst;
  return lhSize+litSize;
  }

  case set_rle:
  { U32 const lhlCode = ((istart[0]) >> 2) & 3;
  size_t litSize, lhSize;
+ size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
  switch(lhlCode)
  {
  case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -222,8 +287,19 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
  break;
  }
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
  RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
- memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
+ RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
+ if (dctx->litBufferLocation == ZSTD_split)
+ {
+ ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE);
+ ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE);
+ }
+ else
+ {
+ ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize);
+ }
  dctx->litPtr = dctx->litBuffer;
  dctx->litSize = litSize;
  return lhSize+1;
@@ -236,7 +312,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,

  /* Default FSE distribution tables.
  * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
- * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions
+ * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
  * They were generated programmatically with following method :
  * - start from default distributions, present in /lib/common/zstd_internal.h
  * - generate tables normally, using ZSTD_buildFSETable()
@@ -343,7 +419,7 @@ static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
  }; /* ML_defaultDTable */


- static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits)
+ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits)
  {
  void* ptr = dt;
  ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
@@ -355,7 +431,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
  cell->nbBits = 0;
  cell->nextState = 0;
  assert(nbAddBits < 255);
- cell->nbAdditionalBits = (BYTE)nbAddBits;
+ cell->nbAdditionalBits = nbAddBits;
  cell->baseValue = baseValue;
  }

@@ -364,23 +440,26 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
  * generate FSE decoding table for one symbol (ll, ml or off)
  * cannot fail if input is valid =>
  * all inputs are presumed validated at this stage */
- void
- ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+ FORCE_INLINE_TEMPLATE
+ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
  const short* normalizedCounter, unsigned maxSymbolValue,
- const U32* baseValue, const U32* nbAdditionalBits,
- unsigned tableLog)
+ const U32* baseValue, const U8* nbAdditionalBits,
+ unsigned tableLog, void* wksp, size_t wkspSize)
  {
  ZSTD_seqSymbol* const tableDecode = dt+1;
- U16 symbolNext[MaxSeq+1];
-
  U32 const maxSV1 = maxSymbolValue + 1;
  U32 const tableSize = 1 << tableLog;
- U32 highThreshold = tableSize-1;
+
+ U16* symbolNext = (U16*)wksp;
+ BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
+ U32 highThreshold = tableSize - 1;
+

  /* Sanity Checks */
  assert(maxSymbolValue <= MaxSeq);
  assert(tableLog <= MaxFSELog);
-
+ assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
+ (void)wkspSize;
  /* Init, lay down lowprob symbols */
  { ZSTD_seqSymbol_header DTableH;
  DTableH.tableLog = tableLog;
@@ -396,16 +475,69 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
  assert(normalizedCounter[s]>=0);
  symbolNext[s] = (U16)normalizedCounter[s];
  } } }
- memcpy(dt, &DTableH, sizeof(DTableH));
+ ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
  }

  /* Spread symbols */
- { U32 const tableMask = tableSize-1;
+ assert(tableSize <= 512);
+ /* Specialized symbol spreading for the case when there are
+ * no low probability (-1 count) symbols. When compressing
+ * small blocks we avoid low probability symbols to hit this
+ * case, since header decoding speed matters more.
+ */
+ if (highThreshold == tableSize - 1) {
+ size_t const tableMask = tableSize-1;
+ size_t const step = FSE_TABLESTEP(tableSize);
+ /* First lay down the symbols in order.
+ * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
+ * misses since small blocks generally have small table logs, so nearly
+ * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
+ * our buffer to handle the over-write.
+ */
+ {
+ U64 const add = 0x0101010101010101ull;
+ size_t pos = 0;
+ U64 sv = 0;
+ U32 s;
+ for (s=0; s<maxSV1; ++s, sv += add) {
+ int i;
+ int const n = normalizedCounter[s];
+ MEM_write64(spread + pos, sv);
+ for (i = 8; i < n; i += 8) {
+ MEM_write64(spread + pos + i, sv);
+ }
+ pos += n;
+ }
+ }
+ /* Now we spread those positions across the table.
+ * The benefit of doing it in two stages is that we avoid the the
+ * variable size inner loop, which caused lots of branch misses.
+ * Now we can run through all the positions without any branch misses.
+ * We unroll the loop twice, since that is what emperically worked best.
+ */
+ {
+ size_t position = 0;
+ size_t s;
+ size_t const unroll = 2;
+ assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
+ for (s = 0; s < (size_t)tableSize; s += unroll) {
+ size_t u;
+ for (u = 0; u < unroll; ++u) {
+ size_t const uPosition = (position + (u * step)) & tableMask;
+ tableDecode[uPosition].baseValue = spread[s + u];
+ }
+ position = (position + (unroll * step)) & tableMask;
+ }
+ assert(position == 0);
+ }
+ } else {
+ U32 const tableMask = tableSize-1;
  U32 const step = FSE_TABLESTEP(tableSize);
  U32 s, position = 0;
  for (s=0; s<maxSV1; s++) {
  int i;
- for (i=0; i<normalizedCounter[s]; i++) {
+ int const n = normalizedCounter[s];
+ for (i=0; i<n; i++) {
  tableDecode[position].baseValue = s;
  position = (position + step) & tableMask;
  while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */
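
The specialized path above replaces the variable-length inner loop with two predictable passes: first lay every symbol down contiguously, 8 bytes per store, then scatter that run across the table with the fixed FSE step. A self-contained sketch of the same two-stage idea (illustrative names; the step formula is the real FSE_TABLESTEP):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stage 1: lay symbols down in order, writing 8 copies at a time. */
static void lay_down(uint8_t *spread, const short *counts, unsigned maxSV1)
{
    const uint64_t add = 0x0101010101010101ULL;
    uint64_t sv = 0;   /* 8 byte-copies of the current symbol value */
    size_t pos = 0;
    for (unsigned s = 0; s < maxSV1; ++s, sv += add) {
        int const n = counts[s];
        memcpy(spread + pos, &sv, 8);           /* stand-in for MEM_write64() */
        for (int i = 8; i < n; i += 8)
            memcpy(spread + pos + i, &sv, 8);   /* over-write absorbed by +8 slack */
        pos += (size_t)n;
    }
}

/* Stage 2: scatter the contiguous run across the table with a fixed step. */
static void scatter(uint8_t *table, const uint8_t *spread, size_t tableSize)
{
    size_t const mask = tableSize - 1;
    size_t const step = (tableSize >> 1) + (tableSize >> 3) + 3;  /* FSE_TABLESTEP */
    size_t position = 0;
    for (size_t s = 0; s < tableSize; ++s) {
        table[position] = spread[s];
        position = (position + step) & mask;
    }
    assert(position == 0);  /* the step is coprime with the table size */
}

int main(void)
{
    short counts[3] = { 20, 8, 4 };   /* normalized counts summing to 32 */
    uint8_t spread[32 + 8];           /* 8 bytes of slack for the over-write */
    uint8_t table[32];
    lay_down(spread, counts, 3);
    scatter(table, spread, 32);
    for (size_t i = 0; i < 32; ++i) printf("%u", table[i]);
    printf("\n");
    return 0;
}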
@@ -414,16 +546,56 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
  }

  /* Build Decoding table */
- { U32 u;
+ {
+ U32 u;
  for (u=0; u<tableSize; u++) {
  U32 const symbol = tableDecode[u].baseValue;
  U32 const nextState = symbolNext[symbol]++;
  tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
  tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
  assert(nbAdditionalBits[symbol] < 255);
- tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
+ tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
  tableDecode[u].baseValue = baseValue[symbol];
- } }
+ }
+ }
+ }
+
+ /* Avoids the FORCE_INLINE of the _body() function. */
+ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
+ const short* normalizedCounter, unsigned maxSymbolValue,
+ const U32* baseValue, const U8* nbAdditionalBits,
+ unsigned tableLog, void* wksp, size_t wkspSize)
+ {
+ ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+ }
+
+ #if DYNAMIC_BMI2
+ BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
+ const short* normalizedCounter, unsigned maxSymbolValue,
+ const U32* baseValue, const U8* nbAdditionalBits,
+ unsigned tableLog, void* wksp, size_t wkspSize)
+ {
+ ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+ }
+ #endif
+
+ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+ const short* normalizedCounter, unsigned maxSymbolValue,
+ const U32* baseValue, const U8* nbAdditionalBits,
+ unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
+ {
+ #if DYNAMIC_BMI2
+ if (bmi2) {
+ ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue,
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
+ return;
+ }
+ #endif
+ (void)bmi2;
+ ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue,
+ baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
  }


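The new ZSTD_buildFSETable() wrapper above is zstd's DYNAMIC_BMI2 pattern: a single force-inlined body, two thin outlined copies (one compiled with BMI2 enabled), and a runtime switch on the detected CPU. A minimal sketch using the raw GCC/Clang attributes that FORCE_INLINE_TEMPLATE and BMI2_TARGET_ATTRIBUTE expand to (x86-only as written; the summing body is purely illustrative):

#include <stddef.h>

static inline __attribute__((always_inline))
size_t body(const unsigned char *src, size_t n)
{
    size_t acc = 0;
    for (size_t i = 0; i < n; ++i) acc += src[i];
    return acc;
}

static size_t body_default(const unsigned char *src, size_t n)
{
    return body(src, n);   /* compiled for the baseline target */
}

__attribute__((target("bmi2")))
static size_t body_bmi2(const unsigned char *src, size_t n)
{
    return body(src, n);   /* same source, BMI2 instructions allowed */
}

size_t dispatch(const unsigned char *src, size_t n, int bmi2)
{
    return bmi2 ? body_bmi2(src, n) : body_default(src, n);
}
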
@@ -433,9 +605,10 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
433
605
  static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
434
606
  symbolEncodingType_e type, unsigned max, U32 maxLog,
435
607
  const void* src, size_t srcSize,
436
- const U32* baseValue, const U32* nbAdditionalBits,
608
+ const U32* baseValue, const U8* nbAdditionalBits,
437
609
  const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
438
- int ddictIsCold, int nbSeq)
610
+ int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
611
+ int bmi2)
439
612
  {
440
613
  switch(type)
441
614
  {
@@ -444,7 +617,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
444
617
  RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
445
618
  { U32 const symbol = *(const BYTE*)src;
446
619
  U32 const baseline = baseValue[symbol];
447
- U32 const nbBits = nbAdditionalBits[symbol];
620
+ U8 const nbBits = nbAdditionalBits[symbol];
448
621
  ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
449
622
  }
450
623
  *DTablePtr = DTableSpace;
@@ -467,7 +640,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
467
640
  size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
468
641
  RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
469
642
  RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
470
- ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog);
643
+ ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2);
471
644
  *DTablePtr = DTableSpace;
472
645
  return headerSize;
473
646
  }
@@ -480,7 +653,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
480
653
  size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
481
654
  const void* src, size_t srcSize)
482
655
  {
483
- const BYTE* const istart = (const BYTE* const)src;
656
+ const BYTE* const istart = (const BYTE*)src;
484
657
  const BYTE* const iend = istart + srcSize;
485
658
  const BYTE* ip = istart;
486
659
  int nbSeq;
@@ -499,7 +672,8 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
499
672
  if (nbSeq > 0x7F) {
500
673
  if (nbSeq == 0xFF) {
501
674
  RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
502
- nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
675
+ nbSeq = MEM_readLE16(ip) + LONGNBSEQ;
676
+ ip+=2;
503
677
  } else {
504
678
  RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
505
679
  nbSeq = ((nbSeq-0x80)<<8) + *ip++;
@@ -520,7 +694,9 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
520
694
  ip, iend-ip,
521
695
  LL_base, LL_bits,
522
696
  LL_defaultDTable, dctx->fseEntropy,
523
- dctx->ddictIsCold, nbSeq);
697
+ dctx->ddictIsCold, nbSeq,
698
+ dctx->workspace, sizeof(dctx->workspace),
699
+ ZSTD_DCtx_get_bmi2(dctx));
524
700
  RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
525
701
  ip += llhSize;
526
702
  }
@@ -530,7 +706,9 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
530
706
  ip, iend-ip,
531
707
  OF_base, OF_bits,
532
708
  OF_defaultDTable, dctx->fseEntropy,
533
- dctx->ddictIsCold, nbSeq);
709
+ dctx->ddictIsCold, nbSeq,
710
+ dctx->workspace, sizeof(dctx->workspace),
711
+ ZSTD_DCtx_get_bmi2(dctx));
534
712
  RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
535
713
  ip += ofhSize;
536
714
  }
@@ -540,7 +718,9 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
540
718
  ip, iend-ip,
541
719
  ML_base, ML_bits,
542
720
  ML_defaultDTable, dctx->fseEntropy,
543
- dctx->ddictIsCold, nbSeq);
721
+ dctx->ddictIsCold, nbSeq,
722
+ dctx->workspace, sizeof(dctx->workspace),
723
+ ZSTD_DCtx_get_bmi2(dctx));
544
724
  RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
545
725
  ip += mlhSize;
546
726
  }
@@ -554,7 +734,6 @@ typedef struct {
554
734
  size_t litLength;
555
735
  size_t matchLength;
556
736
  size_t offset;
557
- const BYTE* match;
558
737
  } seq_t;
559
738
 
560
739
  typedef struct {
@@ -568,9 +747,6 @@ typedef struct {
568
747
  ZSTD_fseState stateOffb;
569
748
  ZSTD_fseState stateML;
570
749
  size_t prevOffset[ZSTD_REP_NUM];
571
- const BYTE* prefixStart;
572
- const BYTE* dictEnd;
573
- size_t pos;
574
750
  } seqState_t;
575
751
 
576
752
  /*! ZSTD_overlapCopy8() :
@@ -613,7 +789,7 @@ HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
613
789
  * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
614
790
  * The src buffer must be before the dst buffer.
615
791
  */
616
- static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
792
+ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
617
793
  ptrdiff_t const diff = op - ip;
618
794
  BYTE* const oend = op + length;
619
795
 
@@ -629,6 +805,7 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
629
805
  /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
630
806
  assert(length >= 8);
631
807
  ZSTD_overlapCopy8(&op, &ip, diff);
808
+ length -= 8;
632
809
  assert(op - ip >= 8);
633
810
  assert(op <= oend);
634
811
  }
@@ -643,8 +820,31 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
643
820
  assert(oend > oend_w);
644
821
  ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
645
822
  ip += oend_w - op;
646
- op = oend_w;
823
+ op += oend_w - op;
824
+ }
825
+ /* Handle the leftovers. */
826
+ while (op < oend) *op++ = *ip++;
827
+ }
828
+
829
+ /* ZSTD_safecopyDstBeforeSrc():
830
+ * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
831
+ * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
832
+ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) {
833
+ ptrdiff_t const diff = op - ip;
834
+ BYTE* const oend = op + length;
835
+
836
+ if (length < 8 || diff > -8) {
837
+ /* Handle short lengths, close overlaps, and dst not before src. */
838
+ while (op < oend) *op++ = *ip++;
839
+ return;
840
+ }
841
+
842
+ if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) {
843
+ ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap);
844
+ ip += oend - WILDCOPY_OVERLENGTH - op;
845
+ op += oend - WILDCOPY_OVERLENGTH - op;
647
846
  }
847
+
648
848
  /* Handle the leftovers. */
649
849
  while (op < oend) *op++ = *ip++;
650
850
  }
@@ -659,9 +859,9 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
659
859
  */
660
860
  FORCE_NOINLINE
661
861
  size_t ZSTD_execSequenceEnd(BYTE* op,
662
- BYTE* const oend, seq_t sequence,
663
- const BYTE** litPtr, const BYTE* const litLimit,
664
- const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
862
+ BYTE* const oend, seq_t sequence,
863
+ const BYTE** litPtr, const BYTE* const litLimit,
864
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
665
865
  {
666
866
  BYTE* const oLitEnd = op + sequence.litLength;
667
867
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
@@ -684,27 +884,76 @@ size_t ZSTD_execSequenceEnd(BYTE* op,
684
884
  if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
685
885
  /* offset beyond prefix */
686
886
  RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
687
- match = dictEnd - (prefixStart-match);
887
+ match = dictEnd - (prefixStart - match);
688
888
  if (match + sequence.matchLength <= dictEnd) {
689
- memmove(oLitEnd, match, sequence.matchLength);
889
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
690
890
  return sequenceLength;
691
891
  }
692
892
  /* span extDict & currentPrefixSegment */
693
893
  { size_t const length1 = dictEnd - match;
694
- memmove(oLitEnd, match, length1);
695
- op = oLitEnd + length1;
696
- sequence.matchLength -= length1;
697
- match = prefixStart;
698
- } }
894
+ ZSTD_memmove(oLitEnd, match, length1);
895
+ op = oLitEnd + length1;
896
+ sequence.matchLength -= length1;
897
+ match = prefixStart;
898
+ }
899
+ }
900
+ ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
901
+ return sequenceLength;
902
+ }
903
+
904
+ /* ZSTD_execSequenceEndSplitLitBuffer():
905
+ * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case.
906
+ */
907
+ FORCE_NOINLINE
908
+ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
909
+ BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
910
+ const BYTE** litPtr, const BYTE* const litLimit,
911
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
912
+ {
913
+ BYTE* const oLitEnd = op + sequence.litLength;
914
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
915
+ const BYTE* const iLitEnd = *litPtr + sequence.litLength;
916
+ const BYTE* match = oLitEnd - sequence.offset;
917
+
918
+
919
+ /* bounds checks : careful of address space overflow in 32-bit mode */
920
+ RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
921
+ RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
922
+ assert(op < op + sequenceLength);
923
+ assert(oLitEnd < op + sequenceLength);
924
+
925
+ /* copy literals */
926
+ RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer");
927
+ ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength);
928
+ op = oLitEnd;
929
+ *litPtr = iLitEnd;
930
+
931
+ /* copy Match */
932
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
933
+ /* offset beyond prefix */
934
+ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
935
+ match = dictEnd - (prefixStart - match);
936
+ if (match + sequence.matchLength <= dictEnd) {
937
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
938
+ return sequenceLength;
939
+ }
940
+ /* span extDict & currentPrefixSegment */
941
+ { size_t const length1 = dictEnd - match;
942
+ ZSTD_memmove(oLitEnd, match, length1);
943
+ op = oLitEnd + length1;
944
+ sequence.matchLength -= length1;
945
+ match = prefixStart;
946
+ }
947
+ }
699
948
  ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
700
949
  return sequenceLength;
701
950
  }
702
951
 
703
952
  HINT_INLINE
704
953
  size_t ZSTD_execSequence(BYTE* op,
705
- BYTE* const oend, seq_t sequence,
706
- const BYTE** litPtr, const BYTE* const litLimit,
707
- const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
954
+ BYTE* const oend, seq_t sequence,
955
+ const BYTE** litPtr, const BYTE* const litLimit,
956
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
708
957
  {
709
958
  BYTE* const oLitEnd = op + sequence.litLength;
710
959
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
@@ -713,6 +962,98 @@ size_t ZSTD_execSequence(BYTE* op,
713
962
  const BYTE* const iLitEnd = *litPtr + sequence.litLength;
714
963
  const BYTE* match = oLitEnd - sequence.offset;
715
964
 
965
+ assert(op != NULL /* Precondition */);
966
+ assert(oend_w < oend /* No underflow */);
967
+ /* Handle edge cases in a slow path:
968
+ * - Read beyond end of literals
969
+ * - Match end is within WILDCOPY_OVERLIMIT of oend
970
+ * - 32-bit mode and the match length overflows
971
+ */
972
+ if (UNLIKELY(
973
+ iLitEnd > litLimit ||
974
+ oMatchEnd > oend_w ||
975
+ (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
976
+ return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
977
+
978
+ /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
979
+ assert(op <= oLitEnd /* No overflow */);
980
+ assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
981
+ assert(oMatchEnd <= oend /* No underflow */);
982
+ assert(iLitEnd <= litLimit /* Literal length is in bounds */);
983
+ assert(oLitEnd <= oend_w /* Can wildcopy literals */);
984
+ assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
985
+
986
+ /* Copy Literals:
987
+ * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
988
+ * We likely don't need the full 32-byte wildcopy.
989
+ */
990
+ assert(WILDCOPY_OVERLENGTH >= 16);
991
+ ZSTD_copy16(op, (*litPtr));
992
+ if (UNLIKELY(sequence.litLength > 16)) {
993
+ ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);
994
+ }
995
+ op = oLitEnd;
996
+ *litPtr = iLitEnd; /* update for next sequence */
997
+
998
+ /* Copy Match */
999
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
1000
+ /* offset beyond prefix -> go into extDict */
1001
+ RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
1002
+ match = dictEnd + (match - prefixStart);
1003
+ if (match + sequence.matchLength <= dictEnd) {
1004
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
1005
+ return sequenceLength;
1006
+ }
1007
+ /* span extDict & currentPrefixSegment */
1008
+ { size_t const length1 = dictEnd - match;
1009
+ ZSTD_memmove(oLitEnd, match, length1);
1010
+ op = oLitEnd + length1;
1011
+ sequence.matchLength -= length1;
1012
+ match = prefixStart;
1013
+ }
1014
+ }
1015
+ /* Match within prefix of 1 or more bytes */
1016
+ assert(op <= oMatchEnd);
1017
+ assert(oMatchEnd <= oend_w);
1018
+ assert(match >= prefixStart);
1019
+ assert(sequence.matchLength >= 1);
1020
+
1021
+ /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
1022
+ * without overlap checking.
1023
+ */
1024
+ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
1025
+ /* We bet on a full wildcopy for matches, since we expect matches to be
1026
+ * longer than literals (in general). In silesia, ~10% of matches are longer
1027
+ * than 16 bytes.
1028
+ */
1029
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
1030
+ return sequenceLength;
1031
+ }
1032
+ assert(sequence.offset < WILDCOPY_VECLEN);
1033
+
1034
+ /* Copy 8 bytes and spread the offset to be >= 8. */
1035
+ ZSTD_overlapCopy8(&op, &match, sequence.offset);
1036
+
1037
+ /* If the match length is > 8 bytes, then continue with the wildcopy. */
1038
+ if (sequence.matchLength > 8) {
1039
+ assert(op < oMatchEnd);
1040
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);
1041
+ }
1042
+ return sequenceLength;
1043
+ }
1044
+
1045
+ HINT_INLINE
1046
+ size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,
1047
+ BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
1048
+ const BYTE** litPtr, const BYTE* const litLimit,
1049
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
1050
+ {
1051
+ BYTE* const oLitEnd = op + sequence.litLength;
1052
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
1053
+ BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
1054
+ const BYTE* const iLitEnd = *litPtr + sequence.litLength;
1055
+ const BYTE* match = oLitEnd - sequence.offset;
1056
+
716
1057
  assert(op != NULL /* Precondition */);
717
1058
  assert(oend_w < oend /* No underflow */);
718
1059
  /* Handle edge cases in a slow path:
@@ -724,7 +1065,7 @@ size_t ZSTD_execSequence(BYTE* op,
724
1065
  iLitEnd > litLimit ||
725
1066
  oMatchEnd > oend_w ||
726
1067
  (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
727
- return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
1068
+ return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
728
1069
 
729
1070
  /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
730
1071
  assert(op <= oLitEnd /* No overflow */);
@@ -752,12 +1093,12 @@ size_t ZSTD_execSequence(BYTE* op,
752
1093
  RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
753
1094
  match = dictEnd + (match - prefixStart);
754
1095
  if (match + sequence.matchLength <= dictEnd) {
755
- memmove(oLitEnd, match, sequence.matchLength);
1096
+ ZSTD_memmove(oLitEnd, match, sequence.matchLength);
756
1097
  return sequenceLength;
757
1098
  }
758
1099
  /* span extDict & currentPrefixSegment */
759
1100
  { size_t const length1 = dictEnd - match;
760
- memmove(oLitEnd, match, length1);
1101
+ ZSTD_memmove(oLitEnd, match, length1);
761
1102
  op = oLitEnd + length1;
762
1103
  sequence.matchLength -= length1;
763
1104
  match = prefixStart;
@@ -792,6 +1133,7 @@ size_t ZSTD_execSequence(BYTE* op,
792
1133
  return sequenceLength;
793
1134
  }
794
1135
 
1136
+
795
1137
  static void
796
1138
  ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
797
1139
  {
@@ -805,20 +1147,10 @@ ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqS
805
1147
  }
806
1148
 
807
1149
  FORCE_INLINE_TEMPLATE void
808
- ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
1150
+ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits)
809
1151
  {
810
- ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state];
811
- U32 const nbBits = DInfo.nbBits;
812
1152
  size_t const lowBits = BIT_readBits(bitD, nbBits);
813
- DStatePtr->state = DInfo.nextState + lowBits;
814
- }
815
-
816
- FORCE_INLINE_TEMPLATE void
817
- ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo)
818
- {
819
- U32 const nbBits = DInfo.nbBits;
820
- size_t const lowBits = BIT_readBits(bitD, nbBits);
821
- DStatePtr->state = DInfo.nextState + lowBits;
1153
+ DStatePtr->state = nextState + lowBits;
822
1154
  }
823
1155
 
824
1156
  /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
@@ -832,123 +1164,112 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD
832
1164
  : 0)
833
1165
 
834
1166
  typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
835
- typedef enum { ZSTD_p_noPrefetch=0, ZSTD_p_prefetch=1 } ZSTD_prefetch_e;
836
1167
 
837
1168
  FORCE_INLINE_TEMPLATE seq_t
838
- ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const ZSTD_prefetch_e prefetch)
1169
+ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
839
1170
  {
840
1171
  seq_t seq;
841
- ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state];
842
- ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state];
843
- ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state];
844
- U32 const llBase = llDInfo.baseValue;
845
- U32 const mlBase = mlDInfo.baseValue;
846
- U32 const ofBase = ofDInfo.baseValue;
847
- BYTE const llBits = llDInfo.nbAdditionalBits;
848
- BYTE const mlBits = mlDInfo.nbAdditionalBits;
849
- BYTE const ofBits = ofDInfo.nbAdditionalBits;
850
- BYTE const totalBits = llBits+mlBits+ofBits;
851
-
852
- /* sequence */
853
- { size_t offset;
854
- if (ofBits > 1) {
855
- ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
856
- ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
857
- assert(ofBits <= MaxOff);
858
- if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
859
- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
860
- offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
861
- BIT_reloadDStream(&seqState->DStream);
862
- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
863
- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
864
- } else {
865
- offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
866
- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
867
- }
868
- seqState->prevOffset[2] = seqState->prevOffset[1];
869
- seqState->prevOffset[1] = seqState->prevOffset[0];
870
- seqState->prevOffset[0] = offset;
871
- } else {
872
- U32 const ll0 = (llBase == 0);
873
- if (LIKELY((ofBits == 0))) {
874
- if (LIKELY(!ll0))
875
- offset = seqState->prevOffset[0];
876
- else {
877
- offset = seqState->prevOffset[1];
878
- seqState->prevOffset[1] = seqState->prevOffset[0];
879
- seqState->prevOffset[0] = offset;
1172
+ const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
1173
+ const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
1174
+ const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
1175
+ seq.matchLength = mlDInfo->baseValue;
1176
+ seq.litLength = llDInfo->baseValue;
1177
+ { U32 const ofBase = ofDInfo->baseValue;
1178
+ BYTE const llBits = llDInfo->nbAdditionalBits;
1179
+ BYTE const mlBits = mlDInfo->nbAdditionalBits;
1180
+ BYTE const ofBits = ofDInfo->nbAdditionalBits;
1181
+ BYTE const totalBits = llBits+mlBits+ofBits;
1182
+
1183
+ U16 const llNext = llDInfo->nextState;
1184
+ U16 const mlNext = mlDInfo->nextState;
1185
+ U16 const ofNext = ofDInfo->nextState;
1186
+ U32 const llnbBits = llDInfo->nbBits;
1187
+ U32 const mlnbBits = mlDInfo->nbBits;
1188
+ U32 const ofnbBits = ofDInfo->nbBits;
1189
+ /*
1190
+ * As gcc has better branch and block analyzers, sometimes it is only
1191
+ * valuable to mark likelyness for clang, it gives around 3-4% of
1192
+ * performance.
1193
+ */
1194
+
1195
+ /* sequence */
1196
+ { size_t offset;
1197
+ #if defined(__clang__)
1198
+ if (LIKELY(ofBits > 1)) {
1199
+ #else
1200
+ if (ofBits > 1) {
1201
+ #endif
1202
+ ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
1203
+ ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
1204
+ assert(ofBits <= MaxOff);
1205
+ if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
1206
+ U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
1207
+ offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
1208
+ BIT_reloadDStream(&seqState->DStream);
1209
+ if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
1210
+ assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
1211
+ } else {
1212
+ offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
1213
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
880
1214
  }
1215
+ seqState->prevOffset[2] = seqState->prevOffset[1];
1216
+                seqState->prevOffset[1] = seqState->prevOffset[0];
+                seqState->prevOffset[0] = offset;
             } else {
-                offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
-                {   size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
-                    temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */
-                    if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
-                    seqState->prevOffset[1] = seqState->prevOffset[0];
-                    seqState->prevOffset[0] = offset = temp;
-            }   }   }
-            seq.offset = offset;
-        }
-
-        seq.matchLength = mlBase;
-        if (mlBits > 0)
-            seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
-
-        if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
-            BIT_reloadDStream(&seqState->DStream);
-        if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
-            BIT_reloadDStream(&seqState->DStream);
-        /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
-        ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
-
-        seq.litLength = llBase;
-        if (llBits > 0)
-            seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
-
-        if (MEM_32bits())
-            BIT_reloadDStream(&seqState->DStream);
-
-        DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
-                    (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
-
-        if (prefetch == ZSTD_p_prefetch) {
-            size_t const pos = seqState->pos + seq.litLength;
-            const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
-            seq.match = matchBase + pos - seq.offset;  /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
-                                                        * No consequence though : no memory access will occur, offset is only used for prefetching */
-            seqState->pos = pos + seq.matchLength;
-        }
-
-        /* ANS state update
-         * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo().
-         * clang-9.2.0 does 7% worse with ZSTD_updateFseState().
-         * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the
-         * better option, so it is the default for other compilers. But, if you
-         * measure that it is worse, please put up a pull request.
-         */
-        {
-#if defined(__GNUC__) && !defined(__clang__)
-            const int kUseUpdateFseState = 1;
-#else
-            const int kUseUpdateFseState = 0;
-#endif
-            if (kUseUpdateFseState) {
-                ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream);    /* <=  9 bits */
-                ZSTD_updateFseState(&seqState->stateML, &seqState->DStream);    /* <=  9 bits */
-                if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
-                ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream);  /* <=  8 bits */
-            } else {
-                ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo);    /* <=  9 bits */
-                ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo);    /* <=  9 bits */
-                if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
-                ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo);  /* <=  8 bits */
+            U32 const ll0 = (llDInfo->baseValue == 0);
+            if (LIKELY((ofBits == 0))) {
+                offset = seqState->prevOffset[ll0];
+                seqState->prevOffset[1] = seqState->prevOffset[!ll0];
+                seqState->prevOffset[0] = offset;
+            } else {
+                offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
+                {   size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+                    temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */
+                    if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+                    seqState->prevOffset[1] = seqState->prevOffset[0];
+                    seqState->prevOffset[0] = offset = temp;
+            }   }   }
+            seq.offset = offset;
         }
+
+#if defined(__clang__)
+        if (UNLIKELY(mlBits > 0))
+#else
+        if (mlBits > 0)
+#endif
+            seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
+
+        if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
+            BIT_reloadDStream(&seqState->DStream);
+        if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
+            BIT_reloadDStream(&seqState->DStream);
+        /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
+        ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
+
+#if defined(__clang__)
+        if (UNLIKELY(llBits > 0))
+#else
+        if (llBits > 0)
+#endif
+            seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
+
+        if (MEM_32bits())
+            BIT_reloadDStream(&seqState->DStream);
+
+        DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
+                    (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+
+        ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <=  9 bits */
+        ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <=  9 bits */
+        if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
+        ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits);  /* <=  8 bits */
     }
 
     return seq;
 }
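
The rewritten offset branch above is zstd's repeat-offset ("repcode") handling: offset codes with no extra bits reuse one of the three most recently seen offsets, a literal length of zero (`ll0`) shifts which slot is selected, and the fourth combination stands for "previous offset minus one". A minimal standalone sketch of that history-update rule, using hypothetical names (`prev`, `useRepOffset`) rather than the library's:

    #include <stddef.h>

    /* prev[] mirrors seqState->prevOffset: the three most recent match offsets. */
    static size_t prev[3] = {1, 4, 8};

    /* idx is the repcode selection after the ll0 shift: 0..2 pick a stored
     * offset; 3 (reachable only when ll0 == 1) means prev[0] - 1. */
    static size_t useRepOffset(size_t idx)
    {
        if (idx == 0)
            return prev[0];                   /* most recent offset: history unchanged */
        {   size_t offset = (idx == 3) ? prev[0] - 1 : prev[idx];
            offset += !offset;                /* 0 is invalid (corrupt input): force 1 */
            if (idx != 1) prev[2] = prev[1];  /* idx == 1 is a plain swap of slots 0 and 1 */
            prev[1] = prev[0];
            prev[0] = offset;
            return offset;
    }   }

Repcodes make recently used distances nearly free in the offset stream, which is why the decoder keeps this three-deep history in `seqState->prevOffset` across sequences and saves it back into `dctx->entropy.rep` at the end of each block.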

 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
+MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
 {
     size_t const windowSize = dctx->fParams.windowSize;
     /* No dictionary used. */
@@ -969,6 +1290,7 @@ MEM_STATIC void ZSTD_assertValidSequence(
                 seq_t const seq,
                 BYTE const* prefixStart, BYTE const* virtualStart)
 {
+#if DEBUGLEVEL >= 1
     size_t const windowSize = dctx->fParams.windowSize;
     size_t const sequenceSize = seq.litLength + seq.matchLength;
     BYTE const* const oLitEnd = op + seq.litLength;
@@ -986,13 +1308,18 @@ MEM_STATIC void ZSTD_assertValidSequence(
         /* Offset must be within our window. */
         assert(seq.offset <= windowSize);
     }
+#else
+    (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
+#endif
 }
 #endif
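
The `#if DEBUGLEVEL >= 1` / `#else` split added to `ZSTD_assertValidSequence` is the usual way to compile a validation body out of release builds while keeping the signature unchanged: the comma-chained `(void)` casts mark every parameter as formally used so `-Wunused-parameter` stays quiet. The same pattern in isolation, with a hypothetical `checkRange` helper not taken from this source:

    #include <assert.h>

    #ifndef DEBUGLEVEL
    #  define DEBUGLEVEL 0        /* release builds: checks disabled */
    #endif

    static void checkRange(int lo, int hi, int value)
    {
    #if DEBUGLEVEL >= 1
        assert(lo <= hi);
        assert(lo <= value && value <= hi);
    #else
        /* parameters otherwise unused: silence -Wunused-parameter */
        (void)lo, (void)hi, (void)value;
    #endif
    }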

 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+
+
 FORCE_INLINE_TEMPLATE size_t
 DONT_VECTORIZE
-ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
+ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
                                void* dst, size_t maxDstSize,
                          const void* seqStart, size_t seqSize, int nbSeq,
                          const ZSTD_longOffset_e isLongOffset,
@@ -1000,21 +1327,20 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
 {
     const BYTE* ip = (const BYTE*)seqStart;
     const BYTE* const iend = ip + seqSize;
-    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const ostart = (BYTE*)dst;
     BYTE* const oend = ostart + maxDstSize;
     BYTE* op = ostart;
     const BYTE* litPtr = dctx->litPtr;
-    const BYTE* const litEnd = litPtr + dctx->litSize;
+    const BYTE* litBufferEnd = dctx->litBufferEnd;
     const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
     const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
     const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
-    DEBUGLOG(5, "ZSTD_decompressSequences_body");
+    DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer");
     (void)frame;
 
     /* Regen sequences */
     if (nbSeq) {
         seqState_t seqState;
-        size_t error = 0;
         dctx->fseEntropy = 1;
         { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
         RETURN_ERROR_IF(
@@ -1030,70 +1356,255 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
             BIT_DStream_endOfBuffer < BIT_DStream_completed &&
             BIT_DStream_completed < BIT_DStream_overflow);
 
+        /* decompress without overrunning litPtr begins */
+        {
+            seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+            /* Align the decompression loop to 32 + 16 bytes.
+             *
+             * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
+             * speed swings based on the alignment of the decompression loop. This
+             * performance swing is caused by parts of the decompression loop falling
+             * out of the DSB. The entire decompression loop should fit in the DSB,
+             * when it can't we get much worse performance. You can measure if you've
+             * hit the good case or the bad case with this perf command for some
+             * compressed file test.zst:
+             *
+             *   perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
+             *             -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
+             *
+             * If you see most cycles served out of the MITE you've hit the bad case.
+             * If you see most cycles served out of the DSB you've hit the good case.
+             * If it is pretty even then you may be in an okay case.
+             *
+             * This issue has been reproduced on the following CPUs:
+             *   - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
+             *               Use Instruments->Counters to get DSB/MITE cycles.
+             *               I never got performance swings, but I was able to
+             *               go from the good case of mostly DSB to half of the
+             *               cycles served from MITE.
+             *   - Coffeelake: Intel i9-9900k
+             *   - Coffeelake: Intel i7-9700k
+             *
+             * I haven't been able to reproduce the instability or DSB misses on any
+             * of the following CPUs:
+             *   - Haswell
+             *   - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz
+             *   - Skylake
+             *
+             * Alignment is done for each of the three major decompression loops:
+             *   - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer
+             *   - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer
+             *   - ZSTD_decompressSequences_body
+             * Alignment choices are made to minimize large swings on bad cases and influence on performance
+             * from changes external to this code, rather than to overoptimize on the current commit.
+             *
+             * If you are seeing performance instability this script can help test.
+             * It tests on 4 commits in zstd where I saw performance change.
+             *
+             *   https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
+             */
 #if defined(__GNUC__) && defined(__x86_64__)
-    /* Align the decompression loop to 32 + 16 bytes.
-     *
-     * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
-     * speed swings based on the alignment of the decompression loop. This
-     * performance swing is caused by parts of the decompression loop falling
-     * out of the DSB. The entire decompression loop should fit in the DSB,
-     * when it can't we get much worse performance. You can measure if you've
-     * hit the good case or the bad case with this perf command for some
-     * compressed file test.zst:
-     *
-     *   perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
-     *             -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
-     *
-     * If you see most cycles served out of the MITE you've hit the bad case.
-     * If you see most cycles served out of the DSB you've hit the good case.
-     * If it is pretty even then you may be in an okay case.
-     *
-     * I've been able to reproduce this issue on the following CPUs:
-     *   - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
-     *               Use Instruments->Counters to get DSB/MITE cycles.
-     *               I never got performance swings, but I was able to
-     *               go from the good case of mostly DSB to half of the
-     *               cycles served from MITE.
-     *   - Coffeelake: Intel i9-9900k
-     *
-     * I haven't been able to reproduce the instability or DSB misses on any
-     * of the following CPUS:
-     *   - Haswell
-     *   - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH
-     *   - Skylake
-     *
-     * If you are seeing performance stability this script can help test.
-     * It tests on 4 commits in zstd where I saw performance change.
-     *
-     *   https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
-     */
-    __asm__(".p2align 5");
-    __asm__("nop");
-    __asm__(".p2align 4");
+            __asm__(".p2align 6");
+# if __GNUC__ >= 7
+            /* good for gcc-7, gcc-9, and gcc-11 */
+            __asm__("nop");
+            __asm__(".p2align 5");
+            __asm__("nop");
+            __asm__(".p2align 4");
+#  if __GNUC__ == 8 || __GNUC__ == 10
+            /* good for gcc-8 and gcc-10 */
+            __asm__("nop");
+            __asm__(".p2align 3");
+#  endif
+# endif
+#endif
+
+            /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */
+            for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) {
+                size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                assert(!ZSTD_isError(oneSeqSize));
+                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+#endif
+                if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+                    return oneSeqSize;
+                DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+                op += oneSeqSize;
+                if (UNLIKELY(!--nbSeq))
+                    break;
+                BIT_reloadDStream(&(seqState.DStream));
+                sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+            }
+
+            /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */
+            if (nbSeq > 0) {
+                const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+                if (leftoverLit)
+                {
+                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+                    sequence.litLength -= leftoverLit;
+                    op += leftoverLit;
+                }
+                litPtr = dctx->litExtraBuffer;
+                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+                dctx->litBufferLocation = ZSTD_not_in_dst;
+                {
+                    size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                    assert(!ZSTD_isError(oneSeqSize));
+                    if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+#endif
+                    if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+                        return oneSeqSize;
+                    DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+                    op += oneSeqSize;
+                    if (--nbSeq)
+                        BIT_reloadDStream(&(seqState.DStream));
+                }
+            }
+        }
+
+        if (nbSeq > 0) /* there is remaining lit from extra buffer */
+        {
+
+#if defined(__GNUC__) && defined(__x86_64__)
+            __asm__(".p2align 6");
+            __asm__("nop");
+# if __GNUC__ != 7
+            /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */
+            __asm__(".p2align 4");
+            __asm__("nop");
+            __asm__(".p2align 3");
+# elif __GNUC__ >= 11
+            __asm__(".p2align 3");
+# else
+            __asm__(".p2align 5");
+            __asm__("nop");
+            __asm__(".p2align 3");
+# endif
+#endif
+
+            for (; ; ) {
+                seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+                size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                assert(!ZSTD_isError(oneSeqSize));
+                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+#endif
+                if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+                    return oneSeqSize;
+                DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+                op += oneSeqSize;
+                if (UNLIKELY(!--nbSeq))
+                    break;
+                BIT_reloadDStream(&(seqState.DStream));
+            }
+        }
+
+        /* check if reached exact end */
+        DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);
+        RETURN_ERROR_IF(nbSeq, corruption_detected, "");
+        RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
+        /* save reps for next block */
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+    }
+
+    /* last literal segment */
+    if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
+    {
+        size_t const lastLLSize = litBufferEnd - litPtr;
+        RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
+        if (op != NULL) {
+            ZSTD_memmove(op, litPtr, lastLLSize);
+            op += lastLLSize;
+        }
+        litPtr = dctx->litExtraBuffer;
+        litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+        dctx->litBufferLocation = ZSTD_not_in_dst;
+    }
+    {   size_t const lastLLSize = litBufferEnd - litPtr;
+        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+        if (op != NULL) {
+            ZSTD_memcpy(op, litPtr, lastLLSize);
+            op += lastLLSize;
+        }
+    }
+
+    return op-ostart;
+}
+
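The new `SplitLitBuffer` body exists because decoded literals may now live either in a scratch area or directly in the tail of `dst` (tracked by `dctx->litBufferLocation`); the sequence loop must then drain the in-`dst` literals before its own output pointer catches up to them, and switch to `litExtraBuffer` for the remainder. A schematic of that hand-off with simplified, hypothetical types, where `memmove` stands in for `ZSTD_safecopyDstBeforeSrc`:

    #include <stddef.h>
    #include <string.h>

    typedef struct {
        const unsigned char* litPtr;        /* next literal byte to consume */
        const unsigned char* litBufferEnd;  /* end of the current literal region */
        const unsigned char* extraBuf;      /* fallback region (litExtraBuffer) */
        size_t               extraSize;
        int                  litInDst;      /* literals currently sitting inside dst? */
    } LitState;

    /* Called when the next sequence would read past the in-dst literal region:
     * flush what is left just ahead of op, then continue from extraBuf. */
    static unsigned char* switchToExtraBuffer(LitState* ls, unsigned char* op)
    {
        size_t const leftover = (size_t)(ls->litBufferEnd - ls->litPtr);
        memmove(op, ls->litPtr, leftover);  /* regions may overlap: memmove, not memcpy */
        op += leftover;
        ls->litPtr       = ls->extraBuf;
        ls->litBufferEnd = ls->extraBuf + ls->extraSize;
        ls->litInDst     = 0;
        return op;
    }

Keeping literals in `dst` avoids a large copy for blocks that are mostly literals; the price is the bookkeeping above, which is what the three `nbSeq > 0` phases of the function implement.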
+FORCE_INLINE_TEMPLATE size_t
+DONT_VECTORIZE
+ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
+    void* dst, size_t maxDstSize,
+    const void* seqStart, size_t seqSize, int nbSeq,
+    const ZSTD_longOffset_e isLongOffset,
+    const int frame)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer;
+    BYTE* op = ostart;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
+    const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
+    const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
+    DEBUGLOG(5, "ZSTD_decompressSequences_body");
+    (void)frame;
+
+    /* Regen sequences */
+    if (nbSeq) {
+        seqState_t seqState;
+        dctx->fseEntropy = 1;
+        { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+        RETURN_ERROR_IF(
+            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)),
+            corruption_detected, "");
+        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+        assert(dst != NULL);
+
+        ZSTD_STATIC_ASSERT(
+                BIT_DStream_unfinished < BIT_DStream_completed &&
+                BIT_DStream_endOfBuffer < BIT_DStream_completed &&
+                BIT_DStream_completed < BIT_DStream_overflow);
+
+#if defined(__GNUC__) && defined(__x86_64__)
+        __asm__(".p2align 6");
+        __asm__("nop");
+# if __GNUC__ >= 7
+        __asm__(".p2align 5");
+        __asm__("nop");
+        __asm__(".p2align 3");
+# else
+        __asm__(".p2align 4");
+        __asm__("nop");
+        __asm__(".p2align 3");
+# endif
 #endif
+
         for ( ; ; ) {
-            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_noPrefetch);
+            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
             size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
             assert(!ZSTD_isError(oneSeqSize));
             if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
 #endif
+            if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+                return oneSeqSize;
             DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+            op += oneSeqSize;
+            if (UNLIKELY(!--nbSeq))
+                break;
             BIT_reloadDStream(&(seqState.DStream));
-            /* gcc and clang both don't like early returns in this loop.
-             * gcc doesn't like early breaks either.
-             * Instead save an error and report it at the end.
-             * When there is an error, don't increment op, so we don't
-             * overwrite.
-             */
-            if (UNLIKELY(ZSTD_isError(oneSeqSize))) error = oneSeqSize;
-            else op += oneSeqSize;
-            if (UNLIKELY(!--nbSeq)) break;
         }
 
         /* check if reached exact end */
         DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
-        if (ZSTD_isError(error)) return error;
         RETURN_ERROR_IF(nbSeq, corruption_detected, "");
         RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
         /* save reps for next block */
@@ -1104,7 +1615,7 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
     {   size_t const lastLLSize = litEnd - litPtr;
         RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
         if (op != NULL) {
-            memcpy(op, litPtr, lastLLSize);
+            ZSTD_memcpy(op, litPtr, lastLLSize);
             op += lastLLSize;
         }
     }
@@ -1121,9 +1632,37 @@ ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
 {
     return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
 }
+
+static size_t
+ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx,
+                                  void* dst, size_t maxDstSize,
+                            const void* seqStart, size_t seqSize, int nbSeq,
+                                  const ZSTD_longOffset_e isLongOffset,
+                                  const int frame)
+{
+    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

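Both decompression bodies now open their hot loops with a ladder of `.p2align`/`nop` statements. The trick generalizes: on x86-64 GCC/Clang, a bare `__asm__` statement drops an alignment directive straight into the instruction stream, so the loop head lands on a chosen boundary and is more likely to stay resident in the DSB (the decoded-uop cache) discussed in the long comment above. A minimal, compiler-specific illustration; the right padding differs per compiler version, as the `__GNUC__` ladders show, so measure before copying:

    #include <stddef.h>

    size_t byteSum(const unsigned char* p, size_t n)
    {
        size_t total = 0;
        size_t i;
    #if defined(__GNUC__) && defined(__x86_64__)
        __asm__(".p2align 5");   /* align the loop head to 2^5 = 32 bytes */
        __asm__("nop");          /* nudge the entry point within the cache line */
    #endif
        for (i = 0; i < n; i++)
            total += p[i];
        return total;
    }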
 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
+                   const BYTE* const prefixStart, const BYTE* const dictEnd)
+{
+    prefetchPos += sequence.litLength;
+    {   const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;
+        const BYTE* const match = matchBase + prefetchPos - sequence.offset;   /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
+                                                                                * No consequence though : memory address is only used for prefetching, not for dereferencing */
+        PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE);   /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+    }
+    return prefetchPos + sequence.matchLength;
+}
+
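`ZSTD_prefetchMatch` replaces the old `seq.match`/`ZSTD_p_prefetch` plumbing: rather than storing a match pointer in each queued sequence, the decoder tracks a running output position and hints at the cache lines a future copy will read. On GCC/Clang, `PREFETCH_L1` reduces to `__builtin_prefetch`, which is a pure hint and cannot fault, so it is safe to issue before the offset has been validated. A standalone sketch of the same idea, with a hypothetical `prefetchSpan` helper:

    #include <stddef.h>

    #define CACHE_LINE 64

    /* Hint the hardware to pull in the lines a future copy will read.
     * The builtin's arguments 0 and 3 mean read access and high temporal
     * locality; the hint never faults, even on a wild address computed
     * from a corrupted offset. */
    static void prefetchSpan(const unsigned char* src, size_t len)
    {
    #if defined(__GNUC__)
        size_t i;
        for (i = 0; i < len; i += CACHE_LINE)
            __builtin_prefetch(src + i, 0, 3);
    #else
        (void)src; (void)len;   /* no-op on compilers without the builtin */
    #endif
    }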
+/* This decoding function employs prefetching
+ * to reduce latency impact of cache misses.
+ * It's generally employed when a block contains a significant portion of long-distance matches
+ * or when coupled with a "cold" dictionary */
 FORCE_INLINE_TEMPLATE size_t
 ZSTD_decompressSequencesLong_body(
                                ZSTD_DCtx* dctx,
@@ -1134,11 +1673,11 @@ ZSTD_decompressSequencesLong_body(
 {
     const BYTE* ip = (const BYTE*)seqStart;
     const BYTE* const iend = ip + seqSize;
-    BYTE* const ostart = (BYTE* const)dst;
-    BYTE* const oend = ostart + maxDstSize;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize;
     BYTE* op = ostart;
     const BYTE* litPtr = dctx->litPtr;
-    const BYTE* const litEnd = litPtr + dctx->litSize;
+    const BYTE* litBufferEnd = dctx->litBufferEnd;
     const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
     const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
     const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
@@ -1146,18 +1685,17 @@ ZSTD_decompressSequencesLong_body(
 
     /* Regen sequences */
     if (nbSeq) {
-#define STORED_SEQS 4
+#define STORED_SEQS 8
 #define STORED_SEQS_MASK (STORED_SEQS-1)
-#define ADVANCED_SEQS 4
+#define ADVANCED_SEQS STORED_SEQS
         seq_t sequences[STORED_SEQS];
         int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
         seqState_t seqState;
         int seqNb;
+        size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */
+
         dctx->fseEntropy = 1;
         { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
-        seqState.prefixStart = prefixStart;
-        seqState.pos = (size_t)(op-prefixStart);
-        seqState.dictEnd = dictEnd;
         assert(dst != NULL);
         assert(iend >= ip);
         RETURN_ERROR_IF(
@@ -1169,36 +1707,100 @@ ZSTD_decompressSequencesLong_body(
 
         /* prepare in advance */
         for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
-            sequences[seqNb] = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
-            PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+            prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+            sequences[seqNb] = sequence;
         }
         RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");
 
-        /* decode and decompress */
-        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
-            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
-            size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+        /* decompress without stomping litBuffer */
+        for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) {
+            seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+            size_t oneSeqSize;
+
+            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd)
+            {
+                /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */
+                const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+                if (leftoverLit)
+                {
+                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+                    sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit;
+                    op += leftoverLit;
+                }
+                litPtr = dctx->litExtraBuffer;
+                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+                dctx->litBufferLocation = ZSTD_not_in_dst;
+                oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
-            assert(!ZSTD_isError(oneSeqSize));
-            if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+                assert(!ZSTD_isError(oneSeqSize));
+                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
 #endif
-            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
-            PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
-            sequences[seqNb & STORED_SEQS_MASK] = sequence;
-            op += oneSeqSize;
+                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+
+                prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+                sequences[seqNb & STORED_SEQS_MASK] = sequence;
+                op += oneSeqSize;
+            }
+            else
+            {
+                /* lit buffer is either wholly contained in first or second split, or not split at all */
+                oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
+                    ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
+                    ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                assert(!ZSTD_isError(oneSeqSize));
+                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+#endif
+                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+
+                prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+                sequences[seqNb & STORED_SEQS_MASK] = sequence;
+                op += oneSeqSize;
+            }
         }
         RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");
 
         /* finish queue */
         seqNb -= seqAdvance;
         for ( ; seqNb<nbSeq ; seqNb++) {
-            size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+            seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]);
+            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd)
+            {
+                const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+                if (leftoverLit)
+                {
+                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+                    sequence->litLength -= leftoverLit;
+                    op += leftoverLit;
+                }
+                litPtr = dctx->litExtraBuffer;
+                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+                dctx->litBufferLocation = ZSTD_not_in_dst;
+                {
+                    size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
-            assert(!ZSTD_isError(oneSeqSize));
-            if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+                    assert(!ZSTD_isError(oneSeqSize));
+                    if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
 #endif
-            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
-            op += oneSeqSize;
+                    if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+                    op += oneSeqSize;
+                }
+            }
+            else
+            {
+                size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
+                    ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
+                    ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                assert(!ZSTD_isError(oneSeqSize));
+                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+#endif
+                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+                op += oneSeqSize;
+            }
         }
 
         /* save reps for next block */
@@ -1206,10 +1808,21 @@ ZSTD_decompressSequencesLong_body(
     }
 
     /* last literal segment */
-    { size_t const lastLLSize = litEnd - litPtr;
+    if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */
+    {
+        size_t const lastLLSize = litBufferEnd - litPtr;
+        RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
+        if (op != NULL) {
+            ZSTD_memmove(op, litPtr, lastLLSize);
+            op += lastLLSize;
+        }
+        litPtr = dctx->litExtraBuffer;
+        litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+    }
+    {   size_t const lastLLSize = litBufferEnd - litPtr;
         RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
         if (op != NULL) {
-            memcpy(op, litPtr, lastLLSize);
+            ZSTD_memmove(op, litPtr, lastLLSize);
            op += lastLLSize;
        }
    }
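
With `STORED_SEQS` doubled from 4 to 8, the long-offset path now decodes eight sequences ahead of execution, giving each prefetch issued by `ZSTD_prefetchMatch` more time to land before the corresponding copy runs. Because the queue is indexed with `& STORED_SEQS_MASK`, the capacity must remain a power of two; the masking idiom in isolation, with hypothetical names:

    #define QUEUE_SIZE 8                    /* must be a power of two */
    #define QUEUE_MASK (QUEUE_SIZE - 1)

    typedef struct { unsigned litLength, matchLength, offset; } Seq;
    static Seq queue[QUEUE_SIZE];

    /* With a power-of-two capacity, wrap-around indexing is a single AND,
     * no modulo. Valid whenever lag <= n, as in the loops above where the
     * read index trails the write index by ADVANCED_SEQS. */
    static void push(int n, Seq s)         { queue[n & QUEUE_MASK] = s; }
    static Seq  readBehind(int n, int lag) { return queue[(n - lag) & QUEUE_MASK]; }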
@@ -1233,7 +1846,7 @@ ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
 #if DYNAMIC_BMI2
 
 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
-static TARGET_ATTRIBUTE("bmi2") size_t
+static BMI2_TARGET_ATTRIBUTE size_t
 DONT_VECTORIZE
 ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
                                  void* dst, size_t maxDstSize,
@@ -1243,10 +1856,20 @@ ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
 {
     return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
 }
+static BMI2_TARGET_ATTRIBUTE size_t
+DONT_VECTORIZE
+ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx,
+                                 void* dst, size_t maxDstSize,
+                           const void* seqStart, size_t seqSize, int nbSeq,
+                                 const ZSTD_longOffset_e isLongOffset,
+                                 const int frame)
+{
+    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
 
 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
-static TARGET_ATTRIBUTE("bmi2") size_t
+static BMI2_TARGET_ATTRIBUTE size_t
 ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
                                  void* dst, size_t maxDstSize,
                            const void* seqStart, size_t seqSize, int nbSeq,
@@ -1275,11 +1898,25 @@ ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
 {
     DEBUGLOG(5, "ZSTD_decompressSequences");
 #if DYNAMIC_BMI2
-    if (dctx->bmi2) {
+    if (ZSTD_DCtx_get_bmi2(dctx)) {
         return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
     }
 #endif
-  return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+    return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
+static size_t
+ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+                               const void* seqStart, size_t seqSize, int nbSeq,
+                               const ZSTD_longOffset_e isLongOffset,
+                               const int frame)
+{
+    DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
+#if DYNAMIC_BMI2
+    if (ZSTD_DCtx_get_bmi2(dctx)) {
+        return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+    }
+#endif
+    return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
 }
 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
 
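`BMI2_TARGET_ATTRIBUTE`, which replaces the literal `TARGET_ATTRIBUTE("bmi2")`, lets one translation unit carry both a baseline and a BMI2-targeted build of the same body, with `ZSTD_DCtx_get_bmi2()` selecting at runtime. The general pattern, sketched with GCC/Clang's `target` attribute and `__builtin_cpu_supports`; note that zstd detects the CPU once and caches the flag in the context rather than probing on every call:

    #include <stddef.h>

    static size_t kernel_default(const unsigned char* src, size_t n)
    {   size_t acc = 0;
        size_t i;
        for (i = 0; i < n; i++) acc += src[i];
        return acc;
    }

    #if defined(__GNUC__) && defined(__x86_64__)
    __attribute__((target("bmi2")))          /* compile this copy with BMI2 enabled */
    static size_t kernel_bmi2(const unsigned char* src, size_t n)
    {   size_t acc = 0;                      /* same logic; the compiler may now emit
                                              * BMI2 instructions (SHLX, PDEP, ...) */
        size_t i;
        for (i = 0; i < n; i++) acc += src[i];
        return acc;
    }
    #endif

    size_t kernel(const unsigned char* src, size_t n)
    {
    #if defined(__GNUC__) && defined(__x86_64__)
        if (__builtin_cpu_supports("bmi2"))
            return kernel_bmi2(src, n);
    #endif
        return kernel_default(src, n);
    }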
@@ -1299,7 +1936,7 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
 {
     DEBUGLOG(5, "ZSTD_decompressSequencesLong");
 #if DYNAMIC_BMI2
-    if (dctx->bmi2) {
+    if (ZSTD_DCtx_get_bmi2(dctx)) {
         return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
     }
 #endif
@@ -1340,7 +1977,7 @@ ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
 size_t
 ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
                               void* dst, size_t dstCapacity,
-                              const void* src, size_t srcSize, const int frame)
+                              const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
 {   /* blockType == blockCompressed */
     const BYTE* ip = (const BYTE*)src;
     /* isLongOffset must be true if there are long offsets.
@@ -1355,7 +1992,7 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
     RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
 
     /* Decode literals section */
-    { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
+    { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
         DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
         if (ZSTD_isError(litCSize)) return litCSize;
         ip += litCSize;
@@ -1403,15 +2040,18 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
 
 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
     /* else */
-    return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+    if (dctx->litBufferLocation == ZSTD_split)
+        return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+    else
+        return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
 #endif
     }
 }
 
 
-void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
+void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
 {
-    if (dst != dctx->previousDstEnd) {   /* not contiguous */
+    if (dst != dctx->previousDstEnd && dstSize > 0) {   /* not contiguous */
         dctx->dictEnd = dctx->previousDstEnd;
         dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
         dctx->prefixStart = dst;
@@ -1425,8 +2065,8 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
                             const void* src, size_t srcSize)
 {
     size_t dSize;
-    ZSTD_checkContinuity(dctx, dst);
-    dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0);
+    ZSTD_checkContinuity(dctx, dst, dstCapacity);
+    dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming);
     dctx->previousDstEnd = (char*)dst + dSize;
     return dSize;
 }
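
`ZSTD_decompressBlock` belongs to zstd's experimental block-level API (exposed under `ZSTD_STATIC_LINKING_ONLY`), where the caller frames blocks itself and `previousDstEnd`/`ZSTD_checkContinuity` is what lets consecutive calls back-reference earlier output; the new `dstSize` parameter simply skips the window bookkeeping for empty outputs. A hedged usage sketch, assuming the caller already has per-block pointers and sizes from its own framing (error handling abbreviated):

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Decode caller-framed compressed blocks into one contiguous buffer,
     * so later blocks may back-reference earlier output. */
    size_t decodeBlocks(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity,
                        const void** blockPtrs, const size_t* blockSizes, int nbBlocks)
    {
        char* op = (char*)dst;
        int   i;
        ZSTD_decompressBegin(dctx);                   /* fresh decoder state */
        for (i = 0; i < nbBlocks; i++) {
            size_t const dSize = ZSTD_decompressBlock(dctx, op,
                                     dstCapacity - (size_t)(op - (char*)dst),
                                     blockPtrs[i], blockSizes[i]);
            if (ZSTD_isError(dSize)) return dSize;    /* corruption, dst too small, ... */
            op += dSize;                              /* stay contiguous across blocks */
        }
        return (size_t)(op - (char*)dst);
    }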