zstd-ruby 1.5.0.0 → 1.5.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +2 -2
  3. data/README.md +1 -1
  4. data/ext/zstdruby/extconf.rb +1 -0
  5. data/ext/zstdruby/libzstd/Makefile +50 -175
  6. data/ext/zstdruby/libzstd/README.md +7 -1
  7. data/ext/zstdruby/libzstd/common/bitstream.h +24 -9
  8. data/ext/zstdruby/libzstd/common/compiler.h +89 -43
  9. data/ext/zstdruby/libzstd/common/entropy_common.c +11 -5
  10. data/ext/zstdruby/libzstd/common/error_private.h +79 -0
  11. data/ext/zstdruby/libzstd/common/fse.h +2 -1
  12. data/ext/zstdruby/libzstd/common/fse_decompress.c +1 -1
  13. data/ext/zstdruby/libzstd/common/huf.h +24 -22
  14. data/ext/zstdruby/libzstd/common/mem.h +18 -0
  15. data/ext/zstdruby/libzstd/common/portability_macros.h +131 -0
  16. data/ext/zstdruby/libzstd/common/xxhash.c +5 -805
  17. data/ext/zstdruby/libzstd/common/xxhash.h +5568 -167
  18. data/ext/zstdruby/libzstd/common/zstd_internal.h +92 -88
  19. data/ext/zstdruby/libzstd/common/zstd_trace.h +12 -3
  20. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  21. data/ext/zstdruby/libzstd/compress/fse_compress.c +63 -27
  22. data/ext/zstdruby/libzstd/compress/huf_compress.c +537 -104
  23. data/ext/zstdruby/libzstd/compress/zstd_compress.c +194 -278
  24. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +102 -44
  25. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +4 -3
  26. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +3 -1
  27. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +5 -4
  28. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +3 -2
  29. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +3 -3
  30. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +289 -114
  31. data/ext/zstdruby/libzstd/compress/zstd_fast.c +302 -123
  32. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +418 -502
  33. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +4 -4
  34. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +1 -1
  35. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +4 -1
  36. data/ext/zstdruby/libzstd/compress/zstd_opt.c +186 -108
  37. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +59 -29
  38. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +727 -189
  39. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +571 -0
  40. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +85 -22
  41. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +744 -220
  42. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +8 -2
  43. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +34 -3
  44. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +23 -3
  45. data/ext/zstdruby/libzstd/dictBuilder/cover.c +9 -2
  46. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +11 -4
  47. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +99 -28
  48. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +2 -6
  49. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +3 -7
  50. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +3 -7
  51. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +3 -7
  52. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +3 -7
  53. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +3 -7
  54. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +3 -7
  55. data/ext/zstdruby/libzstd/libzstd.mk +185 -0
  56. data/ext/zstdruby/libzstd/libzstd.pc.in +1 -0
  57. data/ext/zstdruby/libzstd/modulemap/module.modulemap +4 -0
  58. data/ext/zstdruby/libzstd/zdict.h +4 -4
  59. data/ext/zstdruby/libzstd/zstd.h +179 -136
  60. data/ext/zstdruby/zstdruby.c +2 -2
  61. data/lib/zstd-ruby/version.rb +1 -1
  62. metadata +8 -3
@@ -43,145 +43,294 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
43
43
  }
44
44
 
45
45
 
46
+ /**
47
+ * If you squint hard enough (and ignore repcodes), the search operation at any
48
+ * given position is broken into 4 stages:
49
+ *
50
+ * 1. Hash (map position to hash value via input read)
51
+ * 2. Lookup (map hash val to index via hashtable read)
52
+ * 3. Load (map index to value at that position via input read)
53
+ * 4. Compare
54
+ *
55
+ * Each of these steps involves a memory read at an address which is computed
56
+ * from the previous step. This means these steps must be sequenced and their
57
+ * latencies are cumulative.
58
+ *
59
+ * Rather than do 1->2->3->4 sequentially for a single position before moving
60
+ * onto the next, this implementation interleaves these operations across the
61
+ * next few positions:
62
+ *
63
+ * R = Repcode Read & Compare
64
+ * H = Hash
65
+ * T = Table Lookup
66
+ * M = Match Read & Compare
67
+ *
68
+ * Pos | Time -->
69
+ * ----+-------------------
70
+ * N | ... M
71
+ * N+1 | ... TM
72
+ * N+2 | R H T M
73
+ * N+3 | H TM
74
+ * N+4 | R H T M
75
+ * N+5 | H ...
76
+ * N+6 | R ...
77
+ *
78
+ * This is very much analogous to the pipelining of execution in a CPU. And just
79
+ * like a CPU, we have to dump the pipeline when we find a match (i.e., take a
80
+ * branch).
81
+ *
82
+ * When this happens, we throw away our current state, and do the following prep
83
+ * to re-enter the loop:
84
+ *
85
+ * Pos | Time -->
86
+ * ----+-------------------
87
+ * N | H T
88
+ * N+1 | H
89
+ *
90
+ * This is also the work we do at the beginning to enter the loop initially.
91
+ */
46
92
  FORCE_INLINE_TEMPLATE size_t
47
- ZSTD_compressBlock_fast_generic(
93
+ ZSTD_compressBlock_fast_noDict_generic(
48
94
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
49
95
  void const* src, size_t srcSize,
50
- U32 const mls)
96
+ U32 const mls, U32 const hasStep)
51
97
  {
52
98
  const ZSTD_compressionParameters* const cParams = &ms->cParams;
53
99
  U32* const hashTable = ms->hashTable;
54
100
  U32 const hlog = cParams->hashLog;
55
101
  /* support stepSize of 0 */
56
- size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
102
+ size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + 1) : 2;
57
103
  const BYTE* const base = ms->window.base;
58
104
  const BYTE* const istart = (const BYTE*)src;
59
- /* We check ip0 (ip + 0) and ip1 (ip + 1) each loop */
60
- const BYTE* ip0 = istart;
61
- const BYTE* ip1;
62
- const BYTE* anchor = istart;
63
105
  const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
64
106
  const U32 prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
65
107
  const BYTE* const prefixStart = base + prefixStartIndex;
66
108
  const BYTE* const iend = istart + srcSize;
67
109
  const BYTE* const ilimit = iend - HASH_READ_SIZE;
68
- U32 offset_1=rep[0], offset_2=rep[1];
110
+
111
+ const BYTE* anchor = istart;
112
+ const BYTE* ip0 = istart;
113
+ const BYTE* ip1;
114
+ const BYTE* ip2;
115
+ const BYTE* ip3;
116
+ U32 current0;
117
+
118
+ U32 rep_offset1 = rep[0];
119
+ U32 rep_offset2 = rep[1];
69
120
  U32 offsetSaved = 0;
70
121
 
71
- /* init */
122
+ size_t hash0; /* hash for ip0 */
123
+ size_t hash1; /* hash for ip1 */
124
+ U32 idx; /* match idx for ip0 */
125
+ U32 mval; /* src value at match idx */
126
+
127
+ U32 offcode;
128
+ const BYTE* match0;
129
+ size_t mLength;
130
+
131
+ /* ip0 and ip1 are always adjacent. The targetLength skipping and
132
+ * uncompressibility acceleration is applied to every other position,
133
+ * matching the behavior of #1562. step therefore represents the gap
134
+ * between pairs of positions, from ip0 to ip2 or ip1 to ip3. */
135
+ size_t step;
136
+ const BYTE* nextStep;
137
+ const size_t kStepIncr = (1 << (kSearchStrength - 1));
138
+
72
139
  DEBUGLOG(5, "ZSTD_compressBlock_fast_generic");
73
140
  ip0 += (ip0 == prefixStart);
74
- ip1 = ip0 + 1;
75
141
  { U32 const curr = (U32)(ip0 - base);
76
142
  U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
77
143
  U32 const maxRep = curr - windowLow;
78
- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
79
- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
144
+ if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0;
145
+ if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0;
80
146
  }
81
147
 
82
- /* Main Search Loop */
83
- #ifdef __INTEL_COMPILER
84
- /* From intel 'The vector pragma indicates that the loop should be
85
- * vectorized if it is legal to do so'. Can be used together with
86
- * #pragma ivdep (but have opted to exclude that because intel
87
- * warns against using it).*/
88
- #pragma vector always
89
- #endif
90
- while (ip1 < ilimit) { /* < instead of <=, because check at ip0+2 */
91
- size_t mLength;
92
- BYTE const* ip2 = ip0 + 2;
93
- size_t const h0 = ZSTD_hashPtr(ip0, hlog, mls);
94
- U32 const val0 = MEM_read32(ip0);
95
- size_t const h1 = ZSTD_hashPtr(ip1, hlog, mls);
96
- U32 const val1 = MEM_read32(ip1);
97
- U32 const current0 = (U32)(ip0-base);
98
- U32 const current1 = (U32)(ip1-base);
99
- U32 const matchIndex0 = hashTable[h0];
100
- U32 const matchIndex1 = hashTable[h1];
101
- BYTE const* repMatch = ip2 - offset_1;
102
- const BYTE* match0 = base + matchIndex0;
103
- const BYTE* match1 = base + matchIndex1;
104
- U32 offcode;
105
-
106
- #if defined(__aarch64__)
107
- PREFETCH_L1(ip0+256);
108
- #endif
109
-
110
- hashTable[h0] = current0; /* update hash table */
111
- hashTable[h1] = current1; /* update hash table */
112
-
113
- assert(ip0 + 1 == ip1);
114
-
115
- if ((offset_1 > 0) & (MEM_read32(repMatch) == MEM_read32(ip2))) {
116
- mLength = (ip2[-1] == repMatch[-1]) ? 1 : 0;
117
- ip0 = ip2 - mLength;
118
- match0 = repMatch - mLength;
119
- mLength += 4;
148
+ /* start each op */
149
+ _start: /* Requires: ip0 */
150
+
151
+ step = stepSize;
152
+ nextStep = ip0 + kStepIncr;
153
+
154
+ /* calculate positions, ip0 - anchor == 0, so we skip step calc */
155
+ ip1 = ip0 + 1;
156
+ ip2 = ip0 + step;
157
+ ip3 = ip2 + 1;
158
+
159
+ if (ip3 >= ilimit) {
160
+ goto _cleanup;
161
+ }
162
+
163
+ hash0 = ZSTD_hashPtr(ip0, hlog, mls);
164
+ hash1 = ZSTD_hashPtr(ip1, hlog, mls);
165
+
166
+ idx = hashTable[hash0];
167
+
168
+ do {
169
+ /* load repcode match for ip[2]*/
170
+ const U32 rval = MEM_read32(ip2 - rep_offset1);
171
+
172
+ /* write back hash table entry */
173
+ current0 = (U32)(ip0 - base);
174
+ hashTable[hash0] = current0;
175
+
176
+ /* check repcode at ip[2] */
177
+ if ((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) {
178
+ ip0 = ip2;
179
+ match0 = ip0 - rep_offset1;
180
+ mLength = ip0[-1] == match0[-1];
181
+ ip0 -= mLength;
182
+ match0 -= mLength;
120
183
  offcode = 0;
184
+ mLength += 4;
121
185
  goto _match;
122
186
  }
123
- if ((matchIndex0 > prefixStartIndex) && MEM_read32(match0) == val0) {
124
- /* found a regular match */
125
- goto _offset;
187
+
188
+ /* load match for ip[0] */
189
+ if (idx >= prefixStartIndex) {
190
+ mval = MEM_read32(base + idx);
191
+ } else {
192
+ mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
126
193
  }
127
- if ((matchIndex1 > prefixStartIndex) && MEM_read32(match1) == val1) {
128
- /* found a regular match after one literal */
129
- ip0 = ip1;
130
- match0 = match1;
194
+
195
+ /* check match at ip[0] */
196
+ if (MEM_read32(ip0) == mval) {
197
+ /* found a match! */
131
198
  goto _offset;
132
199
  }
133
- { size_t const step = ((size_t)(ip0-anchor) >> (kSearchStrength - 1)) + stepSize;
134
- assert(step >= 2);
135
- ip0 += step;
136
- ip1 += step;
137
- continue;
200
+
201
+ /* lookup ip[1] */
202
+ idx = hashTable[hash1];
203
+
204
+ /* hash ip[2] */
205
+ hash0 = hash1;
206
+ hash1 = ZSTD_hashPtr(ip2, hlog, mls);
207
+
208
+ /* advance to next positions */
209
+ ip0 = ip1;
210
+ ip1 = ip2;
211
+ ip2 = ip3;
212
+
213
+ /* write back hash table entry */
214
+ current0 = (U32)(ip0 - base);
215
+ hashTable[hash0] = current0;
216
+
217
+ /* load match for ip[0] */
218
+ if (idx >= prefixStartIndex) {
219
+ mval = MEM_read32(base + idx);
220
+ } else {
221
+ mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
138
222
  }
139
- _offset: /* Requires: ip0, match0 */
140
- /* Compute the offset code */
141
- offset_2 = offset_1;
142
- offset_1 = (U32)(ip0-match0);
143
- offcode = offset_1 + ZSTD_REP_MOVE;
144
- mLength = 4;
145
- /* Count the backwards match length */
146
- while (((ip0>anchor) & (match0>prefixStart))
147
- && (ip0[-1] == match0[-1])) { ip0--; match0--; mLength++; } /* catch up */
148
223
 
149
- _match: /* Requires: ip0, match0, offcode */
150
- /* Count the forward length */
151
- mLength += ZSTD_count(ip0+mLength, match0+mLength, iend);
152
- ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, offcode, mLength-MINMATCH);
153
- /* match found */
154
- ip0 += mLength;
155
- anchor = ip0;
224
+ /* check match at ip[0] */
225
+ if (MEM_read32(ip0) == mval) {
226
+ /* found a match! */
227
+ goto _offset;
228
+ }
156
229
 
157
- if (ip0 <= ilimit) {
158
- /* Fill Table */
159
- assert(base+current0+2 > istart); /* check base overflow */
160
- hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */
161
- hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
162
-
163
- if (offset_2 > 0) { /* offset_2==0 means offset_2 is invalidated */
164
- while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - offset_2)) ) {
165
- /* store sequence */
166
- size_t const rLength = ZSTD_count(ip0+4, ip0+4-offset_2, iend) + 4;
167
- { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */
168
- hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
169
- ip0 += rLength;
170
- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, 0 /*offCode*/, rLength-MINMATCH);
171
- anchor = ip0;
172
- continue; /* faster when present (confirmed on gcc-8) ... (?) */
173
- } } }
174
- ip1 = ip0 + 1;
175
- }
230
+ /* lookup ip[1] */
231
+ idx = hashTable[hash1];
232
+
233
+ /* hash ip[2] */
234
+ hash0 = hash1;
235
+ hash1 = ZSTD_hashPtr(ip2, hlog, mls);
236
+
237
+ /* advance to next positions */
238
+ ip0 = ip1;
239
+ ip1 = ip2;
240
+ ip2 = ip0 + step;
241
+ ip3 = ip1 + step;
242
+
243
+ /* calculate step */
244
+ if (ip2 >= nextStep) {
245
+ step++;
246
+ PREFETCH_L1(ip1 + 64);
247
+ PREFETCH_L1(ip1 + 128);
248
+ nextStep += kStepIncr;
249
+ }
250
+ } while (ip3 < ilimit);
251
+
252
+ _cleanup:
253
+ /* Note that there are probably still a couple positions we could search.
254
+ * However, it seems to be a meaningful performance hit to try to search
255
+ * them. So let's not. */
176
256
 
177
257
  /* save reps for next block */
178
- rep[0] = offset_1 ? offset_1 : offsetSaved;
179
- rep[1] = offset_2 ? offset_2 : offsetSaved;
258
+ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved;
259
+ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved;
180
260
 
181
261
  /* Return the last literals size */
182
262
  return (size_t)(iend - anchor);
263
+
264
+ _offset: /* Requires: ip0, idx */
265
+
266
+ /* Compute the offset code. */
267
+ match0 = base + idx;
268
+ rep_offset2 = rep_offset1;
269
+ rep_offset1 = (U32)(ip0-match0);
270
+ offcode = rep_offset1 + ZSTD_REP_MOVE;
271
+ mLength = 4;
272
+
273
+ /* Count the backwards match length. */
274
+ while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] == match0[-1])) {
275
+ ip0--;
276
+ match0--;
277
+ mLength++;
278
+ }
279
+
280
+ _match: /* Requires: ip0, match0, offcode */
281
+
282
+ /* Count the forward length. */
283
+ mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend);
284
+
285
+ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength - MINMATCH);
286
+
287
+ ip0 += mLength;
288
+ anchor = ip0;
289
+
290
+ /* write next hash table entry */
291
+ if (ip1 < ip0) {
292
+ hashTable[hash1] = (U32)(ip1 - base);
293
+ }
294
+
295
+ /* Fill table and check for immediate repcode. */
296
+ if (ip0 <= ilimit) {
297
+ /* Fill Table */
298
+ assert(base+current0+2 > istart); /* check base overflow */
299
+ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */
300
+ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
301
+
302
+ if (rep_offset2 > 0) { /* rep_offset2==0 means rep_offset2 is invalidated */
303
+ while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) {
304
+ /* store sequence */
305
+ size_t const rLength = ZSTD_count(ip0+4, ip0+4-rep_offset2, iend) + 4;
306
+ { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
307
+ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
308
+ ip0 += rLength;
309
+ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, 0 /*offCode*/, rLength-MINMATCH);
310
+ anchor = ip0;
311
+ continue; /* faster when present (confirmed on gcc-8) ... (?) */
312
+ } } }
313
+
314
+ goto _start;
183
315
  }
184
316
 
317
+ #define ZSTD_GEN_FAST_FN(dictMode, mls, step) \
318
+ static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step( \
319
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \
320
+ void const* src, size_t srcSize) \
321
+ { \
322
+ return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \
323
+ }
324
+
325
+ ZSTD_GEN_FAST_FN(noDict, 4, 1)
326
+ ZSTD_GEN_FAST_FN(noDict, 5, 1)
327
+ ZSTD_GEN_FAST_FN(noDict, 6, 1)
328
+ ZSTD_GEN_FAST_FN(noDict, 7, 1)
329
+
330
+ ZSTD_GEN_FAST_FN(noDict, 4, 0)
331
+ ZSTD_GEN_FAST_FN(noDict, 5, 0)
332
+ ZSTD_GEN_FAST_FN(noDict, 6, 0)
333
+ ZSTD_GEN_FAST_FN(noDict, 7, 0)
185
334
 
186
335
  size_t ZSTD_compressBlock_fast(
187
336
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
@@ -189,24 +338,40 @@ size_t ZSTD_compressBlock_fast(
189
338
  {
190
339
  U32 const mls = ms->cParams.minMatch;
191
340
  assert(ms->dictMatchState == NULL);
192
- switch(mls)
193
- {
194
- default: /* includes case 3 */
195
- case 4 :
196
- return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4);
197
- case 5 :
198
- return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5);
199
- case 6 :
200
- return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6);
201
- case 7 :
202
- return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7);
341
+ if (ms->cParams.targetLength > 1) {
342
+ switch(mls)
343
+ {
344
+ default: /* includes case 3 */
345
+ case 4 :
346
+ return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize);
347
+ case 5 :
348
+ return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize);
349
+ case 6 :
350
+ return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize);
351
+ case 7 :
352
+ return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize);
353
+ }
354
+ } else {
355
+ switch(mls)
356
+ {
357
+ default: /* includes case 3 */
358
+ case 4 :
359
+ return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize);
360
+ case 5 :
361
+ return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize);
362
+ case 6 :
363
+ return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize);
364
+ case 7 :
365
+ return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize);
366
+ }
367
+
203
368
  }
204
369
  }
205
370
 
206
371
  FORCE_INLINE_TEMPLATE
207
372
  size_t ZSTD_compressBlock_fast_dictMatchState_generic(
208
373
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
209
- void const* src, size_t srcSize, U32 const mls)
374
+ void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
210
375
  {
211
376
  const ZSTD_compressionParameters* const cParams = &ms->cParams;
212
377
  U32* const hashTable = ms->hashTable;
@@ -242,6 +407,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
242
407
  assert(endIndex - prefixStartIndex <= maxDistance);
243
408
  (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */
244
409
 
410
+ (void)hasStep; /* not currently specialized on whether it's accelerated */
411
+
245
412
  /* ensure there will be no underflow
246
413
  * when translating a dict index into a local index */
247
414
  assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
@@ -351,6 +518,12 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
351
518
  return (size_t)(iend - anchor);
352
519
  }
353
520
 
521
+
522
+ ZSTD_GEN_FAST_FN(dictMatchState, 4, 0)
523
+ ZSTD_GEN_FAST_FN(dictMatchState, 5, 0)
524
+ ZSTD_GEN_FAST_FN(dictMatchState, 6, 0)
525
+ ZSTD_GEN_FAST_FN(dictMatchState, 7, 0)
526
+
354
527
  size_t ZSTD_compressBlock_fast_dictMatchState(
355
528
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
356
529
  void const* src, size_t srcSize)
@@ -361,20 +534,20 @@ size_t ZSTD_compressBlock_fast_dictMatchState(
361
534
  {
362
535
  default: /* includes case 3 */
363
536
  case 4 :
364
- return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 4);
537
+ return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize);
365
538
  case 5 :
366
- return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 5);
539
+ return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize);
367
540
  case 6 :
368
- return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 6);
541
+ return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize);
369
542
  case 7 :
370
- return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 7);
543
+ return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize);
371
544
  }
372
545
  }
373
546
 
374
547
 
375
548
  static size_t ZSTD_compressBlock_fast_extDict_generic(
376
549
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
377
- void const* src, size_t srcSize, U32 const mls)
550
+ void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
378
551
  {
379
552
  const ZSTD_compressionParameters* const cParams = &ms->cParams;
380
553
  U32* const hashTable = ms->hashTable;
@@ -398,11 +571,13 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
398
571
  const BYTE* const ilimit = iend - 8;
399
572
  U32 offset_1=rep[0], offset_2=rep[1];
400
573
 
574
+ (void)hasStep; /* not currently specialized on whether it's accelerated */
575
+
401
576
  DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1);
402
577
 
403
578
  /* switch to "regular" variant if extDict is invalidated due to maxDistance */
404
579
  if (prefixStartIndex == dictStartIndex)
405
- return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, mls);
580
+ return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);
406
581
 
407
582
  /* Search Loop */
408
583
  while (ip < ilimit) { /* < instead of <=, because (ip+1) */
@@ -418,7 +593,7 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
418
593
  DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr);
419
594
 
420
595
  if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */
421
- & (offset_1 < curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */
596
+ & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */
422
597
  && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
423
598
  const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
424
599
  size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4;
@@ -453,7 +628,7 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
453
628
  U32 const current2 = (U32)(ip-base);
454
629
  U32 const repIndex2 = current2 - offset_2;
455
630
  const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
456
- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 < curr - dictStartIndex)) /* intentional overflow */
631
+ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */
457
632
  && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
458
633
  const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
459
634
  size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
@@ -475,6 +650,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
475
650
  return (size_t)(iend - anchor);
476
651
  }
477
652
 
653
+ ZSTD_GEN_FAST_FN(extDict, 4, 0)
654
+ ZSTD_GEN_FAST_FN(extDict, 5, 0)
655
+ ZSTD_GEN_FAST_FN(extDict, 6, 0)
656
+ ZSTD_GEN_FAST_FN(extDict, 7, 0)
478
657
 
479
658
  size_t ZSTD_compressBlock_fast_extDict(
480
659
  ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
@@ -485,12 +664,12 @@ size_t ZSTD_compressBlock_fast_extDict(
485
664
  {
486
665
  default: /* includes case 3 */
487
666
  case 4 :
488
- return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 4);
667
+ return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize);
489
668
  case 5 :
490
- return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 5);
669
+ return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize);
491
670
  case 6 :
492
- return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 6);
671
+ return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize);
493
672
  case 7 :
494
- return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 7);
673
+ return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize);
495
674
  }
496
675
  }