zstd-ruby 1.4.9.0 → 1.5.0.0

Files changed (89)
  1. checksums.yaml +4 -4
  2. data/.github/dependabot.yml +8 -0
  3. data/README.md +1 -1
  4. data/ext/zstdruby/libzstd/BUCK +5 -7
  5. data/ext/zstdruby/libzstd/Makefile +42 -13
  6. data/ext/zstdruby/libzstd/README.md +8 -4
  7. data/ext/zstdruby/libzstd/common/bitstream.h +1 -1
  8. data/ext/zstdruby/libzstd/common/compiler.h +1 -1
  9. data/ext/zstdruby/libzstd/common/cpu.h +1 -1
  10. data/ext/zstdruby/libzstd/common/debug.c +1 -1
  11. data/ext/zstdruby/libzstd/common/debug.h +1 -1
  12. data/ext/zstdruby/libzstd/common/entropy_common.c +1 -1
  13. data/ext/zstdruby/libzstd/common/error_private.c +1 -1
  14. data/ext/zstdruby/libzstd/common/error_private.h +3 -3
  15. data/ext/zstdruby/libzstd/common/fse.h +2 -2
  16. data/ext/zstdruby/libzstd/common/fse_decompress.c +25 -15
  17. data/ext/zstdruby/libzstd/common/huf.h +3 -2
  18. data/ext/zstdruby/libzstd/common/mem.h +3 -5
  19. data/ext/zstdruby/libzstd/common/pool.c +1 -1
  20. data/ext/zstdruby/libzstd/common/pool.h +1 -1
  21. data/ext/zstdruby/libzstd/common/xxhash.c +2 -4
  22. data/ext/zstdruby/libzstd/common/xxhash.h +1 -1
  23. data/ext/zstdruby/libzstd/common/zstd_common.c +1 -1
  24. data/ext/zstdruby/libzstd/common/zstd_deps.h +1 -1
  25. data/ext/zstdruby/libzstd/common/zstd_internal.h +21 -9
  26. data/ext/zstdruby/libzstd/common/zstd_trace.h +7 -5
  27. data/ext/zstdruby/libzstd/compress/fse_compress.c +1 -1
  28. data/ext/zstdruby/libzstd/compress/hist.c +1 -1
  29. data/ext/zstdruby/libzstd/compress/hist.h +1 -1
  30. data/ext/zstdruby/libzstd/compress/huf_compress.c +51 -28
  31. data/ext/zstdruby/libzstd/compress/zstd_compress.c +1373 -275
  32. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +164 -21
  33. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +2 -2
  34. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +1 -1
  35. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +14 -6
  36. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +1 -1
  37. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +5 -282
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +1 -1
  39. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +147 -46
  40. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +3 -3
  41. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +1 -1
  42. data/ext/zstdruby/libzstd/compress/zstd_fast.c +4 -4
  43. data/ext/zstdruby/libzstd/compress/zstd_fast.h +1 -1
  44. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +914 -142
  45. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +39 -1
  46. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +51 -15
  47. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +2 -1
  48. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +1 -1
  49. data/ext/zstdruby/libzstd/compress/zstd_opt.c +1 -1
  50. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  51. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +15 -6
  52. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +5 -5
  53. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +44 -43
  54. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +1 -1
  55. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +1 -1
  56. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +3 -4
  57. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +44 -36
  58. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +1 -1
  59. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +1 -2
  60. data/ext/zstdruby/libzstd/deprecated/zbuff.h +1 -1
  61. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +1 -1
  62. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +1 -1
  63. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
  64. data/ext/zstdruby/libzstd/dictBuilder/cover.c +7 -6
  65. data/ext/zstdruby/libzstd/dictBuilder/cover.h +6 -5
  66. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +7 -6
  67. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +8 -7
  68. data/ext/zstdruby/libzstd/dll/example/Makefile +1 -1
  69. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +1 -1
  70. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +1 -1
  71. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +1 -1
  72. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +1 -1
  73. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +1 -1
  74. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +1 -1
  75. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +1 -1
  76. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +1 -1
  77. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +1 -1
  78. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +1 -1
  79. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +1 -1
  80. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +1 -1
  81. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +1 -1
  82. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +1 -1
  83. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +1 -1
  84. data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +148 -2
  85. data/ext/zstdruby/libzstd/zstd.h +165 -83
  86. data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +1 -1
  87. data/lib/zstd-ruby/version.rb +1 -1
  88. metadata +5 -5
  89. data/ext/zstdruby/libzstd/common/zstd_trace.c +0 -42
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2016-2021, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2016-2021, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -35,6 +35,10 @@ extern "C" {
  #define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128
  #endif

+
+ /* Set our tables and aligneds to align by 64 bytes */
+ #define ZSTD_CWKSP_ALIGNMENT_BYTES 64
+
  /*-*************************************
  * Structures
  ***************************************/
@@ -117,10 +121,11 @@ typedef enum {
  * - Tables: these are any of several different datastructures (hash tables,
  * chain tables, binary trees) that all respect a common format: they are
  * uint32_t arrays, all of whose values are between 0 and (nextSrc - base).
- * Their sizes depend on the cparams.
+ * Their sizes depend on the cparams. These tables are 64-byte aligned.
  *
  * - Aligned: these buffers are used for various purposes that require 4 byte
- * alignment, but don't require any initialization before they're used.
+ * alignment, but don't require any initialization before they're used. These
+ * buffers are each aligned to 64 bytes.
  *
  * - Buffers: these buffers are used for various purposes that don't require
  * any alignment or initialization before they're used. This means they can
@@ -133,8 +138,7 @@ typedef enum {
  *
  * 1. Objects
  * 2. Buffers
- * 3. Aligned
- * 4. Tables
+ * 3. Aligned/Tables
  *
  * Attempts to reserve objects of different types out of order will fail.
  */
@@ -187,6 +191,8 @@ MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) {
  * Since tables aren't currently redzoned, you don't need to call through this
  * to figure out how much space you need for the matchState tables. Everything
  * else is though.
+ *
+ * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_alloc_size().
  */
  MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) {
  if (size == 0)
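Note on the hunks that follow: the reworked workspace rounds "aligned" and table requests up to a 64-byte unit. A minimal standalone sketch of that round-up computation, not part of the gem diff (the names align_up and CWKSP_ALIGNMENT_BYTES are illustrative):

    #include <assert.h>
    #include <stddef.h>

    #define CWKSP_ALIGNMENT_BYTES 64  /* mirrors ZSTD_CWKSP_ALIGNMENT_BYTES in the hunks below */

    /* Round size up to the nearest multiple of align; align must be a power of two. */
    static size_t align_up(size_t size, size_t align)
    {
        size_t const mask = align - 1;
        assert((align & mask) == 0);
        return (size + mask) & ~mask;
    }

    int main(void)
    {
        assert(align_up(1,  CWKSP_ALIGNMENT_BYTES) == 64);
        assert(align_up(64, CWKSP_ALIGNMENT_BYTES) == 64);
        assert(align_up(65, CWKSP_ALIGNMENT_BYTES) == 128);
        return 0;
    }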
@@ -198,30 +204,110 @@ MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) {
  #endif
  }

- MEM_STATIC void ZSTD_cwksp_internal_advance_phase(
+ /**
+ * Returns an adjusted alloc size that is the nearest larger multiple of 64 bytes.
+ * Used to determine the number of bytes required for a given "aligned".
+ */
+ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) {
+ return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNMENT_BYTES));
+ }
+
+ /**
+ * Returns the amount of additional space the cwksp must allocate
+ * for internal purposes (currently only alignment).
+ */
+ MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) {
+ /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes
+ * to align the beginning of tables section, as well as another n_2=[0, 63] bytes
+ * to align the beginning of the aligned secion.
+ *
+ * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and
+ * aligneds being sized in multiples of 64 bytes.
+ */
+ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES;
+ return slackSpace;
+ }
+
+
+ /**
+ * Return the number of additional bytes required to align a pointer to the given number of bytes.
+ * alignBytes must be a power of two.
+ */
+ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignBytes) {
+ size_t const alignBytesMask = alignBytes - 1;
+ size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask;
+ assert((alignBytes & alignBytesMask) == 0);
+ assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES);
+ return bytes;
+ }
+
+ /**
+ * Internal function. Do not use directly.
+ * Reserves the given number of bytes within the aligned/buffer segment of the wksp, which
+ * counts from the end of the wksp. (as opposed to the object/table segment)
+ *
+ * Returns a pointer to the beginning of that space.
+ */
+ MEM_STATIC void* ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes) {
+ void* const alloc = (BYTE*)ws->allocStart - bytes;
+ void* const bottom = ws->tableEnd;
+ DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining",
+ alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
+ ZSTD_cwksp_assert_internal_consistency(ws);
+ assert(alloc >= bottom);
+ if (alloc < bottom) {
+ DEBUGLOG(4, "cwksp: alloc failed!");
+ ws->allocFailed = 1;
+ return NULL;
+ }
+ if (alloc < ws->tableValidEnd) {
+ ws->tableValidEnd = alloc;
+ }
+ ws->allocStart = alloc;
+ return alloc;
+ }
+
+ /**
+ * Moves the cwksp to the next phase, and does any necessary allocations.
+ * Returns a 0 on success, or zstd error
+ */
+ MEM_STATIC size_t ZSTD_cwksp_internal_advance_phase(
  ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) {
  assert(phase >= ws->phase);
  if (phase > ws->phase) {
+ /* Going from allocating objects to allocating buffers */
  if (ws->phase < ZSTD_cwksp_alloc_buffers &&
  phase >= ZSTD_cwksp_alloc_buffers) {
  ws->tableValidEnd = ws->objectEnd;
  }
+
+ /* Going from allocating buffers to allocating aligneds/tables */
  if (ws->phase < ZSTD_cwksp_alloc_aligned &&
  phase >= ZSTD_cwksp_alloc_aligned) {
- /* If unaligned allocations down from a too-large top have left us
- * unaligned, we need to realign our alloc ptr. Technically, this
- * can consume space that is unaccounted for in the neededSpace
- * calculation. However, I believe this can only happen when the
- * workspace is too large, and specifically when it is too large
- * by a larger margin than the space that will be consumed. */
- /* TODO: cleaner, compiler warning friendly way to do this??? */
- ws->allocStart = (BYTE*)ws->allocStart - ((size_t)ws->allocStart & (sizeof(U32)-1));
- if (ws->allocStart < ws->tableValidEnd) {
- ws->tableValidEnd = ws->allocStart;
+ { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */
+ size_t const bytesToAlign =
+ ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES);
+ DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign);
+ ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */
+ RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign),
+ memory_allocation, "aligned phase - alignment initial allocation failed!");
+ }
+ { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */
+ void* const alloc = ws->objectEnd;
+ size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES);
+ void* const end = (BYTE*)alloc + bytesToAlign;
+ DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign);
+ RETURN_ERROR_IF(end > ws->workspaceEnd, memory_allocation,
+ "table phase - alignment initial allocation failed!");
+ ws->objectEnd = end;
+ ws->tableEnd = end;
+ ws->tableValidEnd = end;
  }
  }
  ws->phase = phase;
+ ZSTD_cwksp_assert_internal_consistency(ws);
  }
+ return 0;
  }

  /**
@@ -237,38 +323,25 @@ MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) {
  MEM_STATIC void* ZSTD_cwksp_reserve_internal(
  ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) {
  void* alloc;
- void* bottom = ws->tableEnd;
- ZSTD_cwksp_internal_advance_phase(ws, phase);
- alloc = (BYTE *)ws->allocStart - bytes;
-
- if (bytes == 0)
+ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase)) || bytes == 0) {
  return NULL;
+ }

  #if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
  /* over-reserve space */
- alloc = (BYTE *)alloc - 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+ bytes += 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
  #endif

- DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining",
- alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
- ZSTD_cwksp_assert_internal_consistency(ws);
- assert(alloc >= bottom);
- if (alloc < bottom) {
- DEBUGLOG(4, "cwksp: alloc failed!");
- ws->allocFailed = 1;
- return NULL;
- }
- if (alloc < ws->tableValidEnd) {
- ws->tableValidEnd = alloc;
- }
- ws->allocStart = alloc;
+ alloc = ZSTD_cwksp_reserve_internal_buffer_space(ws, bytes);

  #if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
  /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on
  * either size. */
- alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
- if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
- __asan_unpoison_memory_region(alloc, bytes);
+ if (alloc) {
+ alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+ if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
+ __asan_unpoison_memory_region(alloc, bytes);
+ }
  }
  #endif

@@ -283,28 +356,36 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) {
  }

  /**
- * Reserves and returns memory sized on and aligned on sizeof(unsigned).
+ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
  */
  MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) {
- assert((bytes & (sizeof(U32)-1)) == 0);
- return ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, sizeof(U32)), ZSTD_cwksp_alloc_aligned);
+ void* ptr = ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES),
+ ZSTD_cwksp_alloc_aligned);
+ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
+ return ptr;
  }

  /**
- * Aligned on sizeof(unsigned). These buffers have the special property that
+ * Aligned on 64 bytes. These buffers have the special property that
  * their values remain constrained, allowing us to re-use them without
  * memset()-ing them.
  */
  MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) {
  const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned;
- void* alloc = ws->tableEnd;
- void* end = (BYTE *)alloc + bytes;
- void* top = ws->allocStart;
+ void* alloc;
+ void* end;
+ void* top;
+
+ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) {
+ return NULL;
+ }
+ alloc = ws->tableEnd;
+ end = (BYTE *)alloc + bytes;
+ top = ws->allocStart;

  DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining",
  alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
  assert((bytes & (sizeof(U32)-1)) == 0);
- ZSTD_cwksp_internal_advance_phase(ws, phase);
  ZSTD_cwksp_assert_internal_consistency(ws);
  assert(end <= top);
  if (end > top) {
@@ -320,6 +401,8 @@ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) {
  }
  #endif

+ assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0);
+ assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
  return alloc;
  }

@@ -527,6 +610,24 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
  * Functions Checking Free Space
  ***************************************/

+ /* ZSTD_alignmentSpaceWithinBounds() :
+ * Returns if the estimated space needed for a wksp is within an acceptable limit of the
+ * actual amount of space used.
+ */
+ MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws,
+ size_t const estimatedSpace, int resizedWorkspace) {
+ if (resizedWorkspace) {
+ /* Resized/newly allocated wksp should have exact bounds */
+ return ZSTD_cwksp_used(ws) == estimatedSpace;
+ } else {
+ /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes
+ * than estimatedSpace. See the comments in zstd_cwksp.h for details.
+ */
+ return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63);
+ }
+ }
+
+
  MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) {
  return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd);
  }
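The new ZSTD_cwksp_bytes_to_align_ptr() above drives both alignment steps in the advance-phase hunk. A standalone sketch of that pointer-alignment formula, not taken from the gem (bytes_to_align is an illustrative name):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Bytes needed to advance p to the next align-byte boundary; align is a power of two.
     * Returns 0 when p is already aligned. */
    static size_t bytes_to_align(const void* p, size_t align)
    {
        size_t const mask = align - 1;
        return (align - ((uintptr_t)p & mask)) & mask;
    }

    int main(void)
    {
        char buf[256];
        char* p = buf;
        p += bytes_to_align(p, 64);            /* now 64-byte aligned */
        assert(((uintptr_t)p & 63) == 0);
        assert(bytes_to_align(p, 64) == 0);
        assert(bytes_to_align(p + 1, 64) == 63);
        return 0;
    }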
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2016-2021, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -409,7 +409,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
  hashSmall[hSmall] = hashLong[hLong] = curr; /* update hash table */

  if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */
- & (repIndex > dictStartIndex))
+ & (offset_1 < curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */
  && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
  const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
  mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
@@ -477,7 +477,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
  U32 const repIndex2 = current2 - offset_2;
  const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
  if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */
- & (repIndex2 > dictStartIndex))
+ & (offset_2 < current2 - dictStartIndex))
  && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
  const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
  size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2016-2021, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2016-2021, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -416,9 +416,9 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
  const BYTE* const repMatch = repBase + repIndex;
  hashTable[h] = curr; /* update hash table */
  DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr);
- assert(offset_1 <= curr +1); /* check repIndex */

- if ( (((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > dictStartIndex))
+ if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */
+ & (offset_1 < curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */
  && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
  const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
  size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4;
@@ -453,7 +453,7 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
  U32 const current2 = (U32)(ip-base);
  U32 const repIndex2 = current2 - offset_2;
  const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (repIndex2 > dictStartIndex)) /* intentional overflow */
+ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 < curr - dictStartIndex)) /* intentional overflow */
  && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
  const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
  size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
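The fast and double-fast extDict hunks above replace index-based repcode guards ("repIndex > dictStartIndex") with offset-based ones ("offset_1 < curr+1 - dictStartIndex"). An illustrative standalone sketch, not gem code, of why the two are not equivalent under U32 wrap-around:

    #include <assert.h>
    #include <stdint.h>
    typedef uint32_t U32;

    int main(void)
    {
        U32 const dictStartIndex = 16;
        U32 const curr = 100;
        U32 const offset_1 = 150;                 /* invalid offset: larger than curr+1 */
        U32 const repIndex = curr + 1 - offset_1; /* wraps to a huge value */

        assert(repIndex > dictStartIndex);                 /* old form: passes spuriously */
        assert(!(offset_1 < curr + 1 - dictStartIndex));   /* new form: correctly rejects */
        return 0;
    }

When the offset is valid (offset_1 <= curr+1), both conditions agree; the rewritten form simply stays false when the offset is out of range.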
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2016-2021, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2016-2021, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -438,43 +438,9 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
  }
  }

-
-
- /* *********************************
- * Hash Chain
+ /***********************************
+ * Dedicated dict search
  ***********************************/
- #define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)]
-
- /* Update chains up to ip (excluded)
- Assumption : always within prefix (i.e. not within extDict) */
- FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
- ZSTD_matchState_t* ms,
- const ZSTD_compressionParameters* const cParams,
- const BYTE* ip, U32 const mls)
- {
- U32* const hashTable = ms->hashTable;
- const U32 hashLog = cParams->hashLog;
- U32* const chainTable = ms->chainTable;
- const U32 chainMask = (1 << cParams->chainLog) - 1;
- const BYTE* const base = ms->window.base;
- const U32 target = (U32)(ip - base);
- U32 idx = ms->nextToUpdate;
-
- while(idx < target) { /* catch up */
- size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
- NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
- hashTable[h] = idx;
- idx++;
- }
-
- ms->nextToUpdate = target;
- return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
- }
-
- U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
- const ZSTD_compressionParameters* const cParams = &ms->cParams;
- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
- }

  void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
  {
@@ -500,11 +466,10 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
  U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
  U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
  U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
-
  U32 hashIdx;

  assert(ms->cParams.chainLog <= 24);
- assert(ms->cParams.hashLog >= ms->cParams.chainLog);
+ assert(ms->cParams.hashLog > ms->cParams.chainLog);
  assert(idx != 0);
  assert(tmpMinChain <= minChain);

@@ -535,7 +500,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
  if (count == cacheSize) {
  for (count = 0; count < chainLimit;) {
  if (i < minChain) {
- if (!i || countBeyondMinChain++ > cacheSize) {
+ if (!i || ++countBeyondMinChain > cacheSize) {
  /* only allow pulling `cacheSize` number of entries
  * into the cache or chainTable beyond `minChain`,
  * to replace the entries pulled out of the
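The hunk above swaps a post-increment for a pre-increment in the guard, so the entry being counted participates in the comparison. A tiny standalone sketch (not gem code) of the off-by-one difference:

    #include <assert.h>

    int main(void)
    {
        int const cacheSize = 3;
        int a = 0, b = 0, postHits = 0, preHits = 0;
        int i;
        for (i = 0; i < 4; ++i) {
            if (a++ > cacheSize) postHits++;   /* post-increment: compares the old value */
            if (++b > cacheSize) preHits++;    /* pre-increment: compares the updated value */
        }
        assert(postHits == 0);   /* old form never trips within 4 iterations */
        assert(preHits == 1);    /* new form trips on the 4th iteration */
        return 0;
    }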
@@ -591,6 +556,139 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
591
556
  ms->nextToUpdate = target;
592
557
  }
593
558
 
559
+ /* Returns the longest match length found in the dedicated dict search structure.
560
+ * If none are longer than the argument ml, then ml will be returned.
561
+ */
562
+ FORCE_INLINE_TEMPLATE
563
+ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
564
+ const ZSTD_matchState_t* const dms,
565
+ const BYTE* const ip, const BYTE* const iLimit,
566
+ const BYTE* const prefixStart, const U32 curr,
567
+ const U32 dictLimit, const size_t ddsIdx) {
568
+ const U32 ddsLowestIndex = dms->window.dictLimit;
569
+ const BYTE* const ddsBase = dms->window.base;
570
+ const BYTE* const ddsEnd = dms->window.nextSrc;
571
+ const U32 ddsSize = (U32)(ddsEnd - ddsBase);
572
+ const U32 ddsIndexDelta = dictLimit - ddsSize;
573
+ const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
574
+ const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
575
+ U32 ddsAttempt;
576
+ U32 matchIndex;
577
+
578
+ for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
579
+ PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
580
+ }
581
+
582
+ {
583
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
584
+ U32 const chainIndex = chainPackedPointer >> 8;
585
+
586
+ PREFETCH_L1(&dms->chainTable[chainIndex]);
587
+ }
588
+
589
+ for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
590
+ size_t currentMl=0;
591
+ const BYTE* match;
592
+ matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
593
+ match = ddsBase + matchIndex;
594
+
595
+ if (!matchIndex) {
596
+ return ml;
597
+ }
598
+
599
+ /* guaranteed by table construction */
600
+ (void)ddsLowestIndex;
601
+ assert(matchIndex >= ddsLowestIndex);
602
+ assert(match+4 <= ddsEnd);
603
+ if (MEM_read32(match) == MEM_read32(ip)) {
604
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
605
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
606
+ }
607
+
608
+ /* save best solution */
609
+ if (currentMl > ml) {
610
+ ml = currentMl;
611
+ *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
612
+ if (ip+currentMl == iLimit) {
613
+ /* best possible, avoids read overflow on next attempt */
614
+ return ml;
615
+ }
616
+ }
617
+ }
618
+
619
+ {
620
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
621
+ U32 chainIndex = chainPackedPointer >> 8;
622
+ U32 const chainLength = chainPackedPointer & 0xFF;
623
+ U32 const chainAttempts = nbAttempts - ddsAttempt;
624
+ U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
625
+ U32 chainAttempt;
626
+
627
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
628
+ PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
629
+ }
630
+
631
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
632
+ size_t currentMl=0;
633
+ const BYTE* match;
634
+ matchIndex = dms->chainTable[chainIndex];
635
+ match = ddsBase + matchIndex;
636
+
637
+ /* guaranteed by table construction */
638
+ assert(matchIndex >= ddsLowestIndex);
639
+ assert(match+4 <= ddsEnd);
640
+ if (MEM_read32(match) == MEM_read32(ip)) {
641
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
642
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
643
+ }
644
+
645
+ /* save best solution */
646
+ if (currentMl > ml) {
647
+ ml = currentMl;
648
+ *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
649
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
650
+ }
651
+ }
652
+ }
653
+ return ml;
654
+ }
655
+
656
+
657
+ /* *********************************
658
+ * Hash Chain
659
+ ***********************************/
660
+ #define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)]
661
+
662
+ /* Update chains up to ip (excluded)
663
+ Assumption : always within prefix (i.e. not within extDict) */
664
+ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
665
+ ZSTD_matchState_t* ms,
666
+ const ZSTD_compressionParameters* const cParams,
667
+ const BYTE* ip, U32 const mls)
668
+ {
669
+ U32* const hashTable = ms->hashTable;
670
+ const U32 hashLog = cParams->hashLog;
671
+ U32* const chainTable = ms->chainTable;
672
+ const U32 chainMask = (1 << cParams->chainLog) - 1;
673
+ const BYTE* const base = ms->window.base;
674
+ const U32 target = (U32)(ip - base);
675
+ U32 idx = ms->nextToUpdate;
676
+
677
+ while(idx < target) { /* catch up */
678
+ size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
679
+ NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
680
+ hashTable[h] = idx;
681
+ idx++;
682
+ }
683
+
684
+ ms->nextToUpdate = target;
685
+ return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
686
+ }
687
+
688
+ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
689
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
690
+ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
691
+ }
594
692
 
595
693
  /* inlining is important to hardwire a hot branch (template emulation) */
596
694
  FORCE_INLINE_TEMPLATE
@@ -661,90 +759,8 @@ size_t ZSTD_HcFindBestMatch_generic (
661
759
  }
662
760
 
663
761
  if (dictMode == ZSTD_dedicatedDictSearch) {
664
- const U32 ddsLowestIndex = dms->window.dictLimit;
665
- const BYTE* const ddsBase = dms->window.base;
666
- const BYTE* const ddsEnd = dms->window.nextSrc;
667
- const U32 ddsSize = (U32)(ddsEnd - ddsBase);
668
- const U32 ddsIndexDelta = dictLimit - ddsSize;
669
- const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
670
- const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
671
- U32 ddsAttempt;
672
-
673
- for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
674
- PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
675
- }
676
-
677
- {
678
- U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
679
- U32 const chainIndex = chainPackedPointer >> 8;
680
-
681
- PREFETCH_L1(&dms->chainTable[chainIndex]);
682
- }
683
-
684
- for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
685
- size_t currentMl=0;
686
- const BYTE* match;
687
- matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
688
- match = ddsBase + matchIndex;
689
-
690
- if (!matchIndex) {
691
- return ml;
692
- }
693
-
694
- /* guaranteed by table construction */
695
- (void)ddsLowestIndex;
696
- assert(matchIndex >= ddsLowestIndex);
697
- assert(match+4 <= ddsEnd);
698
- if (MEM_read32(match) == MEM_read32(ip)) {
699
- /* assumption : matchIndex <= dictLimit-4 (by table construction) */
700
- currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
701
- }
702
-
703
- /* save best solution */
704
- if (currentMl > ml) {
705
- ml = currentMl;
706
- *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
707
- if (ip+currentMl == iLimit) {
708
- /* best possible, avoids read overflow on next attempt */
709
- return ml;
710
- }
711
- }
712
- }
713
-
714
- {
715
- U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
716
- U32 chainIndex = chainPackedPointer >> 8;
717
- U32 const chainLength = chainPackedPointer & 0xFF;
718
- U32 const chainAttempts = nbAttempts - ddsAttempt;
719
- U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
720
- U32 chainAttempt;
721
-
722
- for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
723
- PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
724
- }
725
-
726
- for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
727
- size_t currentMl=0;
728
- const BYTE* match;
729
- matchIndex = dms->chainTable[chainIndex];
730
- match = ddsBase + matchIndex;
731
-
732
- /* guaranteed by table construction */
733
- assert(matchIndex >= ddsLowestIndex);
734
- assert(match+4 <= ddsEnd);
735
- if (MEM_read32(match) == MEM_read32(ip)) {
736
- /* assumption : matchIndex <= dictLimit-4 (by table construction) */
737
- currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
738
- }
739
-
740
- /* save best solution */
741
- if (currentMl > ml) {
742
- ml = currentMl;
743
- *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
744
- if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
745
- }
746
- }
747
- }
762
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
763
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
748
764
  } else if (dictMode == ZSTD_dictMatchState) {
749
765
  const U32* const dmsChainTable = dms->chainTable;
750
766
  const U32 dmsChainSize = (1 << dms->cParams.chainLog);
@@ -845,11 +861,657 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
845
861
  }
846
862
  }
847
863
 
864
+ /* *********************************
865
+ * (SIMD) Row-based matchfinder
866
+ ***********************************/
867
+ /* Constants for row-based hash */
868
+ #define ZSTD_ROW_HASH_TAG_OFFSET 1 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
869
+ #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
870
+ #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
871
+
872
+ #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
873
+
874
+ typedef U32 ZSTD_VecMask; /* Clarifies when we are interacting with a U32 representing a mask of matches */
875
+
876
+ #if !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) /* SIMD SSE version */
877
+
878
+ #include <emmintrin.h>
879
+ typedef __m128i ZSTD_Vec128;
880
+
881
+ /* Returns a 128-bit container with 128-bits from src */
882
+ static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
883
+ return _mm_loadu_si128((ZSTD_Vec128 const*)src);
884
+ }
885
+
886
+ /* Returns a ZSTD_Vec128 with the byte "val" packed 16 times */
887
+ static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
888
+ return _mm_set1_epi8((char)val);
889
+ }
890
+
891
+ /* Do byte-by-byte comparison result of x and y. Then collapse 128-bit resultant mask
892
+ * into a 32-bit mask that is the MSB of each byte.
893
+ * */
894
+ static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
895
+ return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
896
+ }
897
+
898
+ typedef struct {
899
+ __m128i fst;
900
+ __m128i snd;
901
+ } ZSTD_Vec256;
902
+
903
+ static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
904
+ ZSTD_Vec256 v;
905
+ v.fst = ZSTD_Vec128_read(ptr);
906
+ v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
907
+ return v;
908
+ }
909
+
910
+ static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
911
+ ZSTD_Vec256 v;
912
+ v.fst = ZSTD_Vec128_set8(val);
913
+ v.snd = ZSTD_Vec128_set8(val);
914
+ return v;
915
+ }
916
+
917
+ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
918
+ ZSTD_VecMask fstMask;
919
+ ZSTD_VecMask sndMask;
920
+ fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
921
+ sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
922
+ return fstMask | (sndMask << 16);
923
+ }
924
+
925
+ #elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */
926
+
927
+ #include <arm_neon.h>
928
+ typedef uint8x16_t ZSTD_Vec128;
929
+
930
+ static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
931
+ return vld1q_u8((const BYTE* const)src);
932
+ }
933
+
934
+ static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
935
+ return vdupq_n_u8(val);
936
+ }
937
+
938
+ /* Mimics '_mm_movemask_epi8()' from SSE */
939
+ static U32 ZSTD_vmovmaskq_u8(ZSTD_Vec128 val) {
940
+ /* Shift out everything but the MSB bits in each byte */
941
+ uint16x8_t highBits = vreinterpretq_u16_u8(vshrq_n_u8(val, 7));
942
+ /* Merge the even lanes together with vsra (right shift and add) */
943
+ uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(highBits, highBits, 7));
944
+ uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
945
+ uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
946
+ /* Extract the low 8 bits from each lane, merge */
947
+ return vgetq_lane_u8(paired64, 0) | ((U32)vgetq_lane_u8(paired64, 8) << 8);
948
+ }
949
+
950
+ static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
951
+ return (ZSTD_VecMask)ZSTD_vmovmaskq_u8(vceqq_u8(x, y));
952
+ }
953
+
954
+ typedef struct {
955
+ uint8x16_t fst;
956
+ uint8x16_t snd;
957
+ } ZSTD_Vec256;
958
+
959
+ static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
960
+ ZSTD_Vec256 v;
961
+ v.fst = ZSTD_Vec128_read(ptr);
962
+ v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
963
+ return v;
964
+ }
965
+
966
+ static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
967
+ ZSTD_Vec256 v;
968
+ v.fst = ZSTD_Vec128_set8(val);
969
+ v.snd = ZSTD_Vec128_set8(val);
970
+ return v;
971
+ }
972
+
973
+ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
974
+ ZSTD_VecMask fstMask;
975
+ ZSTD_VecMask sndMask;
976
+ fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
977
+ sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
978
+ return fstMask | (sndMask << 16);
979
+ }
980
+
981
+ #else /* Scalar fallback version */
982
+
983
+ #define VEC128_NB_SIZE_T (16 / sizeof(size_t))
984
+ typedef struct {
985
+ size_t vec[VEC128_NB_SIZE_T];
986
+ } ZSTD_Vec128;
987
+
988
+ static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
989
+ ZSTD_Vec128 ret;
990
+ ZSTD_memcpy(ret.vec, src, VEC128_NB_SIZE_T*sizeof(size_t));
991
+ return ret;
992
+ }
993
+
994
+ static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
995
+ ZSTD_Vec128 ret = { {0} };
996
+ int startBit = sizeof(size_t) * 8 - 8;
997
+ for (;startBit >= 0; startBit -= 8) {
998
+ unsigned j = 0;
999
+ for (;j < VEC128_NB_SIZE_T; ++j) {
1000
+ ret.vec[j] |= ((size_t)val << startBit);
1001
+ }
1002
+ }
1003
+ return ret;
1004
+ }
1005
+
1006
+ /* Compare x to y, byte by byte, generating a "matches" bitfield */
1007
+ static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
1008
+ ZSTD_VecMask res = 0;
1009
+ unsigned i = 0;
1010
+ unsigned l = 0;
1011
+ for (; i < VEC128_NB_SIZE_T; ++i) {
1012
+ const size_t cmp1 = x.vec[i];
1013
+ const size_t cmp2 = y.vec[i];
1014
+ unsigned j = 0;
1015
+ for (; j < sizeof(size_t); ++j, ++l) {
1016
+ if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
1017
+ res |= ((U32)1 << (j+i*sizeof(size_t)));
1018
+ }
1019
+ }
1020
+ }
1021
+ return res;
1022
+ }
1023
+
1024
+ #define VEC256_NB_SIZE_T 2*VEC128_NB_SIZE_T
1025
+ typedef struct {
1026
+ size_t vec[VEC256_NB_SIZE_T];
1027
+ } ZSTD_Vec256;
1028
+
1029
+ static ZSTD_Vec256 ZSTD_Vec256_read(const void* const src) {
1030
+ ZSTD_Vec256 ret;
1031
+ ZSTD_memcpy(ret.vec, src, VEC256_NB_SIZE_T*sizeof(size_t));
1032
+ return ret;
1033
+ }
1034
+
1035
+ static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
1036
+ ZSTD_Vec256 ret = { {0} };
1037
+ int startBit = sizeof(size_t) * 8 - 8;
1038
+ for (;startBit >= 0; startBit -= 8) {
1039
+ unsigned j = 0;
1040
+ for (;j < VEC256_NB_SIZE_T; ++j) {
1041
+ ret.vec[j] |= ((size_t)val << startBit);
1042
+ }
1043
+ }
1044
+ return ret;
1045
+ }
1046
+
1047
+ /* Compare x to y, byte by byte, generating a "matches" bitfield */
1048
+ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
1049
+ ZSTD_VecMask res = 0;
1050
+ unsigned i = 0;
1051
+ unsigned l = 0;
1052
+ for (; i < VEC256_NB_SIZE_T; ++i) {
1053
+ const size_t cmp1 = x.vec[i];
1054
+ const size_t cmp2 = y.vec[i];
1055
+ unsigned j = 0;
1056
+ for (; j < sizeof(size_t); ++j, ++l) {
1057
+ if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
1058
+ res |= ((U32)1 << (j+i*sizeof(size_t)));
1059
+ }
1060
+ }
1061
+ }
1062
+ return res;
1063
+ }
1064
+
1065
+ #endif /* !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) */
1066
+
1067
+ /* ZSTD_VecMask_next():
1068
+ * Starting from the LSB, returns the idx of the next non-zero bit.
1069
+ * Basically counting the nb of trailing zeroes.
1070
+ */
1071
+ static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
1072
+ # if defined(_MSC_VER) /* Visual */
1073
+ unsigned long r=0;
1074
+ return _BitScanForward(&r, val) ? (U32)r : 0;
1075
+ # elif defined(__GNUC__) && (__GNUC__ >= 3)
1076
+ return (U32)__builtin_ctz(val);
1077
+ # else
1078
+ /* Software ctz version: http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup */
1079
+ static const U32 multiplyDeBruijnBitPosition[32] =
1080
+ {
1081
+ 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
1082
+ 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
1083
+ };
1084
+ return multiplyDeBruijnBitPosition[((U32)((v & -(int)v) * 0x077CB531U)) >> 27];
1085
+ # endif
1086
+ }
1087
+
1088
+ /* ZSTD_VecMask_rotateRight():
1089
+ * Rotates a bitfield to the right by "rotation" bits.
1090
+ * If the rotation is greater than totalBits, the returned mask is 0.
1091
+ */
1092
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
1093
+ ZSTD_VecMask_rotateRight(ZSTD_VecMask mask, U32 const rotation, U32 const totalBits) {
1094
+ if (rotation == 0)
1095
+ return mask;
1096
+ switch (totalBits) {
1097
+ default:
1098
+ assert(0);
1099
+ case 16:
1100
+ return (mask >> rotation) | (U16)(mask << (16 - rotation));
1101
+ case 32:
1102
+ return (mask >> rotation) | (U32)(mask << (32 - rotation));
1103
+ }
1104
+ }
1105
+
1106
+ /* ZSTD_row_nextIndex():
1107
+ * Returns the next index to insert at within a tagTable row, and updates the "head"
1108
+ * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
1109
+ */
1110
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
1111
+ U32 const next = (*tagRow - 1) & rowMask;
1112
+ *tagRow = (BYTE)next;
1113
+ return next;
1114
+ }
1115
+
1116
+ /* ZSTD_isAligned():
1117
+ * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
1118
+ */
1119
+ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
1120
+ assert((align & (align - 1)) == 0);
1121
+ return (((size_t)ptr) & (align - 1)) == 0;
1122
+ }
1123
+
1124
+ /* ZSTD_row_prefetch():
1125
+ * Performs prefetching for the hashTable and tagTable at a given row.
1126
+ */
1127
+ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
1128
+ PREFETCH_L1(hashTable + relRow);
1129
+ if (rowLog == 5) {
1130
+ PREFETCH_L1(hashTable + relRow + 16);
1131
+ }
1132
+ PREFETCH_L1(tagTable + relRow);
1133
+ assert(rowLog == 4 || rowLog == 5);
1134
+ assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */
1135
+ assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on a multiple of 32 or 64 bytes */
1136
+ }
1137
+
1138
+ /* ZSTD_row_fillHashCache():
1139
+ * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
1140
+ * but not beyond iLimit.
1141
+ */
1142
+ static void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
1143
+ U32 const rowLog, U32 const mls,
1144
+ U32 idx, const BYTE* const iLimit)
1145
+ {
1146
+ U32 const* const hashTable = ms->hashTable;
1147
+ U16 const* const tagTable = ms->tagTable;
1148
+ U32 const hashLog = ms->rowHashLog;
1149
+ U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
1150
+ U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
1151
+
1152
+ for (; idx < lim; ++idx) {
1153
+ U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1154
+ U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1155
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
1156
+ ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
1157
+ }
1158
+
1159
+ DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
1160
+ ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
1161
+ ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
1162
+ }
1163
+
1164
+ /* ZSTD_row_nextCachedHash():
1165
+ * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
1166
+ * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
1167
+ */
1168
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
1169
+ U16 const* tagTable, BYTE const* base,
1170
+ U32 idx, U32 const hashLog,
1171
+ U32 const rowLog, U32 const mls)
1172
+ {
1173
+ U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1174
+ U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1175
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
1176
+ { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
1177
+ cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
1178
+ return hash;
1179
+ }
1180
+ }
1181
+
1182
+ /* ZSTD_row_update_internal():
1183
+ * Inserts the byte at ip into the appropriate position in the hash table.
1184
+ * Determines the relative row, and the position within the {16, 32} entry row to insert at.
1185
+ */
1186
+ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
1187
+ U32 const mls, U32 const rowLog,
1188
+ U32 const rowMask, U32 const useCache)
1189
+ {
1190
+ U32* const hashTable = ms->hashTable;
1191
+ U16* const tagTable = ms->tagTable;
1192
+ U32 const hashLog = ms->rowHashLog;
1193
+ const BYTE* const base = ms->window.base;
1194
+ const U32 target = (U32)(ip - base);
1195
+ U32 idx = ms->nextToUpdate;
1196
+
1197
+ DEBUGLOG(6, "ZSTD_row_update_internal(): nextToUpdate=%u, current=%u", idx, target);
1198
+ for (; idx < target; ++idx) {
1199
+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, idx, hashLog, rowLog, mls)
1200
+ : (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1201
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1202
+ U32* const row = hashTable + relRow;
1203
+ BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
1204
+ Explicit cast allows us to get exact desired position within each row */
1205
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
1206
+
1207
+ assert(hash == ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
1208
+ ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
1209
+ row[pos] = idx;
1210
+ }
1211
+ ms->nextToUpdate = target;
1212
+ }
1213
+
1214
+ /* ZSTD_row_update():
1215
+ * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
1216
+ * processing.
1217
+ */
1218
+ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
1219
+ const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
1220
+ const U32 rowMask = (1u << rowLog) - 1;
1221
+ const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
1222
+
1223
+ DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
1224
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
1225
+ }
1226
+
1227
+ /* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
1228
+ * the hash at the nth position in a row of the tagTable.
1229
+ */
1230
+ FORCE_INLINE_TEMPLATE
1231
+ ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) {
1232
+ ZSTD_VecMask matches = 0;
1233
+ if (rowEntries == 16) {
1234
+ ZSTD_Vec128 hashes = ZSTD_Vec128_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
1235
+ ZSTD_Vec128 expandedTags = ZSTD_Vec128_set8(tag);
1236
+ matches = ZSTD_Vec128_cmpMask8(hashes, expandedTags);
1237
+ } else if (rowEntries == 32) {
1238
+ ZSTD_Vec256 hashes = ZSTD_Vec256_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
1239
+ ZSTD_Vec256 expandedTags = ZSTD_Vec256_set8(tag);
1240
+ matches = ZSTD_Vec256_cmpMask8(hashes, expandedTags);
1241
+ } else {
1242
+ assert(0);
1243
+ }
1244
+ /* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
1245
+ to match up with the actual layout of the entries within the hashTable */
1246
+ return ZSTD_VecMask_rotateRight(matches, head, rowEntries);
1247
+ }
1248
+
1249
+ /* The high-level approach of the SIMD row based match finder is as follows:
1250
+ * - Figure out where to insert the new entry:
1251
+ * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
1252
+ * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
1253
+ * which row to insert into.
1254
+ * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
1255
+ * be considered as a circular buffer with a "head" index that resides in the tagTable.
1256
+ * - Also insert the "tag" into the equivalent row and position in the tagTable.
1257
+ * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
1258
+ * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
1259
+ * for alignment/performance reasons, leaving some bytes unused.
1260
+ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
1261
+ * generate a bitfield that we can cycle through to check the collisions in the hash table.
1262
+ * - Pick the longest match.
1263
+ */
1264
+ FORCE_INLINE_TEMPLATE
1265
+ size_t ZSTD_RowFindBestMatch_generic (
1266
+ ZSTD_matchState_t* ms,
1267
+ const BYTE* const ip, const BYTE* const iLimit,
1268
+ size_t* offsetPtr,
1269
+ const U32 mls, const ZSTD_dictMode_e dictMode,
1270
+ const U32 rowLog)
1271
+ {
1272
+ U32* const hashTable = ms->hashTable;
1273
+ U16* const tagTable = ms->tagTable;
1274
+ U32* const hashCache = ms->hashCache;
1275
+ const U32 hashLog = ms->rowHashLog;
1276
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
1277
+ const BYTE* const base = ms->window.base;
1278
+ const BYTE* const dictBase = ms->window.dictBase;
1279
+ const U32 dictLimit = ms->window.dictLimit;
1280
+ const BYTE* const prefixStart = base + dictLimit;
1281
+ const BYTE* const dictEnd = dictBase + dictLimit;
1282
+ const U32 curr = (U32)(ip-base);
1283
+ const U32 maxDistance = 1U << cParams->windowLog;
1284
+ const U32 lowestValid = ms->window.lowLimit;
1285
+ const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
1286
+ const U32 isDictionary = (ms->loadedDictEnd != 0);
1287
+ const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
1288
+ const U32 rowEntries = (1U << rowLog);
1289
+ const U32 rowMask = rowEntries - 1;
1290
+ const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
1291
+ U32 nbAttempts = 1U << cappedSearchLog;
1292
+ size_t ml=4-1;
1293
+
1294
+ /* DMS/DDS variables that may be referenced laster */
1295
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
1296
+ size_t ddsIdx;
1297
+ U32 ddsExtraAttempts; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
+ U32 dmsTag;
+ U32* dmsRow;
+ BYTE* dmsTagRow;
+
+ if (dictMode == ZSTD_dedicatedDictSearch) {
+ const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
+ { /* Prefetch DDS hashtable entry */
+ ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
+ PREFETCH_L1(&dms->hashTable[ddsIdx]);
+ }
+ ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
+ }
+
+ if (dictMode == ZSTD_dictMatchState) {
+ /* Prefetch DMS rows */
+ U32* const dmsHashTable = dms->hashTable;
+ U16* const dmsTagTable = dms->tagTable;
+ U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
+ dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
+ dmsRow = dmsHashTable + dmsRelRow;
+ ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
+ }
+
+ /* Update the hashTable and tagTable up to (but not including) ip */
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+ { /* Get the hash for ip, compute the appropriate row */
+ U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
+ U32* const row = hashTable + relRow;
+ BYTE* tagRow = (BYTE*)(tagTable + relRow);
+ U32 const head = *tagRow & rowMask;
+ U32 matchBuffer[32 /* maximum nb entries per row */];
+ size_t numMatches = 0;
+ size_t currMatch = 0;
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
+
+ /* Cycle through the matches and prefetch */
+ for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
+ U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+ U32 const matchIndex = row[matchPos];
+ assert(numMatches < rowEntries);
+ if (matchIndex < lowLimit)
+ break;
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+ PREFETCH_L1(base + matchIndex);
+ } else {
+ PREFETCH_L1(dictBase + matchIndex);
+ }
+ matchBuffer[numMatches++] = matchIndex;
+ }
+
+ /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
+ in ZSTD_row_update_internal() at the next search. */
+ {
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
+ tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
+ row[pos] = ms->nextToUpdate++;
+ }
+
+ /* Return the longest match */
+ for (; currMatch < numMatches; ++currMatch) {
+ U32 const matchIndex = matchBuffer[currMatch];
+ size_t currentMl=0;
+ assert(matchIndex < curr);
+ assert(matchIndex >= lowLimit);
+
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+ const BYTE* const match = base + matchIndex;
+ assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
+ if (match[ml] == ip[ml]) /* potentially better */
+ currentMl = ZSTD_count(ip, match, iLimit);
+ } else {
+ const BYTE* const match = dictBase + matchIndex;
+ assert(match+4 <= dictEnd);
+ if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
+ }
+
+ /* Save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+ }
+ }
+ }
+
+ if (dictMode == ZSTD_dedicatedDictSearch) {
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
+ } else if (dictMode == ZSTD_dictMatchState) {
+ /* TODO: Measure and potentially add prefetching to DMS */
+ const U32 dmsLowestIndex = dms->window.dictLimit;
+ const BYTE* const dmsBase = dms->window.base;
+ const BYTE* const dmsEnd = dms->window.nextSrc;
+ const U32 dmsSize = (U32)(dmsEnd - dmsBase);
+ const U32 dmsIndexDelta = dictLimit - dmsSize;
+
+ { U32 const head = *dmsTagRow & rowMask;
+ U32 matchBuffer[32 /* maximum nb row entries */];
+ size_t numMatches = 0;
+ size_t currMatch = 0;
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
+
+ for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
+ U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+ U32 const matchIndex = dmsRow[matchPos];
+ if (matchIndex < dmsLowestIndex)
+ break;
+ PREFETCH_L1(dmsBase + matchIndex);
+ matchBuffer[numMatches++] = matchIndex;
+ }
+
+ /* Return the longest match */
+ for (; currMatch < numMatches; ++currMatch) {
+ U32 const matchIndex = matchBuffer[currMatch];
+ size_t currentMl=0;
+ assert(matchIndex >= dmsLowestIndex);
+ assert(matchIndex < curr);
+
+ { const BYTE* const match = dmsBase + matchIndex;
+ assert(match+4 <= dmsEnd);
+ if (MEM_read32(match) == MEM_read32(ip))
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
+ }
+
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
+ if (ip+currentMl == iLimit) break;
+ }
+ }
+ }
+ }
+ return ml;
+ }
+
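The search loops above walk a `ZSTD_VecMask` of tag matches by extracting the lowest set bit with `ZSTD_VecMask_next` and clearing it with `matches &= (matches - 1)`. A minimal standalone sketch of that bit-iteration idiom, where `lowest_set_bit` is a hypothetical stand-in for the real count-trailing-zeros helper:

```c
#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch, not the zstd implementation: each set bit in `matches`
 * marks a row slot whose tag byte matched the probe tag. */
static unsigned lowest_set_bit(uint64_t mask)
{
    unsigned n = 0;
    while ((mask & 1) == 0) { mask >>= 1; ++n; }  /* stand-in for a ctz intrinsic */
    return n;
}

int main(void)
{
    uint64_t matches = 0x29;            /* bits 0, 3, 5 set */
    unsigned const head = 2, rowMask = 31;
    for (; matches > 0; matches &= (matches - 1)) {   /* clear lowest set bit each pass */
        unsigned const matchPos = (head + lowest_set_bit(matches)) & rowMask;
        printf("visit row slot %u\n", matchPos);      /* prints slots 2, 5, 7 */
    }
    return 0;
}
```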
+ /* Inlining is important to hardwire a hot branch (template emulation) */
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectMLS (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ const ZSTD_dictMode_e dictMode, size_t* offsetPtr, const U32 rowLog)
+ {
+ switch(ms->cParams.minMatch)
+ {
+ default : /* includes case 3 */
+ case 4 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, dictMode, rowLog);
+ case 5 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, dictMode, rowLog);
+ case 7 :
+ case 6 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, dictMode, rowLog);
+ }
+ }
+
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectRowLog (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+ {
+ const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+ switch(cappedSearchLog)
+ {
+ default :
+ case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 4);
+ case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 5);
+ }
+ }
+
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dictMatchState_selectRowLog(
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+ {
+ const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+ switch(cappedSearchLog)
+ {
+ default :
+ case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 4);
+ case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 5);
+ }
+ }
+
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog(
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+ {
+ const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+ switch(cappedSearchLog)
+ {
+ default :
+ case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 4);
+ case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 5);
+ }
+ }
+
+ FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_extDict_selectRowLog (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+ {
+ const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+ switch(cappedSearchLog)
+ {
+ default :
+ case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 4);
+ case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 5);
+ }
+ }
+
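The `FORCE_INLINE_TEMPLATE` dispatchers above hardwire `minMatch`, `rowLog`, and `dictMode` at each call site, which is what the "template emulation" comment refers to. A minimal sketch of that idiom under hypothetical names (`process_generic`, `variant_e` stand in for the zstd functions and enums):

```c
#include <stdio.h>
#include <stddef.h>

typedef enum { variant_sum = 0, variant_sum2x = 1 } variant_e;

/* The generic routine takes the variant as a parameter; because every caller
 * passes a literal, the compiler can specialize each wrapper and drop the
 * untaken branch. */
static inline size_t process_generic(const unsigned char* p, size_t len, variant_e v)
{
    size_t acc = 0;
    for (size_t i = 0; i < len; ++i)
        acc += (v == variant_sum) ? p[i] : (size_t)p[i] * 2;  /* resolved per wrapper */
    return acc;
}

/* Thin wrappers play the role of the *_selectMLS / *_selectRowLog functions above. */
static size_t process_sum  (const unsigned char* p, size_t len) { return process_generic(p, len, variant_sum); }
static size_t process_sum2x(const unsigned char* p, size_t len) { return process_generic(p, len, variant_sum2x); }

int main(void)
{
    const unsigned char data[] = { 1, 2, 3, 4 };
    printf("%zu %zu\n", process_sum(data, 4), process_sum2x(data, 4));  /* prints: 10 20 */
    return 0;
}
```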

 /* *******************************
 * Common parser - lazy strategy
 *********************************/
- typedef enum { search_hashChain, search_binaryTree } searchMethod_e;
+ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;

 FORCE_INLINE_TEMPLATE size_t
 ZSTD_compressBlock_lazy_generic(
@@ -863,10 +1525,11 @@ ZSTD_compressBlock_lazy_generic(
 const BYTE* ip = istart;
 const BYTE* anchor = istart;
 const BYTE* const iend = istart + srcSize;
- const BYTE* const ilimit = iend - 8;
+ const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
 const BYTE* const base = ms->window.base;
 const U32 prefixLowestIndex = ms->window.dictLimit;
 const BYTE* const prefixLowest = base + prefixLowestIndex;
+ const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;

 typedef size_t (*searchMax_f)(
 ZSTD_matchState_t* ms,
@@ -878,26 +1541,30 @@ ZSTD_compressBlock_lazy_generic(
 * that should never occur (extDict modes go to the other implementation
 * below and there is no DDSS for binary tree search yet).
 */
- const searchMax_f searchFuncs[4][2] = {
+ const searchMax_f searchFuncs[4][3] = {
 {
 ZSTD_HcFindBestMatch_selectMLS,
- ZSTD_BtFindBestMatch_selectMLS
+ ZSTD_BtFindBestMatch_selectMLS,
+ ZSTD_RowFindBestMatch_selectRowLog
 },
 {
+ NULL,
 NULL,
 NULL
 },
 {
 ZSTD_HcFindBestMatch_dictMatchState_selectMLS,
- ZSTD_BtFindBestMatch_dictMatchState_selectMLS
+ ZSTD_BtFindBestMatch_dictMatchState_selectMLS,
+ ZSTD_RowFindBestMatch_dictMatchState_selectRowLog
 },
 {
 ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS,
- NULL
+ NULL,
+ ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog
 }
 };

- searchMax_f const searchMax = searchFuncs[dictMode][searchMethod == search_binaryTree];
+ searchMax_f const searchMax = searchFuncs[dictMode][(int)searchMethod];
 U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;

 const int isDMS = dictMode == ZSTD_dictMatchState;
@@ -915,9 +1582,7 @@ ZSTD_compressBlock_lazy_generic(

 assert(searchMax != NULL);

- DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u)", (U32)dictMode);
-
- /* init */
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
 ip += (dictAndPrefixLength == 0);
 if (dictMode == ZSTD_noDict) {
 U32 const curr = (U32)(ip - base);
@@ -933,6 +1598,12 @@ ZSTD_compressBlock_lazy_generic(
 assert(offset_2 <= dictAndPrefixLength);
 }

+ if (searchMethod == search_rowHash) {
+ ZSTD_row_fillHashCache(ms, base, rowLog,
+ MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
+ ms->nextToUpdate, ilimit);
+ }
+
 /* Match Loop */
 #if defined(__GNUC__) && defined(__x86_64__)
 /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the
@@ -1198,6 +1869,70 @@ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
 }

+ /* Row-based matchfinder */
+ size_t ZSTD_compressBlock_lazy2_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
+ }
+
+ size_t ZSTD_compressBlock_lazy_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
+ }
+
+ size_t ZSTD_compressBlock_greedy_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
+ }
+
+ size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
+ }
+
+ size_t ZSTD_compressBlock_lazy_dictMatchState_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
+ }
+
+ size_t ZSTD_compressBlock_greedy_dictMatchState_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
+ }
+
+
+ size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
+ }
+
+ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
+ }
+
+ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
+ }

 FORCE_INLINE_TEMPLATE
 size_t ZSTD_compressBlock_lazy_extDict_generic(
@@ -1210,7 +1945,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 const BYTE* ip = istart;
 const BYTE* anchor = istart;
 const BYTE* const iend = istart + srcSize;
- const BYTE* const ilimit = iend - 8;
+ const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
 const BYTE* const base = ms->window.base;
 const U32 dictLimit = ms->window.dictLimit;
 const BYTE* const prefixStart = base + dictLimit;
@@ -1218,18 +1953,28 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 const BYTE* const dictEnd = dictBase + dictLimit;
 const BYTE* const dictStart = dictBase + ms->window.lowLimit;
 const U32 windowLog = ms->cParams.windowLog;
+ const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;

 typedef size_t (*searchMax_f)(
 ZSTD_matchState_t* ms,
 const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
- searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
-
+ const searchMax_f searchFuncs[3] = {
+ ZSTD_HcFindBestMatch_extDict_selectMLS,
+ ZSTD_BtFindBestMatch_extDict_selectMLS,
+ ZSTD_RowFindBestMatch_extDict_selectRowLog
+ };
+ searchMax_f searchMax = searchFuncs[(int)searchMethod];
 U32 offset_1 = rep[0], offset_2 = rep[1];

- DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic");
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);

 /* init */
 ip += (ip == prefixStart);
+ if (searchMethod == search_rowHash) {
+ ZSTD_row_fillHashCache(ms, base, rowLog,
+ MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
+ ms->nextToUpdate, ilimit);
+ }

 /* Match Loop */
 #if defined(__GNUC__) && defined(__x86_64__)
@@ -1249,7 +1994,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 const U32 repIndex = (U32)(curr+1 - offset_1);
 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
 const BYTE* const repMatch = repBase + repIndex;
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
+ & (offset_1 < curr+1 - windowLow) ) /* note: we are searching at curr+1 */
 if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
 /* repcode detected we should take it */
 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1280,7 +2026,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 const U32 repIndex = (U32)(curr - offset_1);
 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
 const BYTE* const repMatch = repBase + repIndex;
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+ & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
 if (MEM_read32(ip) == MEM_read32(repMatch)) {
 /* repcode detected */
 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
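The rewritten repcode guards replace `repIndex > windowLow` with `offset_1 < curr - windowLow`. With `repIndex = curr - offset_1`, the two forms accept the same indices when the subtraction cannot underflow, and the new form stays false when `offset_1` exceeds `curr`, where `curr - offset_1` would wrap in unsigned arithmetic. A small standalone check of that arithmetic; the `rep_in_window` helper is hypothetical and assumes `curr >= windowLow`:

```c
#include <assert.h>
#include <stdint.h>

/* Illustrative check, not part of the diff: the test `offset < curr - windowLow`
 * accepts exactly the rep indices strictly above windowLow, and rejects offsets
 * larger than curr, where `curr - offset` would wrap around. */
static int rep_in_window(uint32_t curr, uint32_t offset, uint32_t windowLow)
{
    return offset < curr - windowLow;   /* assumes curr >= windowLow */
}

int main(void)
{
    assert( rep_in_window(100, 10, 50));   /* repIndex = 90  > 50 : valid       */
    assert(!rep_in_window(100, 60, 50));   /* repIndex = 40 <= 50 : out of window */
    assert(!rep_in_window(100, 200, 50));  /* offset > curr : subtraction would wrap */
    return 0;
}
```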
@@ -1311,7 +2058,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 const U32 repIndex = (U32)(curr - offset_1);
 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
 const BYTE* const repMatch = repBase + repIndex;
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+ & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
 if (MEM_read32(ip) == MEM_read32(repMatch)) {
 /* repcode detected */
 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1357,7 +2105,8 @@ _storeSequence:
 const U32 repIndex = repCurrent - offset_2;
 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
 const BYTE* const repMatch = repBase + repIndex;
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+ & (offset_2 < repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
 if (MEM_read32(ip) == MEM_read32(repMatch)) {
 /* repcode detected we should take it */
 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1410,3 +2159,26 @@ size_t ZSTD_compressBlock_btlazy2_extDict(
 {
 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
 }
+
+ size_t ZSTD_compressBlock_greedy_extDict_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+ {
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
+ }
+
+ size_t ZSTD_compressBlock_lazy_extDict_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+
+ {
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
+ }
+
+ size_t ZSTD_compressBlock_lazy2_extDict_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+
+ {
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
+ }