zstd-ruby 1.4.9.0 → 1.5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/dependabot.yml +8 -0
- data/README.md +1 -1
- data/ext/zstdruby/libzstd/BUCK +5 -7
- data/ext/zstdruby/libzstd/Makefile +42 -13
- data/ext/zstdruby/libzstd/README.md +8 -4
- data/ext/zstdruby/libzstd/common/bitstream.h +1 -1
- data/ext/zstdruby/libzstd/common/compiler.h +1 -1
- data/ext/zstdruby/libzstd/common/cpu.h +1 -1
- data/ext/zstdruby/libzstd/common/debug.c +1 -1
- data/ext/zstdruby/libzstd/common/debug.h +1 -1
- data/ext/zstdruby/libzstd/common/entropy_common.c +1 -1
- data/ext/zstdruby/libzstd/common/error_private.c +1 -1
- data/ext/zstdruby/libzstd/common/error_private.h +3 -3
- data/ext/zstdruby/libzstd/common/fse.h +2 -2
- data/ext/zstdruby/libzstd/common/fse_decompress.c +25 -15
- data/ext/zstdruby/libzstd/common/huf.h +3 -2
- data/ext/zstdruby/libzstd/common/mem.h +3 -5
- data/ext/zstdruby/libzstd/common/pool.c +1 -1
- data/ext/zstdruby/libzstd/common/pool.h +1 -1
- data/ext/zstdruby/libzstd/common/xxhash.c +2 -4
- data/ext/zstdruby/libzstd/common/xxhash.h +1 -1
- data/ext/zstdruby/libzstd/common/zstd_common.c +1 -1
- data/ext/zstdruby/libzstd/common/zstd_deps.h +1 -1
- data/ext/zstdruby/libzstd/common/zstd_internal.h +21 -9
- data/ext/zstdruby/libzstd/common/zstd_trace.h +7 -5
- data/ext/zstdruby/libzstd/compress/fse_compress.c +1 -1
- data/ext/zstdruby/libzstd/compress/hist.c +1 -1
- data/ext/zstdruby/libzstd/compress/hist.h +1 -1
- data/ext/zstdruby/libzstd/compress/huf_compress.c +51 -28
- data/ext/zstdruby/libzstd/compress/zstd_compress.c +1373 -275
- data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +164 -21
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +2 -2
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +14 -6
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +5 -282
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +147 -46
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +3 -3
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_fast.c +4 -4
- data/ext/zstdruby/libzstd/compress/zstd_fast.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_lazy.c +914 -142
- data/ext/zstdruby/libzstd/compress/zstd_lazy.h +39 -1
- data/ext/zstdruby/libzstd/compress/zstd_ldm.c +51 -15
- data/ext/zstdruby/libzstd/compress/zstd_ldm.h +2 -1
- data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_opt.c +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +15 -6
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +5 -5
- data/ext/zstdruby/libzstd/decompress/huf_decompress.c +44 -43
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +1 -1
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +1 -1
- data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +3 -4
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +44 -36
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +1 -1
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +1 -2
- data/ext/zstdruby/libzstd/deprecated/zbuff.h +1 -1
- data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +1 -1
- data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +1 -1
- data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
- data/ext/zstdruby/libzstd/dictBuilder/cover.c +7 -6
- data/ext/zstdruby/libzstd/dictBuilder/cover.h +6 -5
- data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +7 -6
- data/ext/zstdruby/libzstd/dictBuilder/zdict.c +8 -7
- data/ext/zstdruby/libzstd/dll/example/Makefile +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v01.c +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v01.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v02.c +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v02.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v03.c +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v03.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v04.c +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v04.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v05.c +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v05.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v06.c +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v06.h +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v07.c +1 -1
- data/ext/zstdruby/libzstd/legacy/zstd_v07.h +1 -1
- data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +148 -2
- data/ext/zstdruby/libzstd/zstd.h +165 -83
- data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +1 -1
- data/lib/zstd-ruby/version.rb +1 -1
- metadata +5 -5
- data/ext/zstdruby/libzstd/common/zstd_trace.c +0 -42
data/ext/zstdruby/libzstd/compress/zstd_cwksp.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c)
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -35,6 +35,10 @@ extern "C" {
 #define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128
 #endif
 
+
+/* Set our tables and aligneds to align by 64 bytes */
+#define ZSTD_CWKSP_ALIGNMENT_BYTES 64
+
 /*-*************************************
 *  Structures
 ***************************************/
@@ -117,10 +121,11 @@ typedef enum {
  * - Tables: these are any of several different datastructures (hash tables,
  *   chain tables, binary trees) that all respect a common format: they are
  *   uint32_t arrays, all of whose values are between 0 and (nextSrc - base).
- *   Their sizes depend on the cparams.
+ *   Their sizes depend on the cparams. These tables are 64-byte aligned.
  *
  * - Aligned: these buffers are used for various purposes that require 4 byte
- *   alignment, but don't require any initialization before they're used.
+ *   alignment, but don't require any initialization before they're used. These
+ *   buffers are each aligned to 64 bytes.
  *
  * - Buffers: these buffers are used for various purposes that don't require
  *   any alignment or initialization before they're used. This means they can
@@ -133,8 +138,7 @@ typedef enum {
  *
  * 1. Objects
  * 2. Buffers
- * 3. Aligned
- * 4. Tables
+ * 3. Aligned/Tables
  *
  * Attempts to reserve objects of different types out of order will fail.
  */
@@ -187,6 +191,8 @@ MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) {
  * Since tables aren't currently redzoned, you don't need to call through this
  * to figure out how much space you need for the matchState tables. Everything
  * else is though.
+ *
+ * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_alloc_size().
  */
 MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) {
     if (size == 0)
@@ -198,30 +204,110 @@ MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) {
 #endif
 }
 
-
+/**
+ * Returns an adjusted alloc size that is the nearest larger multiple of 64 bytes.
+ * Used to determine the number of bytes required for a given "aligned".
+ */
+MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) {
+    return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNMENT_BYTES));
+}
+
+/**
+ * Returns the amount of additional space the cwksp must allocate
+ * for internal purposes (currently only alignment).
+ */
+MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) {
+    /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes
+     * to align the beginning of tables section, as well as another n_2=[0, 63] bytes
+     * to align the beginning of the aligned secion.
+     *
+     * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and
+     * aligneds being sized in multiples of 64 bytes.
+     */
+    size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES;
+    return slackSpace;
+}
+
+
+/**
+ * Return the number of additional bytes required to align a pointer to the given number of bytes.
+ * alignBytes must be a power of two.
+ */
+MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignBytes) {
+    size_t const alignBytesMask = alignBytes - 1;
+    size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask;
+    assert((alignBytes & alignBytesMask) == 0);
+    assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES);
+    return bytes;
+}
+
+/**
+ * Internal function. Do not use directly.
+ * Reserves the given number of bytes within the aligned/buffer segment of the wksp, which
+ * counts from the end of the wksp. (as opposed to the object/table segment)
+ *
+ * Returns a pointer to the beginning of that space.
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes) {
+    void* const alloc = (BYTE*)ws->allocStart - bytes;
+    void* const bottom = ws->tableEnd;
+    DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining",
+        alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
+    ZSTD_cwksp_assert_internal_consistency(ws);
+    assert(alloc >= bottom);
+    if (alloc < bottom) {
+        DEBUGLOG(4, "cwksp: alloc failed!");
+        ws->allocFailed = 1;
+        return NULL;
+    }
+    if (alloc < ws->tableValidEnd) {
+        ws->tableValidEnd = alloc;
+    }
+    ws->allocStart = alloc;
+    return alloc;
+}
+
+/**
+ * Moves the cwksp to the next phase, and does any necessary allocations.
+ * Returns a 0 on success, or zstd error
+ */
+MEM_STATIC size_t ZSTD_cwksp_internal_advance_phase(
         ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) {
     assert(phase >= ws->phase);
     if (phase > ws->phase) {
+        /* Going from allocating objects to allocating buffers */
         if (ws->phase < ZSTD_cwksp_alloc_buffers &&
                 phase >= ZSTD_cwksp_alloc_buffers) {
             ws->tableValidEnd = ws->objectEnd;
         }
+
+        /* Going from allocating buffers to allocating aligneds/tables */
         if (ws->phase < ZSTD_cwksp_alloc_aligned &&
                 phase >= ZSTD_cwksp_alloc_aligned) {
-            /*
-
-
-
-
-
-
-
-
-
+            {   /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */
+                size_t const bytesToAlign =
+                    ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES);
+                DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign);
+                ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */
+                RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign),
+                                memory_allocation, "aligned phase - alignment initial allocation failed!");
+            }
+            {   /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */
+                void* const alloc = ws->objectEnd;
+                size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES);
+                void* const end = (BYTE*)alloc + bytesToAlign;
+                DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign);
+                RETURN_ERROR_IF(end > ws->workspaceEnd, memory_allocation,
+                                "table phase - alignment initial allocation failed!");
+                ws->objectEnd = end;
+                ws->tableEnd = end;
+                ws->tableValidEnd = end;
             }
         }
         ws->phase = phase;
+        ZSTD_cwksp_assert_internal_consistency(ws);
     }
+    return 0;
 }
 
 /**
@@ -237,38 +323,25 @@ MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) {
 MEM_STATIC void* ZSTD_cwksp_reserve_internal(
         ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) {
     void* alloc;
-
-    ZSTD_cwksp_internal_advance_phase(ws, phase);
-    alloc = (BYTE *)ws->allocStart - bytes;
-
-    if (bytes == 0)
+    if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase)) || bytes == 0) {
         return NULL;
+    }
 
 #if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
     /* over-reserve space */
-
+    bytes += 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
 #endif
 
-
-        alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
-    ZSTD_cwksp_assert_internal_consistency(ws);
-    assert(alloc >= bottom);
-    if (alloc < bottom) {
-        DEBUGLOG(4, "cwksp: alloc failed!");
-        ws->allocFailed = 1;
-        return NULL;
-    }
-    if (alloc < ws->tableValidEnd) {
-        ws->tableValidEnd = alloc;
-    }
-    ws->allocStart = alloc;
+    alloc = ZSTD_cwksp_reserve_internal_buffer_space(ws, bytes);
 
 #if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
     /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on
      * either size. */
-
-
-
+    if (alloc) {
+        alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+        if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
+            __asan_unpoison_memory_region(alloc, bytes);
+        }
     }
 #endif
 
|
|
283
356
|
}
|
284
357
|
|
285
358
|
/**
|
286
|
-
* Reserves and returns memory sized on and aligned on
|
359
|
+
* Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
|
287
360
|
*/
|
288
361
|
MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) {
|
289
|
-
|
290
|
-
|
362
|
+
void* ptr = ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES),
|
363
|
+
ZSTD_cwksp_alloc_aligned);
|
364
|
+
assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
|
365
|
+
return ptr;
|
291
366
|
}
|
292
367
|
|
293
368
|
/**
|
294
|
-
* Aligned on
|
369
|
+
* Aligned on 64 bytes. These buffers have the special property that
|
295
370
|
* their values remain constrained, allowing us to re-use them without
|
296
371
|
* memset()-ing them.
|
297
372
|
*/
|
298
373
|
MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) {
|
299
374
|
const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned;
|
300
|
-
void* alloc
|
301
|
-
void* end
|
302
|
-
void* top
|
375
|
+
void* alloc;
|
376
|
+
void* end;
|
377
|
+
void* top;
|
378
|
+
|
379
|
+
if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) {
|
380
|
+
return NULL;
|
381
|
+
}
|
382
|
+
alloc = ws->tableEnd;
|
383
|
+
end = (BYTE *)alloc + bytes;
|
384
|
+
top = ws->allocStart;
|
303
385
|
|
304
386
|
DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining",
|
305
387
|
alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
|
306
388
|
assert((bytes & (sizeof(U32)-1)) == 0);
|
307
|
-
ZSTD_cwksp_internal_advance_phase(ws, phase);
|
308
389
|
ZSTD_cwksp_assert_internal_consistency(ws);
|
309
390
|
assert(end <= top);
|
310
391
|
if (end > top) {
|
@@ -320,6 +401,8 @@ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) {
     }
 #endif
 
+    assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0);
+    assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
     return alloc;
 }
 
@@ -527,6 +610,24 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
 *  Functions Checking Free Space
 ***************************************/
 
+/* ZSTD_alignmentSpaceWithinBounds() :
+ * Returns if the estimated space needed for a wksp is within an acceptable limit of the
+ * actual amount of space used.
+ */
+MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws,
+                                                        size_t const estimatedSpace, int resizedWorkspace) {
+    if (resizedWorkspace) {
+        /* Resized/newly allocated wksp should have exact bounds */
+        return ZSTD_cwksp_used(ws) == estimatedSpace;
+    } else {
+        /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes
+         * than estimatedSpace. See the comments in zstd_cwksp.h for details.
+         */
+        return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63);
+    }
+}
+
+
 MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) {
     return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd);
 }
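Editorial note (not part of the gem): ZSTD_cwksp_estimated_space_within_bounds() tolerates a ±63-byte gap when a workspace is reused, because the two alignment paddings above depend on where the previous phase happened to end. A standalone restatement of that bounds check, with caller-supplied numbers instead of a real cwksp:

```c
#include <stdio.h>

/* within_bounds() is an illustrative helper mirroring the logic of the diff above. */
static int within_bounds(size_t usedSpace, size_t estimatedSpace, int resizedWorkspace) {
    if (resizedWorkspace)
        return usedSpace == estimatedSpace;      /* fresh workspace: exact match expected */
    return usedSpace >= estimatedSpace - 63      /* reused workspace: alignment padding    */
        && usedSpace <= estimatedSpace + 63;     /* may shift the total by up to 63 bytes  */
}

int main(void) {
    printf("%d\n", within_bounds(10000, 10000, 1));  /* 1: exact */
    printf("%d\n", within_bounds(10040, 10000, 0));  /* 1: within +63 */
    printf("%d\n", within_bounds(10100, 10000, 0));  /* 0: off by more than 63 */
    return 0;
}
```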
data/ext/zstdruby/libzstd/compress/zstd_double_fast.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c)
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -409,7 +409,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
         hashSmall[hSmall] = hashLong[hLong] = curr;   /* update hash table */
 
         if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */
-            & (
+            & (offset_1 < curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */
           && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
             const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
             mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
@@ -477,7 +477,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
             U32 const repIndex2 = current2 - offset_2;
             const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
             if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3)   /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */
-                & (
+                & (offset_2 < current2 - dictStartIndex))
               && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
                 const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
                 size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
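Editorial note (not part of the gem): the repcode checks above lean on the "intentional underflow/overflow" comparison. A standalone C sketch of that test follows; the index values are made up for illustration. A repIndex in the last three positions before prefixStartIndex is rejected, because a 4-byte read there would run past the end of the dictionary segment; indices deep inside the dictionary or inside the prefix pass.

```c
#include <stdio.h>
typedef unsigned int U32;

int main(void) {
    U32 const prefixStartIndex = 1000;  /* hypothetical boundary between extDict and prefix */
    U32 const samples[] = { 500  /* deep in dict  */, 998  /* straddles boundary */,
                            999  /* straddles boundary */, 1200 /* inside prefix  */ };
    for (unsigned i = 0; i < 4; ++i) {
        U32 const repIndex = samples[i];
        /* Same one-compare guard as in the hunks above: values just below
         * prefixStartIndex give a small difference and fail; values at or above
         * it wrap around to a huge unsigned number and pass. */
        int const usable = ((U32)((prefixStartIndex - 1) - repIndex) >= 3);
        printf("repIndex=%u -> %s\n", repIndex, usable ? "check passes" : "rejected");
    }
    return 0;
}
```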
data/ext/zstdruby/libzstd/compress/zstd_fast.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c)
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -416,9 +416,9 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
         const BYTE* const repMatch = repBase + repIndex;
         hashTable[h] = curr;   /* update hash table */
         DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr);
-        assert(offset_1 <= curr +1);      /* check repIndex */
 
-        if ( (((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */
+        if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */
+             & (offset_1 < curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */
           && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
             const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
             size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4;
@@ -453,7 +453,7 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
             U32 const current2 = (U32)(ip-base);
             U32 const repIndex2 = current2 - offset_2;
             const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
-            if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (
+            if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 < curr - dictStartIndex))  /* intentional overflow */
               && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
                 const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
                 size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
data/ext/zstdruby/libzstd/compress/zstd_lazy.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c)
+ * Copyright (c) Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -438,43 +438,9 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
     }
 }
 
-
-
-/* *********************************
-*  Hash Chain
+/***********************************
+* Dedicated dict search
 ***********************************/
-#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & (mask)]
-
-/* Update chains up to ip (excluded)
-   Assumption : always within prefix (i.e. not within extDict) */
-FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
-                        ZSTD_matchState_t* ms,
-                        const ZSTD_compressionParameters* const cParams,
-                        const BYTE* ip, U32 const mls)
-{
-    U32* const hashTable  = ms->hashTable;
-    const U32 hashLog = cParams->hashLog;
-    U32* const chainTable = ms->chainTable;
-    const U32 chainMask = (1 << cParams->chainLog) - 1;
-    const BYTE* const base = ms->window.base;
-    const U32 target = (U32)(ip - base);
-    U32 idx = ms->nextToUpdate;
-
-    while(idx < target) { /* catch up */
-        size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
-        NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
-        hashTable[h] = idx;
-        idx++;
-    }
-
-    ms->nextToUpdate = target;
-    return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
-}
-
-U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
-    const ZSTD_compressionParameters* const cParams = &ms->cParams;
-    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
-}
 
 void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
 {
@@ -500,11 +466,10 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
     U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
     U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
     U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
-
     U32 hashIdx;
 
     assert(ms->cParams.chainLog <= 24);
-    assert(ms->cParams.hashLog
+    assert(ms->cParams.hashLog > ms->cParams.chainLog);
     assert(idx != 0);
     assert(tmpMinChain <= minChain);
 
@@ -535,7 +500,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
         if (count == cacheSize) {
             for (count = 0; count < chainLimit;) {
                 if (i < minChain) {
-                    if (!i || countBeyondMinChain
+                    if (!i || ++countBeyondMinChain > cacheSize) {
                         /* only allow pulling `cacheSize` number of entries
                          * into the cache or chainTable beyond `minChain`,
                          * to replace the entries pulled out of the
@@ -591,6 +556,139 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
     ms->nextToUpdate = target;
 }
 
+/* Returns the longest match length found in the dedicated dict search structure.
+ * If none are longer than the argument ml, then ml will be returned.
+ */
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
+                                            const ZSTD_matchState_t* const dms,
+                                            const BYTE* const ip, const BYTE* const iLimit,
+                                            const BYTE* const prefixStart, const U32 curr,
+                                            const U32 dictLimit, const size_t ddsIdx) {
+    const U32 ddsLowestIndex  = dms->window.dictLimit;
+    const BYTE* const ddsBase = dms->window.base;
+    const BYTE* const ddsEnd  = dms->window.nextSrc;
+    const U32 ddsSize         = (U32)(ddsEnd - ddsBase);
+    const U32 ddsIndexDelta   = dictLimit - ddsSize;
+    const U32 bucketSize      = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
+    const U32 bucketLimit     = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
+    U32 ddsAttempt;
+    U32 matchIndex;
+
+    for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
+        PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
+    }
+
+    {
+        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+        U32 const chainIndex = chainPackedPointer >> 8;
+
+        PREFETCH_L1(&dms->chainTable[chainIndex]);
+    }
+
+    for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
+        size_t currentMl=0;
+        const BYTE* match;
+        matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
+        match = ddsBase + matchIndex;
+
+        if (!matchIndex) {
+            return ml;
+        }
+
+        /* guaranteed by table construction */
+        (void)ddsLowestIndex;
+        assert(matchIndex >= ddsLowestIndex);
+        assert(match+4 <= ddsEnd);
+        if (MEM_read32(match) == MEM_read32(ip)) {
+            /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+            currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+        }
+
+        /* save best solution */
+        if (currentMl > ml) {
+            ml = currentMl;
+            *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+            if (ip+currentMl == iLimit) {
+                /* best possible, avoids read overflow on next attempt */
+                return ml;
+            }
+        }
+    }
+
+    {
+        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+        U32 chainIndex = chainPackedPointer >> 8;
+        U32 const chainLength = chainPackedPointer & 0xFF;
+        U32 const chainAttempts = nbAttempts - ddsAttempt;
+        U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
+        U32 chainAttempt;
+
+        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
+            PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
+        }
+
+        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
+            size_t currentMl=0;
+            const BYTE* match;
+            matchIndex = dms->chainTable[chainIndex];
+            match = ddsBase + matchIndex;
+
+            /* guaranteed by table construction */
+            assert(matchIndex >= ddsLowestIndex);
+            assert(match+4 <= ddsEnd);
+            if (MEM_read32(match) == MEM_read32(ip)) {
+                /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+            }
+
+            /* save best solution */
+            if (currentMl > ml) {
+                ml = currentMl;
+                *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+            }
+        }
+    }
+    return ml;
+}
+
+
+/* *********************************
+*  Hash Chain
+***********************************/
+#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & (mask)]
+
+/* Update chains up to ip (excluded)
+   Assumption : always within prefix (i.e. not within extDict) */
+FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
+                        ZSTD_matchState_t* ms,
+                        const ZSTD_compressionParameters* const cParams,
+                        const BYTE* ip, U32 const mls)
+{
+    U32* const hashTable  = ms->hashTable;
+    const U32 hashLog = cParams->hashLog;
+    U32* const chainTable = ms->chainTable;
+    const U32 chainMask = (1 << cParams->chainLog) - 1;
+    const BYTE* const base = ms->window.base;
+    const U32 target = (U32)(ip - base);
+    U32 idx = ms->nextToUpdate;
+
+    while(idx < target) { /* catch up */
+        size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
+        NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
+        hashTable[h] = idx;
+        idx++;
+    }
+
+    ms->nextToUpdate = target;
+    return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
+}
+
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
+}
 
 /* inlining is important to hardwire a hot branch (template emulation) */
 FORCE_INLINE_TEMPLATE
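Editorial note (not part of the gem): in the dedicated dict search above, the last slot of each hash bucket holds a packed chain pointer, unpacked as `chainIndex = chainPackedPointer >> 8` and `chainLength = chainPackedPointer & 0xFF`. A small standalone sketch of that layout follows; pack_chain() is an illustrative helper, not zstd API.

```c
#include <assert.h>
typedef unsigned int U32;

/* Upper 24 bits: index of the first chain entry; low 8 bits: chain length. */
static U32 pack_chain(U32 chainIndex, U32 chainLength) {
    assert(chainLength <= 0xFF);              /* length must fit in one byte */
    assert(chainIndex <= (0xFFFFFFFFu >> 8)); /* index must fit in 24 bits */
    return (chainIndex << 8) | chainLength;
}

int main(void) {
    U32 const packed = pack_chain(123456, 37);
    U32 const chainIndex  = packed >> 8;      /* same unpacking as the diff above */
    U32 const chainLength = packed & 0xFF;
    assert(chainIndex == 123456 && chainLength == 37);
    return 0;
}
```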
@@ -661,90 +759,8 @@ size_t ZSTD_HcFindBestMatch_generic (
     }
 
     if (dictMode == ZSTD_dedicatedDictSearch) {
-
-
-        const BYTE* const ddsEnd  = dms->window.nextSrc;
-        const U32 ddsSize         = (U32)(ddsEnd - ddsBase);
-        const U32 ddsIndexDelta   = dictLimit - ddsSize;
-        const U32 bucketSize      = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
-        const U32 bucketLimit     = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
-        U32 ddsAttempt;
-
-        for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
-            PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
-        }
-
-        {
-            U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
-            U32 const chainIndex = chainPackedPointer >> 8;
-
-            PREFETCH_L1(&dms->chainTable[chainIndex]);
-        }
-
-        for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
-            size_t currentMl=0;
-            const BYTE* match;
-            matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
-            match = ddsBase + matchIndex;
-
-            if (!matchIndex) {
-                return ml;
-            }
-
-            /* guaranteed by table construction */
-            (void)ddsLowestIndex;
-            assert(matchIndex >= ddsLowestIndex);
-            assert(match+4 <= ddsEnd);
-            if (MEM_read32(match) == MEM_read32(ip)) {
-                /* assumption : matchIndex <= dictLimit-4 (by table construction) */
-                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
-            }
-
-            /* save best solution */
-            if (currentMl > ml) {
-                ml = currentMl;
-                *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
-                if (ip+currentMl == iLimit) {
-                    /* best possible, avoids read overflow on next attempt */
-                    return ml;
-                }
-            }
-        }
-
-        {
-            U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
-            U32 chainIndex = chainPackedPointer >> 8;
-            U32 const chainLength = chainPackedPointer & 0xFF;
-            U32 const chainAttempts = nbAttempts - ddsAttempt;
-            U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
-            U32 chainAttempt;
-
-            for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
-                PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
-            }
-
-            for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
-                size_t currentMl=0;
-                const BYTE* match;
-                matchIndex = dms->chainTable[chainIndex];
-                match = ddsBase + matchIndex;
-
-                /* guaranteed by table construction */
-                assert(matchIndex >= ddsLowestIndex);
-                assert(match+4 <= ddsEnd);
-                if (MEM_read32(match) == MEM_read32(ip)) {
-                    /* assumption : matchIndex <= dictLimit-4 (by table construction) */
-                    currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
-                }
-
-                /* save best solution */
-                if (currentMl > ml) {
-                    ml = currentMl;
-                    *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
-                    if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
-                }
-            }
-        }
+        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
+                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
     } else if (dictMode == ZSTD_dictMatchState) {
         const U32* const dmsChainTable = dms->chainTable;
         const U32 dmsChainSize         = (1 << dms->cParams.chainLog);
@@ -845,11 +861,657 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
     }
 }
 
+/* *********************************
+* (SIMD) Row-based matchfinder
+***********************************/
+/* Constants for row-based hash */
+#define ZSTD_ROW_HASH_TAG_OFFSET 1    /* byte offset of hashes in the match state's tagTable from the beginning of a row */
+#define ZSTD_ROW_HASH_TAG_BITS 8      /* nb bits to use for the tag */
+#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
+
+#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
+
+typedef U32 ZSTD_VecMask;   /* Clarifies when we are interacting with a U32 representing a mask of matches */
+
+#if !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) /* SIMD SSE version */
+
+#include <emmintrin.h>
+typedef __m128i ZSTD_Vec128;
+
+/* Returns a 128-bit container with 128-bits from src */
+static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
+    return _mm_loadu_si128((ZSTD_Vec128 const*)src);
+}
+
+/* Returns a ZSTD_Vec128 with the byte "val" packed 16 times */
+static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
+    return _mm_set1_epi8((char)val);
+}
+
+/* Do byte-by-byte comparison result of x and y. Then collapse 128-bit resultant mask
+ * into a 32-bit mask that is the MSB of each byte.
+ * */
+static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
+    return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
+}
+
+typedef struct {
+    __m128i fst;
+    __m128i snd;
+} ZSTD_Vec256;
+
+static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
+    ZSTD_Vec256 v;
+    v.fst = ZSTD_Vec128_read(ptr);
+    v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
+    return v;
+}
+
+static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
+    ZSTD_Vec256 v;
+    v.fst = ZSTD_Vec128_set8(val);
+    v.snd = ZSTD_Vec128_set8(val);
+    return v;
+}
+
+static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
+    ZSTD_VecMask fstMask;
+    ZSTD_VecMask sndMask;
+    fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
+    sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
+    return fstMask | (sndMask << 16);
+}
+
|
926
|
+
|
927
|
+
#include <arm_neon.h>
|
928
|
+
typedef uint8x16_t ZSTD_Vec128;
|
929
|
+
|
930
|
+
static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
|
931
|
+
return vld1q_u8((const BYTE* const)src);
|
932
|
+
}
|
933
|
+
|
934
|
+
static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
|
935
|
+
return vdupq_n_u8(val);
|
936
|
+
}
|
937
|
+
|
938
|
+
/* Mimics '_mm_movemask_epi8()' from SSE */
|
939
|
+
static U32 ZSTD_vmovmaskq_u8(ZSTD_Vec128 val) {
|
940
|
+
/* Shift out everything but the MSB bits in each byte */
|
941
|
+
uint16x8_t highBits = vreinterpretq_u16_u8(vshrq_n_u8(val, 7));
|
942
|
+
/* Merge the even lanes together with vsra (right shift and add) */
|
943
|
+
uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(highBits, highBits, 7));
|
944
|
+
uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
|
945
|
+
uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
|
946
|
+
/* Extract the low 8 bits from each lane, merge */
|
947
|
+
return vgetq_lane_u8(paired64, 0) | ((U32)vgetq_lane_u8(paired64, 8) << 8);
|
948
|
+
}
|
949
|
+
|
950
|
+
static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
|
951
|
+
return (ZSTD_VecMask)ZSTD_vmovmaskq_u8(vceqq_u8(x, y));
|
952
|
+
}
|
953
|
+
|
954
|
+
typedef struct {
|
955
|
+
uint8x16_t fst;
|
956
|
+
uint8x16_t snd;
|
957
|
+
} ZSTD_Vec256;
|
958
|
+
|
959
|
+
static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
|
960
|
+
ZSTD_Vec256 v;
|
961
|
+
v.fst = ZSTD_Vec128_read(ptr);
|
962
|
+
v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
|
963
|
+
return v;
|
964
|
+
}
|
965
|
+
|
966
|
+
static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
|
967
|
+
ZSTD_Vec256 v;
|
968
|
+
v.fst = ZSTD_Vec128_set8(val);
|
969
|
+
v.snd = ZSTD_Vec128_set8(val);
|
970
|
+
return v;
|
971
|
+
}
|
972
|
+
|
973
|
+
static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
|
974
|
+
ZSTD_VecMask fstMask;
|
975
|
+
ZSTD_VecMask sndMask;
|
976
|
+
fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
|
977
|
+
sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
|
978
|
+
return fstMask | (sndMask << 16);
|
979
|
+
}
|
980
|
+
|
981
|
+
#else /* Scalar fallback version */
|
982
|
+
|
983
|
+
#define VEC128_NB_SIZE_T (16 / sizeof(size_t))
|
984
|
+
typedef struct {
|
985
|
+
size_t vec[VEC128_NB_SIZE_T];
|
986
|
+
} ZSTD_Vec128;
|
987
|
+
|
988
|
+
static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
|
989
|
+
ZSTD_Vec128 ret;
|
990
|
+
ZSTD_memcpy(ret.vec, src, VEC128_NB_SIZE_T*sizeof(size_t));
|
991
|
+
return ret;
|
992
|
+
}
|
993
|
+
|
994
|
+
static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
|
995
|
+
ZSTD_Vec128 ret = { {0} };
|
996
|
+
int startBit = sizeof(size_t) * 8 - 8;
|
997
|
+
for (;startBit >= 0; startBit -= 8) {
|
998
|
+
unsigned j = 0;
|
999
|
+
for (;j < VEC128_NB_SIZE_T; ++j) {
|
1000
|
+
ret.vec[j] |= ((size_t)val << startBit);
|
1001
|
+
}
|
1002
|
+
}
|
1003
|
+
return ret;
|
1004
|
+
}
|
1005
|
+
|
1006
|
+
/* Compare x to y, byte by byte, generating a "matches" bitfield */
|
1007
|
+
static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
|
1008
|
+
ZSTD_VecMask res = 0;
|
1009
|
+
unsigned i = 0;
|
1010
|
+
unsigned l = 0;
|
1011
|
+
for (; i < VEC128_NB_SIZE_T; ++i) {
|
1012
|
+
const size_t cmp1 = x.vec[i];
|
1013
|
+
const size_t cmp2 = y.vec[i];
|
1014
|
+
unsigned j = 0;
|
1015
|
+
for (; j < sizeof(size_t); ++j, ++l) {
|
1016
|
+
if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
|
1017
|
+
res |= ((U32)1 << (j+i*sizeof(size_t)));
|
1018
|
+
}
|
1019
|
+
}
|
1020
|
+
}
|
1021
|
+
return res;
|
1022
|
+
}
|
1023
|
+
|
1024
|
+
#define VEC256_NB_SIZE_T 2*VEC128_NB_SIZE_T
|
1025
|
+
typedef struct {
|
1026
|
+
size_t vec[VEC256_NB_SIZE_T];
|
1027
|
+
} ZSTD_Vec256;
|
1028
|
+
|
1029
|
+
static ZSTD_Vec256 ZSTD_Vec256_read(const void* const src) {
|
1030
|
+
ZSTD_Vec256 ret;
|
1031
|
+
ZSTD_memcpy(ret.vec, src, VEC256_NB_SIZE_T*sizeof(size_t));
|
1032
|
+
return ret;
|
1033
|
+
}
|
1034
|
+
|
1035
|
+
static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
|
1036
|
+
ZSTD_Vec256 ret = { {0} };
|
1037
|
+
int startBit = sizeof(size_t) * 8 - 8;
|
1038
|
+
for (;startBit >= 0; startBit -= 8) {
|
1039
|
+
unsigned j = 0;
|
1040
|
+
for (;j < VEC256_NB_SIZE_T; ++j) {
|
1041
|
+
ret.vec[j] |= ((size_t)val << startBit);
|
1042
|
+
}
|
1043
|
+
}
|
1044
|
+
return ret;
|
1045
|
+
}
|
1046
|
+
|
1047
|
+
/* Compare x to y, byte by byte, generating a "matches" bitfield */
|
1048
|
+
static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
|
1049
|
+
ZSTD_VecMask res = 0;
|
1050
|
+
unsigned i = 0;
|
1051
|
+
unsigned l = 0;
|
1052
|
+
for (; i < VEC256_NB_SIZE_T; ++i) {
|
1053
|
+
const size_t cmp1 = x.vec[i];
|
1054
|
+
const size_t cmp2 = y.vec[i];
|
1055
|
+
unsigned j = 0;
|
1056
|
+
for (; j < sizeof(size_t); ++j, ++l) {
|
1057
|
+
if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
|
1058
|
+
res |= ((U32)1 << (j+i*sizeof(size_t)));
|
1059
|
+
}
|
1060
|
+
}
|
1061
|
+
}
|
1062
|
+
return res;
|
1063
|
+
}
|
1064
|
+
|
1065
|
+
+#endif /* !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) */
+
+/* ZSTD_VecMask_next():
+ * Starting from the LSB, returns the idx of the next non-zero bit.
+ * Basically counting the nb of trailing zeroes.
+ */
+static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r=0;
+    return _BitScanForward(&r, val) ? (U32)r : 0;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)
+    return (U32)__builtin_ctz(val);
+#   else
+    /* Software ctz version: http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup */
+    static const U32 multiplyDeBruijnBitPosition[32] =
+    {
+        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+    };
+    return multiplyDeBruijnBitPosition[((U32)((v & -(int)v) * 0x077CB531U)) >> 27];
+#   endif
+}
+
+/* ZSTD_VecMask_rotateRight():
+ * Rotates a bitfield to the right by "rotation" bits.
+ * If the rotation is greater than totalBits, the returned mask is 0.
+ */
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_VecMask_rotateRight(ZSTD_VecMask mask, U32 const rotation, U32 const totalBits) {
+    if (rotation == 0)
+        return mask;
+    switch (totalBits) {
+        default:
+            assert(0);
+        case 16:
+            return (mask >> rotation) | (U16)(mask << (16 - rotation));
+        case 32:
+            return (mask >> rotation) | (U32)(mask << (32 - rotation));
+    }
+}
+
+/* ZSTD_row_nextIndex():
+ * Returns the next index to insert at within a tagTable row, and updates the "head"
+ * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
+ */
+FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
+    U32 const next = (*tagRow - 1) & rowMask;
+    *tagRow = (BYTE)next;
+    return next;
+}
+
+/* ZSTD_isAligned():
+ * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
+ */
+MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
+    assert((align & (align - 1)) == 0);
+    return (((size_t)ptr) & (align - 1)) == 0;
+}
+
+/* ZSTD_row_prefetch():
+ * Performs prefetching for the hashTable and tagTable at a given row.
+ */
+FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
+    PREFETCH_L1(hashTable + relRow);
+    if (rowLog == 5) {
+        PREFETCH_L1(hashTable + relRow + 16);
+    }
+    PREFETCH_L1(tagTable + relRow);
+    assert(rowLog == 4 || rowLog == 5);
+    assert(ZSTD_isAligned(hashTable + relRow, 64));                 /* prefetched hash row always 64-byte aligned */
+    assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on a multiple of 32 or 64 bytes */
+}
+
+/* ZSTD_row_fillHashCache():
+ * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
+ * but not beyond iLimit.
+ */
+static void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
+                                   U32 const rowLog, U32 const mls,
+                                   U32 idx, const BYTE* const iLimit)
+{
+    U32 const* const hashTable = ms->hashTable;
+    U16 const* const tagTable = ms->tagTable;
+    U32 const hashLog = ms->rowHashLog;
+    U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
+    U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
+
+    for (; idx < lim; ++idx) {
+        U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+        U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+        ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
+        ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
+    }
+
+    DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
+                                                     ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
+                                                     ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
+}
+
+/* ZSTD_row_nextCachedHash():
+ * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
+ * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
+ */
+FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
+                                                  U16 const* tagTable, BYTE const* base,
+                                                  U32 idx, U32 const hashLog,
+                                                  U32 const rowLog, U32 const mls)
+{
+    U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+    U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+    ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
+    { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
+      cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
+      return hash;
+    }
+}
+
+/* ZSTD_row_update_internal():
+ * Inserts the byte at ip into the appropriate position in the hash table.
+ * Determines the relative row, and the position within the {16, 32} entry row to insert at.
+ */
+FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
+                                                    U32 const mls, U32 const rowLog,
+                                                    U32 const rowMask, U32 const useCache)
+{
+    U32* const hashTable = ms->hashTable;
+    U16* const tagTable = ms->tagTable;
+    U32 const hashLog = ms->rowHashLog;
+    const BYTE* const base = ms->window.base;
+    const U32 target = (U32)(ip - base);
+    U32 idx = ms->nextToUpdate;
+
+    DEBUGLOG(6, "ZSTD_row_update_internal(): nextToUpdate=%u, current=%u", idx, target);
+    for (; idx < target; ++idx) {
+        U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, idx, hashLog, rowLog, mls)
+                                  : (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+        U32* const row = hashTable + relRow;
+        BYTE* tagRow = (BYTE*)(tagTable + relRow);  /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
+                                                       Explicit cast allows us to get exact desired position within each row */
+        U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
+
+        assert(hash == ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
+        ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
+        row[pos] = idx;
+    }
+    ms->nextToUpdate = target;
+}
+
+/* ZSTD_row_update():
+ * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
+ * processing.
+ */
+void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
+    const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
+    const U32 rowMask = (1u << rowLog) - 1;
+    const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
+
+    DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
+    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
+}
+
+/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
+ * the hash at the nth position in a row of the tagTable.
+ */
+FORCE_INLINE_TEMPLATE
+ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) {
+    ZSTD_VecMask matches = 0;
+    if (rowEntries == 16) {
+        ZSTD_Vec128 hashes       = ZSTD_Vec128_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
+        ZSTD_Vec128 expandedTags = ZSTD_Vec128_set8(tag);
+        matches = ZSTD_Vec128_cmpMask8(hashes, expandedTags);
+    } else if (rowEntries == 32) {
+        ZSTD_Vec256 hashes       = ZSTD_Vec256_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
+        ZSTD_Vec256 expandedTags = ZSTD_Vec256_set8(tag);
+        matches = ZSTD_Vec256_cmpMask8(hashes, expandedTags);
+    } else {
+        assert(0);
+    }
+    /* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
+       to match up with the actual layout of the entries within the hashTable */
+    return ZSTD_VecMask_rotateRight(matches, head, rowEntries);
+}
+
+/* The high-level approach of the SIMD row based match finder is as follows:
+ * - Figure out where to insert the new entry:
+ *      - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
+ *      - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
+ *        which row to insert into.
+ *      - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
+ *        be considered as a circular buffer with a "head" index that resides in the tagTable.
+ *      - Also insert the "tag" into the equivalent row and position in the tagTable.
+ *          - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
+ *                  The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
+ *                  for alignment/performance reasons, leaving some bytes unused.
+ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
+ *   generate a bitfield that we can cycle through to check the collisions in the hash table.
+ * - Pick the longest match.
+ */
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_RowFindBestMatch_generic (
+                        ZSTD_matchState_t* ms,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        size_t* offsetPtr,
+                        const U32 mls, const ZSTD_dictMode_e dictMode,
+                        const U32 rowLog)
+{
+    U32* const hashTable = ms->hashTable;
+    U16* const tagTable = ms->tagTable;
+    U32* const hashCache = ms->hashCache;
+    const U32 hashLog = ms->rowHashLog;
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const U32 dictLimit = ms->window.dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const U32 curr = (U32)(ip-base);
+    const U32 maxDistance = 1U << cParams->windowLog;
+    const U32 lowestValid = ms->window.lowLimit;
+    const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
+    const U32 isDictionary = (ms->loadedDictEnd != 0);
+    const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
+    const U32 rowEntries = (1U << rowLog);
+    const U32 rowMask = rowEntries - 1;
+    const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
+    U32 nbAttempts = 1U << cappedSearchLog;
+    size_t ml=4-1;
+
+    /* DMS/DDS variables that may be referenced laster */
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    size_t ddsIdx;
+    U32 ddsExtraAttempts; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
+    U32 dmsTag;
+    U32* dmsRow;
+    BYTE* dmsTagRow;
+
+    if (dictMode == ZSTD_dedicatedDictSearch) {
+        const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
+        {   /* Prefetch DDS hashtable entry */
+            ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
+            PREFETCH_L1(&dms->hashTable[ddsIdx]);
+        }
+        ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
+    }
+
+    if (dictMode == ZSTD_dictMatchState) {
+        /* Prefetch DMS rows */
+        U32* const dmsHashTable = dms->hashTable;
+        U16* const dmsTagTable = dms->tagTable;
+        U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+        U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+        dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
+        dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
+        dmsRow = dmsHashTable + dmsRelRow;
+        ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
+    }
+
+    /* Update the hashTable and tagTable up to (but not including) ip */
+    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+    {   /* Get the hash for ip, compute the appropriate row */
+        U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
+        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+        U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
+        U32* const row = hashTable + relRow;
+        BYTE* tagRow = (BYTE*)(tagTable + relRow);
+        U32 const head = *tagRow & rowMask;
+        U32 matchBuffer[32 /* maximum nb entries per row */];
+        size_t numMatches = 0;
+        size_t currMatch = 0;
+        ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
+
+        /* Cycle through the matches and prefetch */
+        for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
+            U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+            U32 const matchIndex = row[matchPos];
+            assert(numMatches < rowEntries);
+            if (matchIndex < lowLimit)
+                break;
+            if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+                PREFETCH_L1(base + matchIndex);
+            } else {
+                PREFETCH_L1(dictBase + matchIndex);
+            }
+            matchBuffer[numMatches++] = matchIndex;
+        }
+
+        /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
+           in ZSTD_row_update_internal() at the next search. */
+        {
+            U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
+            tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
+            row[pos] = ms->nextToUpdate++;
+        }
+
+        /* Return the longest match */
+        for (; currMatch < numMatches; ++currMatch) {
+            U32 const matchIndex = matchBuffer[currMatch];
+            size_t currentMl=0;
+            assert(matchIndex < curr);
+            assert(matchIndex >= lowLimit);
+
+            if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+                const BYTE* const match = base + matchIndex;
+                assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
+                if (match[ml] == ip[ml])   /* potentially better */
+                    currentMl = ZSTD_count(ip, match, iLimit);
+            } else {
+                const BYTE* const match = dictBase + matchIndex;
+                assert(match+4 <= dictEnd);
+                if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                    currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
+            }
+
+            /* Save best solution */
+            if (currentMl > ml) {
+                ml = currentMl;
+                *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
+                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+            }
+        }
+    }
+
+    if (dictMode == ZSTD_dedicatedDictSearch) {
+        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
+                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
+    } else if (dictMode == ZSTD_dictMatchState) {
+        /* TODO: Measure and potentially add prefetching to DMS */
+        const U32 dmsLowestIndex       = dms->window.dictLimit;
+        const BYTE* const dmsBase      = dms->window.base;
+        const BYTE* const dmsEnd       = dms->window.nextSrc;
+        const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
+        const U32 dmsIndexDelta        = dictLimit - dmsSize;
+
+        {   U32 const head = *dmsTagRow & rowMask;
+            U32 matchBuffer[32 /* maximum nb row entries */];
+            size_t numMatches = 0;
+            size_t currMatch = 0;
+            ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
+
+            for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
+                U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+                U32 const matchIndex = dmsRow[matchPos];
+                if (matchIndex < dmsLowestIndex)
+                    break;
PREFETCH_L1(dmsBase + matchIndex);
|
1411
|
+
matchBuffer[numMatches++] = matchIndex;
|
1412
|
+
}
|
1413
|
+
|
1414
|
+
/* Return the longest match */
|
1415
|
+
for (; currMatch < numMatches; ++currMatch) {
|
1416
|
+
U32 const matchIndex = matchBuffer[currMatch];
|
1417
|
+
size_t currentMl=0;
|
1418
|
+
assert(matchIndex >= dmsLowestIndex);
|
1419
|
+
assert(matchIndex < curr);
|
1420
|
+
|
1421
|
+
{ const BYTE* const match = dmsBase + matchIndex;
|
1422
|
+
assert(match+4 <= dmsEnd);
|
1423
|
+
if (MEM_read32(match) == MEM_read32(ip))
|
1424
|
+
currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
|
1425
|
+
}
|
1426
|
+
|
1427
|
+
if (currentMl > ml) {
|
1428
|
+
ml = currentMl;
|
1429
|
+
*offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
|
1430
|
+
if (ip+currentMl == iLimit) break;
|
1431
|
+
}
|
1432
|
+
}
|
1433
|
+
}
|
1434
|
+
}
|
1435
|
+
return ml;
|
1436
|
+
}
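Illustrative aside, not part of the upstream diff: the candidate loops above consume `matches` one set bit at a time. ZSTD_VecMask_next() plays the role of "index of the lowest set bit", and `matches &= (matches - 1)` clears that bit, so each iteration visits exactly one matching tag in the row and prefetches its bytes before lengths are measured in the second pass. A minimal standalone sketch of that bit-iteration idiom, with invented names and values:

```c
#include <stdint.h>
#include <stdio.h>

/* Portable stand-in for a count-trailing-zeros intrinsic; assumes mask != 0. */
static unsigned lowest_set_bit(uint64_t mask)
{
    unsigned n = 0;
    while ((mask & 1) == 0) { mask >>= 1; n++; }
    return n;
}

int main(void)
{
    uint64_t matches = 0x29;          /* bits 0, 3 and 5 set: three candidates */
    unsigned head = 2, rowMask = 31;  /* hypothetical row of 32 entries */
    for (; matches > 0; matches &= (matches - 1)) {   /* clears the lowest set bit */
        unsigned const matchPos = (head + lowest_set_bit(matches)) & rowMask;
        printf("candidate at row position %u\n", matchPos);
    }
    return 0;
}
```

Splitting the work into a gather-and-prefetch pass followed by a length-measuring pass lets the memory fetches for all candidates overlap instead of stalling the search on each one.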
+
+/* Inlining is important to hardwire a hot branch (template emulation) */
+FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectMLS (
+                        ZSTD_matchState_t* ms,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        const ZSTD_dictMode_e dictMode, size_t* offsetPtr, const U32 rowLog)
+{
+    switch(ms->cParams.minMatch)
+    {
+    default : /* includes case 3 */
+    case 4 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, dictMode, rowLog);
+    case 5 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, dictMode, rowLog);
+    case 7 :
+    case 6 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, dictMode, rowLog);
+    }
+}
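Illustrative aside, not part of the upstream diff: the "template emulation" noted in the comment above means the generic search is force-inlined into a switch whose cases pass compile-time constants, so the compiler emits one specialized body per constant instead of branching on a runtime value inside the hot loop. A toy sketch of the same pattern, with invented names:

```c
#include <stddef.h>

#if defined(__GNUC__)
#  define MY_FORCE_INLINE static inline __attribute__((always_inline))
#else
#  define MY_FORCE_INLINE static inline
#endif

/* Generic worker: `stride` becomes a literal constant once inlined below. */
MY_FORCE_INLINE size_t sum_fixed_stride(const char* p, size_t n, size_t stride)
{
    size_t total = 0;
    for (size_t i = 0; i < n; i += stride)
        total += (unsigned char)p[i];
    return total;
}

/* Dispatcher: each case hardwires one specialization, like selectMLS above. */
size_t sum_select_stride(const char* p, size_t n, unsigned strideLog)
{
    switch (strideLog) {
    default:
    case 0: return sum_fixed_stride(p, n, 1);
    case 1: return sum_fixed_stride(p, n, 2);
    case 2: return sum_fixed_stride(p, n, 4);
    }
}
```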
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectRowLog (
+                        ZSTD_matchState_t* ms,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+    switch(cappedSearchLog)
+    {
+    default :
+    case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 4);
+    case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 5);
+    }
+}
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dictMatchState_selectRowLog(
+                        ZSTD_matchState_t* ms,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+    switch(cappedSearchLog)
+    {
+    default :
+    case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 4);
+    case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 5);
+    }
+}
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog(
+                        ZSTD_matchState_t* ms,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+    switch(cappedSearchLog)
+    {
+    default :
+    case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 4);
+    case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 5);
+    }
+}
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_extDict_selectRowLog (
+                        ZSTD_matchState_t* ms,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+    switch(cappedSearchLog)
+    {
+    default :
+    case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 4);
+    case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 5);
+    }
+}
+
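Illustrative aside, not part of the upstream diff: all four selectRowLog dispatchers clamp searchLog to 5 and pass the result as rowLog, which fixes the row geometry used by the generic search above: 2^rowLog tag slots per row, and at most one row's worth of attempts per search. A small standalone computation of those derived values (the input value is an example only; the `searchLog < 5 ? 4 : 5` rule is the one the lazy parsers further below apply):

```c
#include <stdio.h>

#define MY_MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
    unsigned searchLog = 7;                       /* e.g. from the compression parameters */
    unsigned rowLog = searchLog < 5 ? 4 : 5;      /* row of 16 or 32 entries */
    unsigned cappedSearchLog = MY_MIN(searchLog, rowLog);
    unsigned rowEntries = 1u << rowLog;
    unsigned rowMask = rowEntries - 1;
    unsigned nbAttempts = 1u << cappedSearchLog;  /* never more than one row's worth */
    printf("rowLog=%u entries=%u mask=0x%x attempts=%u\n",
           rowLog, rowEntries, rowMask, nbAttempts);
    return 0;
}
```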

 /* *******************************
 *  Common parser - lazy strategy
 *********************************/
-typedef enum { search_hashChain, search_binaryTree } searchMethod_e;
+typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;

 FORCE_INLINE_TEMPLATE size_t
 ZSTD_compressBlock_lazy_generic(
@@ -863,10 +1525,11 @@ ZSTD_compressBlock_lazy_generic(
     const BYTE* ip = istart;
     const BYTE* anchor = istart;
     const BYTE* const iend = istart + srcSize;
-    const BYTE* const ilimit = iend - 8;
+    const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
     const BYTE* const base = ms->window.base;
     const U32 prefixLowestIndex = ms->window.dictLimit;
     const BYTE* const prefixLowest = base + prefixLowestIndex;
+    const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;

     typedef size_t (*searchMax_f)(
                         ZSTD_matchState_t* ms,
@@ -878,26 +1541,30 @@ ZSTD_compressBlock_lazy_generic(
      * that should never occur (extDict modes go to the other implementation
      * below and there is no DDSS for binary tree search yet).
      */
-    const searchMax_f searchFuncs[4][2] = {
+    const searchMax_f searchFuncs[4][3] = {
         {
             ZSTD_HcFindBestMatch_selectMLS,
-            ZSTD_BtFindBestMatch_selectMLS
+            ZSTD_BtFindBestMatch_selectMLS,
+            ZSTD_RowFindBestMatch_selectRowLog
         },
         {
+            NULL,
             NULL,
             NULL
         },
         {
             ZSTD_HcFindBestMatch_dictMatchState_selectMLS,
-            ZSTD_BtFindBestMatch_dictMatchState_selectMLS
+            ZSTD_BtFindBestMatch_dictMatchState_selectMLS,
+            ZSTD_RowFindBestMatch_dictMatchState_selectRowLog
         },
         {
             ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS,
-            NULL
+            NULL,
+            ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog
         }
     };

-    searchMax_f const searchMax = searchFuncs[dictMode][searchMethod == search_binaryTree];
+    searchMax_f const searchMax = searchFuncs[dictMode][(int)searchMethod];
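Illustrative aside, not part of the upstream diff: the search function is now picked once per block from a 4x3 table indexed by dictMode and the integer value of searchMethod, with NULL entries for combinations that are never requested (the later assert(searchMax != NULL) guards this). A standalone sketch of that dispatch pattern, with invented names and a reduced table:

```c
#include <assert.h>
#include <stddef.h>

typedef enum { MODE_PLAIN = 0, MODE_DICT = 1 } mode_e;
typedef enum { ALGO_CHAIN = 0, ALGO_TREE = 1, ALGO_ROW = 2 } algo_e;
typedef int (*search_fn)(const char* p, size_t len);

static int chain_plain(const char* p, size_t len) { (void)p; return (int)len; }
static int row_plain  (const char* p, size_t len) { (void)p; return (int)len + 1; }

int run_search(mode_e mode, algo_e algo, const char* p, size_t len)
{
    /* NULL marks combinations that must never be requested. */
    static const search_fn table[2][3] = {
        { chain_plain, NULL /* no tree search in this sketch */, row_plain },
        { NULL,        NULL,                                     NULL      },
    };
    search_fn const fn = table[mode][(int)algo];
    assert(fn != NULL);   /* an invalid combination is a programming error */
    return fn(p, len);
}
```

Indexing a constant table keeps the per-block selection branch-free and makes the set of supported (dictMode, searchMethod) pairs explicit in one place.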
     U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;

     const int isDMS = dictMode == ZSTD_dictMatchState;
@@ -915,9 +1582,7 @@ ZSTD_compressBlock_lazy_generic(

     assert(searchMax != NULL);

-    DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u)", (U32)dictMode);
-
-    /* init */
+    DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
     ip += (dictAndPrefixLength == 0);
     if (dictMode == ZSTD_noDict) {
         U32 const curr = (U32)(ip - base);
@@ -933,6 +1598,12 @@ ZSTD_compressBlock_lazy_generic(
         assert(offset_2 <= dictAndPrefixLength);
     }

+    if (searchMethod == search_rowHash) {
+        ZSTD_row_fillHashCache(ms, base, rowLog,
+                            MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
+                            ms->nextToUpdate, ilimit);
+    }
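Illustrative aside, not part of the upstream diff: ZSTD_row_fillHashCache() primes a small ring of precomputed hashes before the match loop, and ZSTD_row_nextCachedHash() (used in the row search above) consumes one entry per position while scheduling the hash for a position ZSTD_ROW_HASH_CACHE_SIZE bytes ahead. This is also why the row-hash path pulls ilimit in by ZSTD_ROW_HASH_CACHE_SIZE: the look-ahead hashing must never read past the input. A toy sketch of the idea; the hash function, names and cache size here are invented:

```c
#include <stddef.h>
#include <stdint.h>

#define CACHE_SIZE 8   /* stand-in for ZSTD_ROW_HASH_CACHE_SIZE; real value may differ */

typedef struct { uint32_t slot[CACHE_SIZE]; } hash_cache_t;

static uint32_t toy_hash(const unsigned char* p)   /* placeholder, not zstd's hash */
{
    return (uint32_t)(p[0] * 2654435761u) ^ (uint32_t)(p[1] << 13);
}

/* Precompute hashes for positions [idx, idx + CACHE_SIZE). */
void fill_cache(hash_cache_t* hc, const unsigned char* base, size_t idx)
{
    for (size_t i = 0; i < CACHE_SIZE; ++i)
        hc->slot[(idx + i) & (CACHE_SIZE - 1)] = toy_hash(base + idx + i);
}

/* Return the precomputed hash for position idx, then refill that slot with
 * the hash position idx + CACHE_SIZE will need. The caller must guarantee
 * base + idx + CACHE_SIZE is still readable, which is what trimming ilimit
 * by the cache size achieves in the parsers above and below. */
uint32_t next_cached_hash(hash_cache_t* hc, const unsigned char* base, size_t idx)
{
    uint32_t const h = hc->slot[idx & (CACHE_SIZE - 1)];
    hc->slot[idx & (CACHE_SIZE - 1)] = toy_hash(base + idx + CACHE_SIZE);
    return h;
}
```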
+
     /* Match Loop */
 #if defined(__GNUC__) && defined(__x86_64__)
     /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the
@@ -1198,6 +1869,70 @@ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
 }

+/* Row-based matchfinder */
+size_t ZSTD_compressBlock_lazy2_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_greedy_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy_dictMatchState_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_greedy_dictMatchState_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
+}
+
+
+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
+}
+
+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
+}
+
+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
+}
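Illustrative aside, not part of the upstream diff: each `_row` wrapper above binds three things into ZSTD_compressBlock_lazy_generic: searchMethod = search_rowHash, a parse depth (2 for lazy2, 1 for lazy, 0 for greedy) and a dictMode. The depth is how many extra positions the lazy parser is willing to re-search before committing to a match. A deliberately simplified, invented sketch of that control flow; the real parser also considers repcodes and cost-based acceptance rules, not the bare "strictly longer wins" rule used here:

```c
#include <stddef.h>

typedef size_t (*toy_search_fn)(const unsigned char* ip, size_t* offset);

/* Search at ip, then optionally at ip+1 .. ip+depth, keeping a later match
 * only if it is strictly longer. Returns the chosen length; *start says how
 * many positions past ip the chosen match begins. Purely illustrative. */
size_t toy_lazy_step(const unsigned char* ip, unsigned depth,
                     toy_search_fn search, size_t* offset, size_t* start)
{
    size_t bestLen = search(ip, offset);
    *start = 0;
    for (unsigned d = 1; d <= depth && bestLen > 0; ++d) {
        size_t off2;
        size_t const len2 = search(ip + d, &off2);
        if (len2 > bestLen) {      /* simplified acceptance rule */
            bestLen = len2;
            *offset = off2;
            *start  = d;
        }
    }
    return bestLen;
}
```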

 FORCE_INLINE_TEMPLATE
 size_t ZSTD_compressBlock_lazy_extDict_generic(
@@ -1210,7 +1945,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
     const BYTE* ip = istart;
     const BYTE* anchor = istart;
     const BYTE* const iend = istart + srcSize;
-    const BYTE* const ilimit = iend - 8;
+    const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
     const BYTE* const base = ms->window.base;
     const U32 dictLimit = ms->window.dictLimit;
     const BYTE* const prefixStart = base + dictLimit;
@@ -1218,18 +1953,28 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
     const BYTE* const dictEnd = dictBase + dictLimit;
     const BYTE* const dictStart = dictBase + ms->window.lowLimit;
     const U32 windowLog = ms->cParams.windowLog;
+    const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;

     typedef size_t (*searchMax_f)(
                         ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
-    searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
-
+    const searchMax_f searchFuncs[3] = {
+        ZSTD_HcFindBestMatch_extDict_selectMLS,
+        ZSTD_BtFindBestMatch_extDict_selectMLS,
+        ZSTD_RowFindBestMatch_extDict_selectRowLog
+    };
+    searchMax_f searchMax = searchFuncs[(int)searchMethod];
     U32 offset_1 = rep[0], offset_2 = rep[1];

-    DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic");
+    DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);

     /* init */
     ip += (ip == prefixStart);
+    if (searchMethod == search_rowHash) {
+        ZSTD_row_fillHashCache(ms, base, rowLog,
+                            MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
+                            ms->nextToUpdate, ilimit);
+    }

     /* Match Loop */
 #if defined(__GNUC__) && defined(__x86_64__)
@@ -1249,7 +1994,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
         const U32 repIndex = (U32)(curr+1 - offset_1);
         const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
         const BYTE* const repMatch = repBase + repIndex;
-        if (((U32)((dictLimit-1) - repIndex) >= 3)
+        if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
+           & (offset_1 < curr+1 - windowLow) ) /* note: we are searching at curr+1 */
         if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
             /* repcode detected we should take it */
             const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1280,7 +2026,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
             const U32 repIndex = (U32)(curr - offset_1);
             const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
             const BYTE* const repMatch = repBase + repIndex;
-            if (((U32)((dictLimit-1) - repIndex) >= 3)
+            if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+                & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
             if (MEM_read32(ip) == MEM_read32(repMatch)) {
                 /* repcode detected */
                 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
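Illustrative aside, not part of the upstream diff: the rewritten repcode guards in these hunks keep the long-standing intentional-overflow test and add an explicit window check. The single comparison `(U32)((dictLimit-1) - repIndex) >= 3` is false only for the three positions just below dictLimit, where a 4-byte read would straddle the extDict/prefix boundary; any repIndex at or above dictLimit wraps around to a huge unsigned value and passes. The newly added `& (offset_1 < curr - windowLow)` clause then enforces `curr > repIndex >= windowLow`, as the comment states. A small self-contained demonstration of the overflow idiom, with arbitrary numbers:

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t const dictLimit = 100;
    for (uint32_t repIndex = 95; repIndex <= 103; ++repIndex) {
        /* false only for repIndex in {dictLimit-3, dictLimit-2, dictLimit-1} */
        int const ok = (uint32_t)((dictLimit - 1) - repIndex) >= 3;
        printf("repIndex=%3u -> %s\n", repIndex,
               ok ? "tested" : "skipped (4-byte read would span both segments)");
    }
    return 0;
}
```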
@@ -1311,7 +2058,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
             const U32 repIndex = (U32)(curr - offset_1);
             const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
             const BYTE* const repMatch = repBase + repIndex;
-            if (((U32)((dictLimit-1) - repIndex) >= 3)
+            if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+                & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
             if (MEM_read32(ip) == MEM_read32(repMatch)) {
                 /* repcode detected */
                 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1357,7 +2105,8 @@ _storeSequence:
             const U32 repIndex = repCurrent - offset_2;
             const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
             const BYTE* const repMatch = repBase + repIndex;
-            if (((U32)((dictLimit-1) - repIndex) >= 3)
+            if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+                & (offset_2 < repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
             if (MEM_read32(ip) == MEM_read32(repMatch)) {
                 /* repcode detected we should take it */
                 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1410,3 +2159,26 @@ size_t ZSTD_compressBlock_btlazy2_extDict(
 {
     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
 }
+
+size_t ZSTD_compressBlock_greedy_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
+}
+
+size_t ZSTD_compressBlock_lazy_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
+}
+
+size_t ZSTD_compressBlock_lazy2_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
+}