zstd-ruby 1.4.4.0 → 1.5.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +78 -5
- data/Rakefile +8 -2
- data/ext/zstdruby/common.h +15 -0
- data/ext/zstdruby/extconf.rb +3 -2
- data/ext/zstdruby/libzstd/common/allocations.h +55 -0
- data/ext/zstdruby/libzstd/common/bits.h +200 -0
- data/ext/zstdruby/libzstd/common/bitstream.h +74 -97
- data/ext/zstdruby/libzstd/common/compiler.h +219 -20
- data/ext/zstdruby/libzstd/common/cpu.h +1 -3
- data/ext/zstdruby/libzstd/common/debug.c +11 -31
- data/ext/zstdruby/libzstd/common/debug.h +22 -49
- data/ext/zstdruby/libzstd/common/entropy_common.c +184 -80
- data/ext/zstdruby/libzstd/common/error_private.c +11 -2
- data/ext/zstdruby/libzstd/common/error_private.h +87 -4
- data/ext/zstdruby/libzstd/common/fse.h +47 -116
- data/ext/zstdruby/libzstd/common/fse_decompress.c +127 -127
- data/ext/zstdruby/libzstd/common/huf.h +112 -197
- data/ext/zstdruby/libzstd/common/mem.h +124 -142
- data/ext/zstdruby/libzstd/common/pool.c +54 -27
- data/ext/zstdruby/libzstd/common/pool.h +11 -5
- data/ext/zstdruby/libzstd/common/portability_macros.h +156 -0
- data/ext/zstdruby/libzstd/common/threading.c +78 -22
- data/ext/zstdruby/libzstd/common/threading.h +9 -13
- data/ext/zstdruby/libzstd/common/xxhash.c +15 -873
- data/ext/zstdruby/libzstd/common/xxhash.h +5572 -191
- data/ext/zstdruby/libzstd/common/zstd_common.c +2 -37
- data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
- data/ext/zstdruby/libzstd/common/zstd_internal.h +186 -144
- data/ext/zstdruby/libzstd/common/zstd_trace.h +163 -0
- data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
- data/ext/zstdruby/libzstd/compress/fse_compress.c +99 -196
- data/ext/zstdruby/libzstd/compress/hist.c +41 -63
- data/ext/zstdruby/libzstd/compress/hist.h +13 -33
- data/ext/zstdruby/libzstd/compress/huf_compress.c +968 -331
- data/ext/zstdruby/libzstd/compress/zstd_compress.c +4120 -1191
- data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +688 -159
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +121 -40
- data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +16 -6
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +62 -35
- data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +10 -3
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +577 -0
- data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
- data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +322 -115
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +394 -154
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +4 -3
- data/ext/zstdruby/libzstd/compress/zstd_fast.c +729 -253
- data/ext/zstdruby/libzstd/compress/zstd_fast.h +4 -3
- data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1289 -247
- data/ext/zstdruby/libzstd/compress/zstd_lazy.h +61 -1
- data/ext/zstdruby/libzstd/compress/zstd_ldm.c +339 -212
- data/ext/zstdruby/libzstd/compress/zstd_ldm.h +15 -3
- data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +106 -0
- data/ext/zstdruby/libzstd/compress/zstd_opt.c +508 -282
- data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +217 -466
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +35 -114
- data/ext/zstdruby/libzstd/decompress/huf_decompress.c +1220 -572
- data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +576 -0
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +23 -19
- data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +3 -3
- data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +859 -273
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +1244 -375
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +21 -7
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +74 -11
- data/ext/zstdruby/libzstd/dictBuilder/cover.c +75 -54
- data/ext/zstdruby/libzstd/dictBuilder/cover.h +20 -9
- data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
- data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +55 -36
- data/ext/zstdruby/libzstd/dictBuilder/zdict.c +126 -110
- data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +248 -56
- data/ext/zstdruby/libzstd/zstd.h +1277 -306
- data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +29 -8
- data/ext/zstdruby/main.c +20 -0
- data/ext/zstdruby/skippable_frame.c +63 -0
- data/ext/zstdruby/streaming_compress.c +177 -0
- data/ext/zstdruby/streaming_compress.h +5 -0
- data/ext/zstdruby/streaming_decompress.c +123 -0
- data/ext/zstdruby/zstdruby.c +114 -32
- data/lib/zstd-ruby/version.rb +1 -1
- data/lib/zstd-ruby.rb +0 -1
- data/zstd-ruby.gemspec +1 -1
- metadata +24 -39
- data/.travis.yml +0 -14
- data/ext/zstdruby/libzstd/.gitignore +0 -3
- data/ext/zstdruby/libzstd/BUCK +0 -234
- data/ext/zstdruby/libzstd/Makefile +0 -289
- data/ext/zstdruby/libzstd/README.md +0 -159
- data/ext/zstdruby/libzstd/deprecated/zbuff.h +0 -214
- data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +0 -26
- data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +0 -147
- data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +0 -75
- data/ext/zstdruby/libzstd/dll/example/Makefile +0 -47
- data/ext/zstdruby/libzstd/dll/example/README.md +0 -69
- data/ext/zstdruby/libzstd/dll/example/build_package.bat +0 -20
- data/ext/zstdruby/libzstd/dll/example/fullbench-dll.sln +0 -25
- data/ext/zstdruby/libzstd/dll/example/fullbench-dll.vcxproj +0 -181
- data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +0 -415
- data/ext/zstdruby/libzstd/legacy/zstd_v01.c +0 -2152
- data/ext/zstdruby/libzstd/legacy/zstd_v01.h +0 -94
- data/ext/zstdruby/libzstd/legacy/zstd_v02.c +0 -3514
- data/ext/zstdruby/libzstd/legacy/zstd_v02.h +0 -93
- data/ext/zstdruby/libzstd/legacy/zstd_v03.c +0 -3156
- data/ext/zstdruby/libzstd/legacy/zstd_v03.h +0 -93
- data/ext/zstdruby/libzstd/legacy/zstd_v04.c +0 -3641
- data/ext/zstdruby/libzstd/legacy/zstd_v04.h +0 -142
- data/ext/zstdruby/libzstd/legacy/zstd_v05.c +0 -4046
- data/ext/zstdruby/libzstd/legacy/zstd_v05.h +0 -162
- data/ext/zstdruby/libzstd/legacy/zstd_v06.c +0 -4150
- data/ext/zstdruby/libzstd/legacy/zstd_v06.h +0 -172
- data/ext/zstdruby/libzstd/legacy/zstd_v07.c +0 -4533
- data/ext/zstdruby/libzstd/legacy/zstd_v07.h +0 -187
- data/ext/zstdruby/libzstd/libzstd.pc.in +0 -15
- data/ext/zstdruby/zstdruby.h +0 -6
@@ -1,5 +1,5 @@
|
|
1
1
|
/*
|
2
|
-
* Copyright (c)
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
3
3
|
* All rights reserved.
|
4
4
|
*
|
5
5
|
* This source code is licensed under both the BSD-style license (found in the
|
@@ -23,9 +23,13 @@
|
|
23
23
|
/* Unix Large Files support (>4GB) */
|
24
24
|
#define _FILE_OFFSET_BITS 64
|
25
25
|
#if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
|
26
|
+
# ifndef _LARGEFILE_SOURCE
|
26
27
|
# define _LARGEFILE_SOURCE
|
28
|
+
# endif
|
27
29
|
#elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
|
30
|
+
# ifndef _LARGEFILE64_SOURCE
|
28
31
|
# define _LARGEFILE64_SOURCE
|
32
|
+
# endif
|
29
33
|
#endif
|
30
34
|
|
31
35
|
|
@@ -37,17 +41,19 @@
|
|
37
41
|
#include <stdio.h> /* fprintf, fopen, ftello64 */
|
38
42
|
#include <time.h> /* clock */
|
39
43
|
|
40
|
-
#include "mem.h" /* read */
|
41
|
-
#include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
|
42
|
-
#define HUF_STATIC_LINKING_ONLY
|
43
|
-
#include "huf.h" /* HUF_buildCTable, HUF_writeCTable */
|
44
|
-
#include "zstd_internal.h" /* includes zstd.h */
|
45
|
-
#include "xxhash.h" /* XXH64 */
|
46
|
-
#include "divsufsort.h"
|
47
44
|
#ifndef ZDICT_STATIC_LINKING_ONLY
|
48
45
|
# define ZDICT_STATIC_LINKING_ONLY
|
49
46
|
#endif
|
50
|
-
|
47
|
+
|
48
|
+
#include "../common/mem.h" /* read */
|
49
|
+
#include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */
|
50
|
+
#include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */
|
51
|
+
#include "../common/zstd_internal.h" /* includes zstd.h */
|
52
|
+
#include "../common/xxhash.h" /* XXH64 */
|
53
|
+
#include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
|
54
|
+
#include "../zdict.h"
|
55
|
+
#include "divsufsort.h"
|
56
|
+
#include "../common/bits.h" /* ZSTD_NbCommonBytes */
|
51
57
|
|
52
58
|
|
53
59
|
/*-*************************************
|
@@ -61,14 +67,15 @@
|
|
61
67
|
|
62
68
|
#define NOISELENGTH 32
|
63
69
|
|
64
|
-
static const int g_compressionLevel_default = 3;
|
65
70
|
static const U32 g_selectivity_default = 9;
|
66
71
|
|
67
72
|
|
68
73
|
/*-*************************************
|
69
74
|
* Console display
|
70
75
|
***************************************/
|
76
|
+
#undef DISPLAY
|
71
77
|
#define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
|
78
|
+
#undef DISPLAYLEVEL
|
72
79
|
#define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
|
73
80
|
|
74
81
|
static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
|
@@ -99,69 +106,30 @@ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
|
|
99
106
|
return MEM_readLE32((const char*)dictBuffer + 4);
|
100
107
|
}
|
101
108
|
|
102
|
-
|
103
|
-
/*-********************************************************
|
104
|
-
* Dictionary training functions
|
105
|
-
**********************************************************/
|
106
|
-
static unsigned ZDICT_NbCommonBytes (size_t val)
|
109
|
+
size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
|
107
110
|
{
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
|
119
|
-
# endif
|
120
|
-
} else { /* 32 bits */
|
121
|
-
# if defined(_MSC_VER)
|
122
|
-
unsigned long r=0;
|
123
|
-
_BitScanForward( &r, (U32)val );
|
124
|
-
return (unsigned)(r>>3);
|
125
|
-
# elif defined(__GNUC__) && (__GNUC__ >= 3)
|
126
|
-
return (__builtin_ctz((U32)val) >> 3);
|
127
|
-
# else
|
128
|
-
static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
|
129
|
-
return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
|
130
|
-
# endif
|
111
|
+
size_t headerSize;
|
112
|
+
if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
|
113
|
+
|
114
|
+
{ ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
|
115
|
+
U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
|
116
|
+
if (!bs || !wksp) {
|
117
|
+
headerSize = ERROR(memory_allocation);
|
118
|
+
} else {
|
119
|
+
ZSTD_reset_compressedBlockState(bs);
|
120
|
+
headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize);
|
131
121
|
}
|
132
|
-
} else { /* Big Endian CPU */
|
133
|
-
if (MEM_64bits()) {
|
134
|
-
# if defined(_MSC_VER) && defined(_WIN64)
|
135
|
-
unsigned long r = 0;
|
136
|
-
_BitScanReverse64( &r, val );
|
137
|
-
return (unsigned)(r>>3);
|
138
|
-
# elif defined(__GNUC__) && (__GNUC__ >= 3)
|
139
|
-
return (__builtin_clzll(val) >> 3);
|
140
|
-
# else
|
141
|
-
unsigned r;
|
142
|
-
const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
|
143
|
-
if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
|
144
|
-
if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
|
145
|
-
r += (!val);
|
146
|
-
return r;
|
147
|
-
# endif
|
148
|
-
} else { /* 32 bits */
|
149
|
-
# if defined(_MSC_VER)
|
150
|
-
unsigned long r = 0;
|
151
|
-
_BitScanReverse( &r, (unsigned long)val );
|
152
|
-
return (unsigned)(r>>3);
|
153
|
-
# elif defined(__GNUC__) && (__GNUC__ >= 3)
|
154
|
-
return (__builtin_clz((U32)val) >> 3);
|
155
|
-
# else
|
156
|
-
unsigned r;
|
157
|
-
if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
|
158
|
-
r += (!val);
|
159
|
-
return r;
|
160
|
-
# endif
|
161
|
-
} }
|
162
|
-
}
|
163
122
|
|
123
|
+
free(bs);
|
124
|
+
free(wksp);
|
125
|
+
}
|
164
126
|
|
127
|
+
return headerSize;
|
128
|
+
}
|
129
|
+
|
130
|
+
/*-********************************************************
|
131
|
+
* Dictionary training functions
|
132
|
+
**********************************************************/
|
165
133
|
/*! ZDICT_count() :
|
166
134
|
Count the nb of common bytes between 2 pointers.
|
167
135
|
Note : this function presumes end of buffer followed by noisy guard band.
|
@@ -176,7 +144,7 @@ static size_t ZDICT_count(const void* pIn, const void* pMatch)
|
|
176
144
|
pMatch = (const char*)pMatch+sizeof(size_t);
|
177
145
|
continue;
|
178
146
|
}
|
179
|
-
pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);
|
147
|
+
pIn = (const char*)pIn+ZSTD_NbCommonBytes(diff);
|
180
148
|
return (size_t)((const char*)pIn - pStart);
|
181
149
|
}
|
182
150
|
}
|
@@ -208,7 +176,7 @@ static dictItem ZDICT_analyzePos(
|
|
208
176
|
U32 savings[LLIMIT] = {0};
|
209
177
|
const BYTE* b = (const BYTE*)buffer;
|
210
178
|
size_t maxLength = LLIMIT;
|
211
|
-
size_t pos = suffix[start];
|
179
|
+
size_t pos = (size_t)suffix[start];
|
212
180
|
U32 end = start;
|
213
181
|
dictItem solution;
|
214
182
|
|
@@ -342,7 +310,7 @@ static dictItem ZDICT_analyzePos(
|
|
342
310
|
savings[i] = savings[i-1] + (lengthList[i] * (i-3));
|
343
311
|
|
344
312
|
DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
|
345
|
-
(unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / maxLength);
|
313
|
+
(unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / (double)maxLength);
|
346
314
|
|
347
315
|
solution.pos = (U32)pos;
|
348
316
|
solution.length = (U32)maxLength;
|
@@ -352,7 +320,7 @@ static dictItem ZDICT_analyzePos(
|
|
352
320
|
{ U32 id;
|
353
321
|
for (id=start; id<end; id++) {
|
354
322
|
U32 p, pEnd, length;
|
355
|
-
U32 const testedPos = suffix[id];
|
323
|
+
U32 const testedPos = (U32)suffix[id];
|
356
324
|
if (testedPos == pos)
|
357
325
|
length = solution.length;
|
358
326
|
else {
|
@@ -404,7 +372,7 @@ static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const
|
|
404
372
|
elt = table[u];
|
405
373
|
/* sort : improve rank */
|
406
374
|
while ((u>1) && (table[u-1].savings < elt.savings))
|
407
|
-
|
375
|
+
table[u] = table[u-1], u--;
|
408
376
|
table[u] = elt;
|
409
377
|
return u;
|
410
378
|
} }
|
@@ -415,7 +383,7 @@ static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const
|
|
415
383
|
|
416
384
|
if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
|
417
385
|
/* append */
|
418
|
-
int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
|
386
|
+
int const addedLength = (int)eltEnd - (int)(table[u].pos + table[u].length);
|
419
387
|
table[u].savings += elt.length / 8; /* rough approx bonus */
|
420
388
|
if (addedLength > 0) { /* otherwise, elt fully included into existing */
|
421
389
|
table[u].length += addedLength;
|
@@ -508,6 +476,7 @@ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
|
|
508
476
|
clock_t displayClock = 0;
|
509
477
|
clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
|
510
478
|
|
479
|
+
# undef DISPLAYUPDATE
|
511
480
|
# define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
|
512
481
|
if (ZDICT_clockSpan(displayClock) > refreshRate) \
|
513
482
|
{ displayClock = clock(); DISPLAY(__VA_ARGS__); \
|
@@ -554,7 +523,7 @@ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
|
|
554
523
|
if (solution.length==0) { cursor++; continue; }
|
555
524
|
ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
|
556
525
|
cursor += solution.length;
|
557
|
-
DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
|
526
|
+
DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / (double)bufferSize * 100.0);
|
558
527
|
} }
|
559
528
|
|
560
529
|
_cleanup:
|
@@ -588,20 +557,20 @@ typedef struct
|
|
588
557
|
|
589
558
|
#define MAXREPOFFSET 1024
|
590
559
|
|
591
|
-
static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
560
|
+
static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
|
592
561
|
unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
|
593
562
|
const void* src, size_t srcSize,
|
594
563
|
U32 notificationLevel)
|
595
564
|
{
|
596
|
-
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
|
565
|
+
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
|
597
566
|
size_t cSize;
|
598
567
|
|
599
568
|
if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
|
600
|
-
{ size_t const errorCode =
|
569
|
+
{ size_t const errorCode = ZSTD_compressBegin_usingCDict_deprecated(esr.zc, esr.dict);
|
601
570
|
if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
|
602
571
|
|
603
572
|
}
|
604
|
-
cSize =
|
573
|
+
cSize = ZSTD_compressBlock_deprecated(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
|
605
574
|
if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
|
606
575
|
|
607
576
|
if (cSize) { /* if == 0; block is not compressible */
|
@@ -634,8 +603,8 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
|
634
603
|
|
635
604
|
if (nbSeq >= 2) { /* rep offsets */
|
636
605
|
const seqDef* const seq = seqStorePtr->sequencesStart;
|
637
|
-
U32 offset1 = seq[0].
|
638
|
-
U32 offset2 = seq[1].
|
606
|
+
U32 offset1 = seq[0].offBase - ZSTD_REP_NUM;
|
607
|
+
U32 offset2 = seq[1].offBase - ZSTD_REP_NUM;
|
639
608
|
if (offset1 >= MAXREPOFFSET) offset1 = 0;
|
640
609
|
if (offset2 >= MAXREPOFFSET) offset2 = 0;
|
641
610
|
repOffsets[offset1] += 3;
|
@@ -682,7 +651,7 @@ static void ZDICT_flatLit(unsigned* countLit)
|
|
682
651
|
|
683
652
|
#define OFFCODE_MAX 30 /* only applicable to first block */
|
684
653
|
static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
685
|
-
|
654
|
+
int compressionLevel,
|
686
655
|
const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
|
687
656
|
const void* dictBuffer, size_t dictBufferSize,
|
688
657
|
unsigned notificationLevel)
|
@@ -706,6 +675,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
706
675
|
size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
|
707
676
|
size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
|
708
677
|
BYTE* dstPtr = (BYTE*)dstBuffer;
|
678
|
+
U32 wksp[HUF_CTABLE_WORKSPACE_SIZE_U32];
|
709
679
|
|
710
680
|
/* init */
|
711
681
|
DEBUGLOG(4, "ZDICT_analyzeEntropy");
|
@@ -717,7 +687,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
717
687
|
memset(repOffset, 0, sizeof(repOffset));
|
718
688
|
repOffset[1] = repOffset[4] = repOffset[8] = 1;
|
719
689
|
memset(bestRepOffset, 0, sizeof(bestRepOffset));
|
720
|
-
if (compressionLevel==0) compressionLevel =
|
690
|
+
if (compressionLevel==0) compressionLevel = ZSTD_CLEVEL_DEFAULT;
|
721
691
|
params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
|
722
692
|
|
723
693
|
esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
|
@@ -731,15 +701,22 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
731
701
|
|
732
702
|
/* collect stats on all samples */
|
733
703
|
for (u=0; u<nbFiles; u++) {
|
734
|
-
ZDICT_countEStats(esr, params,
|
704
|
+
ZDICT_countEStats(esr, &params,
|
735
705
|
countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
|
736
706
|
(const char*)srcBuffer + pos, fileSizes[u],
|
737
707
|
notificationLevel);
|
738
708
|
pos += fileSizes[u];
|
739
709
|
}
|
740
710
|
|
711
|
+
if (notificationLevel >= 4) {
|
712
|
+
/* writeStats */
|
713
|
+
DISPLAYLEVEL(4, "Offset Code Frequencies : \n");
|
714
|
+
for (u=0; u<=offcodeMax; u++) {
|
715
|
+
DISPLAYLEVEL(4, "%2u :%7u \n", u, offcodeCount[u]);
|
716
|
+
} }
|
717
|
+
|
741
718
|
/* analyze, build stats, starting with literals */
|
742
|
-
{ size_t maxNbBits =
|
719
|
+
{ size_t maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp));
|
743
720
|
if (HUF_isError(maxNbBits)) {
|
744
721
|
eSize = maxNbBits;
|
745
722
|
DISPLAYLEVEL(1, " HUF_buildCTable error \n");
|
@@ -748,7 +725,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
748
725
|
if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
|
749
726
|
DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
|
750
727
|
ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
|
751
|
-
maxNbBits =
|
728
|
+
maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp));
|
752
729
|
assert(maxNbBits==9);
|
753
730
|
}
|
754
731
|
huffLog = (U32)maxNbBits;
|
@@ -762,7 +739,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
762
739
|
/* note : the result of this phase should be used to better appreciate the impact on statistics */
|
763
740
|
|
764
741
|
total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
|
765
|
-
errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
|
742
|
+
errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax, /* useLowProbCount */ 1);
|
766
743
|
if (FSE_isError(errorCode)) {
|
767
744
|
eSize = errorCode;
|
768
745
|
DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
|
@@ -771,7 +748,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
771
748
|
Offlog = (U32)errorCode;
|
772
749
|
|
773
750
|
total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
|
774
|
-
errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
|
751
|
+
errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML, /* useLowProbCount */ 1);
|
775
752
|
if (FSE_isError(errorCode)) {
|
776
753
|
eSize = errorCode;
|
777
754
|
DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
|
@@ -780,7 +757,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
780
757
|
mlLog = (U32)errorCode;
|
781
758
|
|
782
759
|
total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
|
783
|
-
errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
|
760
|
+
errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL, /* useLowProbCount */ 1);
|
784
761
|
if (FSE_isError(errorCode)) {
|
785
762
|
eSize = errorCode;
|
786
763
|
DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
|
@@ -789,7 +766,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
789
766
|
llLog = (U32)errorCode;
|
790
767
|
|
791
768
|
/* write result to buffer */
|
792
|
-
{ size_t const hhSize =
|
769
|
+
{ size_t const hhSize = HUF_writeCTable_wksp(dstPtr, maxDstSize, hufTable, 255, huffLog, wksp, sizeof(wksp));
|
793
770
|
if (HUF_isError(hhSize)) {
|
794
771
|
eSize = hhSize;
|
795
772
|
DISPLAYLEVEL(1, "HUF_writeCTable error \n");
|
@@ -844,7 +821,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
844
821
|
MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);
|
845
822
|
#else
|
846
823
|
/* at this stage, we don't use the result of "most common first offset",
|
847
|
-
|
824
|
+
* as the impact of statistics is not properly evaluated */
|
848
825
|
MEM_writeLE32(dstPtr+0, repStartValue[0]);
|
849
826
|
MEM_writeLE32(dstPtr+4, repStartValue[1]);
|
850
827
|
MEM_writeLE32(dstPtr+8, repStartValue[2]);
|
@@ -860,6 +837,17 @@ _cleanup:
|
|
860
837
|
}
|
861
838
|
|
862
839
|
|
840
|
+
/**
|
841
|
+
* @returns the maximum repcode value
|
842
|
+
*/
|
843
|
+
static U32 ZDICT_maxRep(U32 const reps[ZSTD_REP_NUM])
|
844
|
+
{
|
845
|
+
U32 maxRep = reps[0];
|
846
|
+
int r;
|
847
|
+
for (r = 1; r < ZSTD_REP_NUM; ++r)
|
848
|
+
maxRep = MAX(maxRep, reps[r]);
|
849
|
+
return maxRep;
|
850
|
+
}
|
863
851
|
|
864
852
|
size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
|
865
853
|
const void* customDictContent, size_t dictContentSize,
|
@@ -869,13 +857,15 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
|
|
869
857
|
size_t hSize;
|
870
858
|
#define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
|
871
859
|
BYTE header[HBUFFSIZE];
|
872
|
-
int const compressionLevel = (params.compressionLevel == 0) ?
|
860
|
+
int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
|
873
861
|
U32 const notificationLevel = params.notificationLevel;
|
862
|
+
/* The final dictionary content must be at least as large as the largest repcode */
|
863
|
+
size_t const minContentSize = (size_t)ZDICT_maxRep(repStartValue);
|
864
|
+
size_t paddingSize;
|
874
865
|
|
875
866
|
/* check conditions */
|
876
867
|
DEBUGLOG(4, "ZDICT_finalizeDictionary");
|
877
868
|
if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
|
878
|
-
if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
|
879
869
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
|
880
870
|
|
881
871
|
/* dictionary header */
|
@@ -899,12 +889,43 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
|
|
899
889
|
hSize += eSize;
|
900
890
|
}
|
901
891
|
|
902
|
-
/*
|
903
|
-
if (hSize + dictContentSize > dictBufferCapacity)
|
904
|
-
|
905
|
-
|
906
|
-
|
907
|
-
|
892
|
+
/* Shrink the content size if it doesn't fit in the buffer */
|
893
|
+
if (hSize + dictContentSize > dictBufferCapacity) {
|
894
|
+
dictContentSize = dictBufferCapacity - hSize;
|
895
|
+
}
|
896
|
+
|
897
|
+
/* Pad the dictionary content with zeros if it is too small */
|
898
|
+
if (dictContentSize < minContentSize) {
|
899
|
+
RETURN_ERROR_IF(hSize + minContentSize > dictBufferCapacity, dstSize_tooSmall,
|
900
|
+
"dictBufferCapacity too small to fit max repcode");
|
901
|
+
paddingSize = minContentSize - dictContentSize;
|
902
|
+
} else {
|
903
|
+
paddingSize = 0;
|
904
|
+
}
|
905
|
+
|
906
|
+
{
|
907
|
+
size_t const dictSize = hSize + paddingSize + dictContentSize;
|
908
|
+
|
909
|
+
/* The dictionary consists of the header, optional padding, and the content.
|
910
|
+
* The padding comes before the content because the "best" position in the
|
911
|
+
* dictionary is the last byte.
|
912
|
+
*/
|
913
|
+
BYTE* const outDictHeader = (BYTE*)dictBuffer;
|
914
|
+
BYTE* const outDictPadding = outDictHeader + hSize;
|
915
|
+
BYTE* const outDictContent = outDictPadding + paddingSize;
|
916
|
+
|
917
|
+
assert(dictSize <= dictBufferCapacity);
|
918
|
+
assert(outDictContent + dictContentSize == (BYTE*)dictBuffer + dictSize);
|
919
|
+
|
920
|
+
/* First copy the customDictContent into its final location.
|
921
|
+
* `customDictContent` and `dictBuffer` may overlap, so we must
|
922
|
+
* do this before any other writes into the output buffer.
|
923
|
+
* Then copy the header & padding into the output buffer.
|
924
|
+
*/
|
925
|
+
memmove(outDictContent, customDictContent, dictContentSize);
|
926
|
+
memcpy(outDictHeader, header, hSize);
|
927
|
+
memset(outDictPadding, 0, paddingSize);
|
928
|
+
|
908
929
|
return dictSize;
|
909
930
|
}
|
910
931
|
}
|
@@ -915,7 +936,7 @@ static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
|
|
915
936
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
916
937
|
ZDICT_params_t params)
|
917
938
|
{
|
918
|
-
int const compressionLevel = (params.compressionLevel == 0) ?
|
939
|
+
int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
|
919
940
|
U32 const notificationLevel = params.notificationLevel;
|
920
941
|
size_t hSize = 8;
|
921
942
|
|
@@ -944,16 +965,11 @@ static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
|
|
944
965
|
return MIN(dictBufferCapacity, hSize+dictContentSize);
|
945
966
|
}
|
946
967
|
|
947
|
-
/* Hidden declaration for dbio.c */
|
948
|
-
size_t ZDICT_trainFromBuffer_unsafe_legacy(
|
949
|
-
void* dictBuffer, size_t maxDictSize,
|
950
|
-
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
951
|
-
ZDICT_legacy_params_t params);
|
952
968
|
/*! ZDICT_trainFromBuffer_unsafe_legacy() :
|
953
|
-
* Warning : `samplesBuffer` must be followed by noisy guard band
|
969
|
+
* Warning : `samplesBuffer` must be followed by noisy guard band !!!
|
954
970
|
* @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
|
955
971
|
*/
|
956
|
-
size_t ZDICT_trainFromBuffer_unsafe_legacy(
|
972
|
+
static size_t ZDICT_trainFromBuffer_unsafe_legacy(
|
957
973
|
void* dictBuffer, size_t maxDictSize,
|
958
974
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
959
975
|
ZDICT_legacy_params_t params)
|
@@ -1090,8 +1106,8 @@ size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
|
|
1090
1106
|
memset(&params, 0, sizeof(params));
|
1091
1107
|
params.d = 8;
|
1092
1108
|
params.steps = 4;
|
1093
|
-
/*
|
1094
|
-
params.zParams.compressionLevel =
|
1109
|
+
/* Use default level since no compression level information is available */
|
1110
|
+
params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
|
1095
1111
|
#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
|
1096
1112
|
params.zParams.notificationLevel = DEBUGLEVEL;
|
1097
1113
|
#endif
|