extzstd 0.1 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/HISTORY.ja +5 -0
- data/README.md +5 -5
- data/contrib/zstd/CONTRIBUTING.md +42 -0
- data/contrib/zstd/LICENSE-examples +11 -0
- data/contrib/zstd/Makefile +315 -0
- data/contrib/zstd/NEWS +261 -0
- data/contrib/zstd/PATENTS +33 -0
- data/contrib/zstd/README.md +121 -41
- data/contrib/zstd/TESTING.md +44 -0
- data/contrib/zstd/appveyor.yml +178 -0
- data/contrib/zstd/circle.yml +75 -0
- data/contrib/zstd/lib/BUCK +186 -0
- data/contrib/zstd/lib/Makefile +163 -0
- data/contrib/zstd/lib/README.md +77 -0
- data/contrib/zstd/{common → lib/common}/bitstream.h +7 -4
- data/contrib/zstd/{common → lib/common}/entropy_common.c +19 -23
- data/contrib/zstd/{common → lib/common}/error_private.c +0 -0
- data/contrib/zstd/{common → lib/common}/error_private.h +0 -0
- data/contrib/zstd/{common → lib/common}/fse.h +94 -34
- data/contrib/zstd/{common → lib/common}/fse_decompress.c +18 -19
- data/contrib/zstd/{common → lib/common}/huf.h +52 -20
- data/contrib/zstd/{common → lib/common}/mem.h +17 -13
- data/contrib/zstd/lib/common/pool.c +194 -0
- data/contrib/zstd/lib/common/pool.h +56 -0
- data/contrib/zstd/lib/common/threading.c +80 -0
- data/contrib/zstd/lib/common/threading.h +104 -0
- data/contrib/zstd/{common → lib/common}/xxhash.c +3 -1
- data/contrib/zstd/{common → lib/common}/xxhash.h +11 -15
- data/contrib/zstd/{common → lib/common}/zstd_common.c +1 -11
- data/contrib/zstd/{common → lib/common}/zstd_errors.h +16 -2
- data/contrib/zstd/{common → lib/common}/zstd_internal.h +17 -1
- data/contrib/zstd/{compress → lib/compress}/fse_compress.c +138 -91
- data/contrib/zstd/{compress → lib/compress}/huf_compress.c +218 -67
- data/contrib/zstd/{compress → lib/compress}/zstd_compress.c +231 -108
- data/contrib/zstd/{compress → lib/compress}/zstd_opt.h +44 -25
- data/contrib/zstd/lib/compress/zstdmt_compress.c +739 -0
- data/contrib/zstd/lib/compress/zstdmt_compress.h +78 -0
- data/contrib/zstd/{decompress → lib/decompress}/huf_decompress.c +28 -23
- data/contrib/zstd/{decompress → lib/decompress}/zstd_decompress.c +814 -176
- data/contrib/zstd/{common → lib/deprecated}/zbuff.h +60 -39
- data/contrib/zstd/lib/deprecated/zbuff_common.c +26 -0
- data/contrib/zstd/lib/deprecated/zbuff_compress.c +145 -0
- data/contrib/zstd/lib/deprecated/zbuff_decompress.c +74 -0
- data/contrib/zstd/lib/dictBuilder/cover.c +1029 -0
- data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.c +0 -0
- data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.h +0 -0
- data/contrib/zstd/{dictBuilder → lib/dictBuilder}/zdict.c +68 -18
- data/contrib/zstd/lib/dictBuilder/zdict.h +201 -0
- data/contrib/zstd/{legacy → lib/legacy}/zstd_legacy.h +122 -7
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.c +34 -3
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.h +8 -0
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.c +45 -12
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.h +8 -0
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.c +45 -12
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.h +8 -0
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.c +56 -33
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.h +8 -0
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.c +45 -18
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.h +7 -0
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.c +43 -16
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.h +7 -0
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v07.c +57 -23
- data/contrib/zstd/{legacy → lib/legacy}/zstd_v07.h +8 -0
- data/contrib/zstd/lib/libzstd.pc.in +14 -0
- data/contrib/zstd/{zstd.h → lib/zstd.h} +206 -71
- data/ext/depend +2 -0
- data/ext/extconf.rb +4 -4
- data/ext/extzstd.c +1 -1
- data/ext/zstd_common.c +5 -5
- data/ext/zstd_compress.c +3 -3
- data/ext/zstd_decompress.c +2 -2
- data/ext/zstd_dictbuilder.c +2 -2
- data/ext/zstd_legacy_v01.c +1 -1
- data/ext/zstd_legacy_v02.c +1 -1
- data/ext/zstd_legacy_v03.c +1 -1
- data/ext/zstd_legacy_v04.c +1 -1
- data/ext/zstd_legacy_v05.c +1 -1
- data/ext/zstd_legacy_v06.c +1 -1
- data/ext/zstd_legacy_v07.c +1 -1
- data/gemstub.rb +9 -5
- data/lib/extzstd/version.rb +1 -1
- metadata +73 -51
- data/contrib/zstd/compress/zbuff_compress.c +0 -319
- data/contrib/zstd/decompress/zbuff_decompress.c +0 -252
- data/contrib/zstd/dictBuilder/zdict.h +0 -111
@@ -0,0 +1,1029 @@
|
|
1
|
+
/**
|
2
|
+
* Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
|
3
|
+
* All rights reserved.
|
4
|
+
*
|
5
|
+
* This source code is licensed under the BSD-style license found in the
|
6
|
+
* LICENSE file in the root directory of this source tree. An additional grant
|
7
|
+
* of patent rights can be found in the PATENTS file in the same directory.
|
8
|
+
*/
|
9
|
+
|
10
|
+
/* *****************************************************************************
|
11
|
+
* Constructs a dictionary using a heuristic based on the following paper:
|
12
|
+
*
|
13
|
+
* Liao, Petri, Moffat, Wirth
|
14
|
+
* Effective Construction of Relative Lempel-Ziv Dictionaries
|
15
|
+
* Published in WWW 2016.
|
16
|
+
*
|
17
|
+
* Adapted from code originally written by @ot (Giuseppe Ottaviano).
|
18
|
+
******************************************************************************/
|
19
|
+
|
20
|
+
/*-*************************************
|
21
|
+
* Dependencies
|
22
|
+
***************************************/
|
23
|
+
#include <stdio.h> /* fprintf */
|
24
|
+
#include <stdlib.h> /* malloc, free, qsort */
|
25
|
+
#include <string.h> /* memset */
|
26
|
+
#include <time.h> /* clock */
|
27
|
+
|
28
|
+
#include "mem.h" /* read */
|
29
|
+
#include "pool.h"
|
30
|
+
#include "threading.h"
|
31
|
+
#include "zstd_internal.h" /* includes zstd.h */
|
32
|
+
#ifndef ZDICT_STATIC_LINKING_ONLY
|
33
|
+
#define ZDICT_STATIC_LINKING_ONLY
|
34
|
+
#endif
|
35
|
+
#include "zdict.h"
|
36
|
+
|
37
|
+
/*-*************************************
|
38
|
+
* Constants
|
39
|
+
***************************************/
|
40
|
+
#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
|
41
|
+
|
42
|
+
/*-*************************************
|
43
|
+
* Console display
|
44
|
+
***************************************/
|
45
|
+
static int g_displayLevel = 2;
|
46
|
+
#define DISPLAY(...) \
|
47
|
+
{ \
|
48
|
+
fprintf(stderr, __VA_ARGS__); \
|
49
|
+
fflush(stderr); \
|
50
|
+
}
|
51
|
+
#define LOCALDISPLAYLEVEL(displayLevel, l, ...) \
|
52
|
+
if (displayLevel >= l) { \
|
53
|
+
DISPLAY(__VA_ARGS__); \
|
54
|
+
} /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
|
55
|
+
#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
|
56
|
+
|
57
|
+
#define LOCALDISPLAYUPDATE(displayLevel, l, ...) \
|
58
|
+
if (displayLevel >= l) { \
|
59
|
+
if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) { \
|
60
|
+
g_time = clock(); \
|
61
|
+
DISPLAY(__VA_ARGS__); \
|
62
|
+
if (displayLevel >= 4) \
|
63
|
+
fflush(stdout); \
|
64
|
+
} \
|
65
|
+
}
|
66
|
+
#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
|
67
|
+
static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
|
68
|
+
static clock_t g_time = 0;
|
69
|
+
|
70
|
+
/*-*************************************
|
71
|
+
* Hash table
|
72
|
+
***************************************
|
73
|
+
* A small specialized hash map for storing activeDmers.
|
74
|
+
* The map does not resize, so if it becomes full it will loop forever.
|
75
|
+
* Thus, the map must be large enough to store every value.
|
76
|
+
* The map implements linear probing and keeps its load less than 0.5.
|
77
|
+
*/
|
78
|
+
|
79
|
+
#define MAP_EMPTY_VALUE ((U32)-1)
|
80
|
+
typedef struct COVER_map_pair_t_s {
|
81
|
+
U32 key;
|
82
|
+
U32 value;
|
83
|
+
} COVER_map_pair_t;
|
84
|
+
|
85
|
+
typedef struct COVER_map_s {
|
86
|
+
COVER_map_pair_t *data;
|
87
|
+
U32 sizeLog;
|
88
|
+
U32 size;
|
89
|
+
U32 sizeMask;
|
90
|
+
} COVER_map_t;
|
91
|
+
|
92
|
+
/**
|
93
|
+
* Clear the map.
|
94
|
+
*/
|
95
|
+
static void COVER_map_clear(COVER_map_t *map) {
|
96
|
+
memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t));
|
97
|
+
}
|
98
|
+
|
99
|
+
/**
|
100
|
+
* Initializes a map of the given size.
|
101
|
+
* Returns 1 on success and 0 on failure.
|
102
|
+
* The map must be destroyed with COVER_map_destroy().
|
103
|
+
* The map is only guaranteed to be large enough to hold size elements.
|
104
|
+
*/
|
105
|
+
static int COVER_map_init(COVER_map_t *map, U32 size) {
|
106
|
+
map->sizeLog = ZSTD_highbit32(size) + 2;
|
107
|
+
map->size = (U32)1 << map->sizeLog;
|
108
|
+
map->sizeMask = map->size - 1;
|
109
|
+
map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t));
|
110
|
+
if (!map->data) {
|
111
|
+
map->sizeLog = 0;
|
112
|
+
map->size = 0;
|
113
|
+
return 0;
|
114
|
+
}
|
115
|
+
COVER_map_clear(map);
|
116
|
+
return 1;
|
117
|
+
}
|
118
|
+
|
119
|
+
/**
|
120
|
+
* Internal hash function
|
121
|
+
*/
|
122
|
+
static const U32 prime4bytes = 2654435761U;
|
123
|
+
static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
|
124
|
+
return (key * prime4bytes) >> (32 - map->sizeLog);
|
125
|
+
}
|
126
|
+
|
127
|
+
/**
|
128
|
+
* Helper function that returns the index that a key should be placed into.
|
129
|
+
*/
|
130
|
+
static U32 COVER_map_index(COVER_map_t *map, U32 key) {
|
131
|
+
const U32 hash = COVER_map_hash(map, key);
|
132
|
+
U32 i;
|
133
|
+
for (i = hash;; i = (i + 1) & map->sizeMask) {
|
134
|
+
COVER_map_pair_t *pos = &map->data[i];
|
135
|
+
if (pos->value == MAP_EMPTY_VALUE) {
|
136
|
+
return i;
|
137
|
+
}
|
138
|
+
if (pos->key == key) {
|
139
|
+
return i;
|
140
|
+
}
|
141
|
+
}
|
142
|
+
}
|
143
|
+
|
144
|
+
/**
|
145
|
+
* Returns the pointer to the value for key.
|
146
|
+
* If key is not in the map, it is inserted and the value is set to 0.
|
147
|
+
* The map must not be full.
|
148
|
+
*/
|
149
|
+
static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
|
150
|
+
COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)];
|
151
|
+
if (pos->value == MAP_EMPTY_VALUE) {
|
152
|
+
pos->key = key;
|
153
|
+
pos->value = 0;
|
154
|
+
}
|
155
|
+
return &pos->value;
|
156
|
+
}
|
157
|
+
|
158
|
+
/**
|
159
|
+
* Deletes key from the map if present.
|
160
|
+
*/
|
161
|
+
static void COVER_map_remove(COVER_map_t *map, U32 key) {
|
162
|
+
U32 i = COVER_map_index(map, key);
|
163
|
+
COVER_map_pair_t *del = &map->data[i];
|
164
|
+
U32 shift = 1;
|
165
|
+
if (del->value == MAP_EMPTY_VALUE) {
|
166
|
+
return;
|
167
|
+
}
|
168
|
+
for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
|
169
|
+
COVER_map_pair_t *const pos = &map->data[i];
|
170
|
+
/* If the position is empty we are done */
|
171
|
+
if (pos->value == MAP_EMPTY_VALUE) {
|
172
|
+
del->value = MAP_EMPTY_VALUE;
|
173
|
+
return;
|
174
|
+
}
|
175
|
+
/* If pos can be moved to del do so */
|
176
|
+
if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
|
177
|
+
del->key = pos->key;
|
178
|
+
del->value = pos->value;
|
179
|
+
del = pos;
|
180
|
+
shift = 1;
|
181
|
+
} else {
|
182
|
+
++shift;
|
183
|
+
}
|
184
|
+
}
|
185
|
+
}
|
186
|
+
|
187
|
+
/**
|
188
|
+
* Destroyes a map that is inited with COVER_map_init().
|
189
|
+
*/
|
190
|
+
static void COVER_map_destroy(COVER_map_t *map) {
|
191
|
+
if (map->data) {
|
192
|
+
free(map->data);
|
193
|
+
}
|
194
|
+
map->data = NULL;
|
195
|
+
map->size = 0;
|
196
|
+
}
|
197
|
+
|
198
|
+
/*-*************************************
|
199
|
+
* Context
|
200
|
+
***************************************/
|
201
|
+
|
202
|
+
typedef struct {
|
203
|
+
const BYTE *samples;
|
204
|
+
size_t *offsets;
|
205
|
+
const size_t *samplesSizes;
|
206
|
+
size_t nbSamples;
|
207
|
+
U32 *suffix;
|
208
|
+
size_t suffixSize;
|
209
|
+
U32 *freqs;
|
210
|
+
U32 *dmerAt;
|
211
|
+
unsigned d;
|
212
|
+
} COVER_ctx_t;
|
213
|
+
|
214
|
+
/* We need a global context for qsort... */
|
215
|
+
static COVER_ctx_t *g_ctx = NULL;
|
216
|
+
|
217
|
+
/*-*************************************
|
218
|
+
* Helper functions
|
219
|
+
***************************************/
|
220
|
+
|
221
|
+
/**
|
222
|
+
* Returns the sum of the sample sizes.
|
223
|
+
*/
|
224
|
+
static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
|
225
|
+
size_t sum = 0;
|
226
|
+
size_t i;
|
227
|
+
for (i = 0; i < nbSamples; ++i) {
|
228
|
+
sum += samplesSizes[i];
|
229
|
+
}
|
230
|
+
return sum;
|
231
|
+
}
|
232
|
+
|
233
|
+
/**
|
234
|
+
* Returns -1 if the dmer at lp is less than the dmer at rp.
|
235
|
+
* Return 0 if the dmers at lp and rp are equal.
|
236
|
+
* Returns 1 if the dmer at lp is greater than the dmer at rp.
|
237
|
+
*/
|
238
|
+
static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
|
239
|
+
const U32 lhs = *(const U32 *)lp;
|
240
|
+
const U32 rhs = *(const U32 *)rp;
|
241
|
+
return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
|
242
|
+
}
|
243
|
+
|
244
|
+
/**
|
245
|
+
* Same as COVER_cmp() except ties are broken by pointer value
|
246
|
+
* NOTE: g_ctx must be set to call this function. A global is required because
|
247
|
+
* qsort doesn't take an opaque pointer.
|
248
|
+
*/
|
249
|
+
static int COVER_strict_cmp(const void *lp, const void *rp) {
|
250
|
+
int result = COVER_cmp(g_ctx, lp, rp);
|
251
|
+
if (result == 0) {
|
252
|
+
result = lp < rp ? -1 : 1;
|
253
|
+
}
|
254
|
+
return result;
|
255
|
+
}
|
256
|
+
|
257
|
+
/**
|
258
|
+
* Returns the first pointer in [first, last) whose element does not compare
|
259
|
+
* less than value. If no such element exists it returns last.
|
260
|
+
*/
|
261
|
+
static const size_t *COVER_lower_bound(const size_t *first, const size_t *last,
|
262
|
+
size_t value) {
|
263
|
+
size_t count = last - first;
|
264
|
+
while (count != 0) {
|
265
|
+
size_t step = count / 2;
|
266
|
+
const size_t *ptr = first;
|
267
|
+
ptr += step;
|
268
|
+
if (*ptr < value) {
|
269
|
+
first = ++ptr;
|
270
|
+
count -= step + 1;
|
271
|
+
} else {
|
272
|
+
count = step;
|
273
|
+
}
|
274
|
+
}
|
275
|
+
return first;
|
276
|
+
}
|
277
|
+
|
278
|
+
/**
|
279
|
+
* Generic groupBy function.
|
280
|
+
* Groups an array sorted by cmp into groups with equivalent values.
|
281
|
+
* Calls grp for each group.
|
282
|
+
*/
|
283
|
+
static void
|
284
|
+
COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
|
285
|
+
int (*cmp)(COVER_ctx_t *, const void *, const void *),
|
286
|
+
void (*grp)(COVER_ctx_t *, const void *, const void *)) {
|
287
|
+
const BYTE *ptr = (const BYTE *)data;
|
288
|
+
size_t num = 0;
|
289
|
+
while (num < count) {
|
290
|
+
const BYTE *grpEnd = ptr + size;
|
291
|
+
++num;
|
292
|
+
while (num < count && cmp(ctx, ptr, grpEnd) == 0) {
|
293
|
+
grpEnd += size;
|
294
|
+
++num;
|
295
|
+
}
|
296
|
+
grp(ctx, ptr, grpEnd);
|
297
|
+
ptr = grpEnd;
|
298
|
+
}
|
299
|
+
}
|
300
|
+
|
301
|
+
/*-*************************************
|
302
|
+
* Cover functions
|
303
|
+
***************************************/
|
304
|
+
|
305
|
+
/**
|
306
|
+
* Called on each group of positions with the same dmer.
|
307
|
+
* Counts the frequency of each dmer and saves it in the suffix array.
|
308
|
+
* Fills `ctx->dmerAt`.
|
309
|
+
*/
|
310
|
+
static void COVER_group(COVER_ctx_t *ctx, const void *group,
|
311
|
+
const void *groupEnd) {
|
312
|
+
/* The group consists of all the positions with the same first d bytes. */
|
313
|
+
const U32 *grpPtr = (const U32 *)group;
|
314
|
+
const U32 *grpEnd = (const U32 *)groupEnd;
|
315
|
+
/* The dmerId is how we will reference this dmer.
|
316
|
+
* This allows us to map the whole dmer space to a much smaller space, the
|
317
|
+
* size of the suffix array.
|
318
|
+
*/
|
319
|
+
const U32 dmerId = (U32)(grpPtr - ctx->suffix);
|
320
|
+
/* Count the number of samples this dmer shows up in */
|
321
|
+
U32 freq = 0;
|
322
|
+
/* Details */
|
323
|
+
const size_t *curOffsetPtr = ctx->offsets;
|
324
|
+
const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
|
325
|
+
/* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
|
326
|
+
* different sample than the last.
|
327
|
+
*/
|
328
|
+
size_t curSampleEnd = ctx->offsets[0];
|
329
|
+
for (; grpPtr != grpEnd; ++grpPtr) {
|
330
|
+
/* Save the dmerId for this position so we can get back to it. */
|
331
|
+
ctx->dmerAt[*grpPtr] = dmerId;
|
332
|
+
/* Dictionaries only help for the first reference to the dmer.
|
333
|
+
* After that zstd can reference the match from the previous reference.
|
334
|
+
* So only count each dmer once for each sample it is in.
|
335
|
+
*/
|
336
|
+
if (*grpPtr < curSampleEnd) {
|
337
|
+
continue;
|
338
|
+
}
|
339
|
+
freq += 1;
|
340
|
+
/* Binary search to find the end of the sample *grpPtr is in.
|
341
|
+
* In the common case that grpPtr + 1 == grpEnd we can skip the binary
|
342
|
+
* search because the loop is over.
|
343
|
+
*/
|
344
|
+
if (grpPtr + 1 != grpEnd) {
|
345
|
+
const size_t *sampleEndPtr =
|
346
|
+
COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
|
347
|
+
curSampleEnd = *sampleEndPtr;
|
348
|
+
curOffsetPtr = sampleEndPtr + 1;
|
349
|
+
}
|
350
|
+
}
|
351
|
+
/* At this point we are never going to look at this segment of the suffix
|
352
|
+
* array again. We take advantage of this fact to save memory.
|
353
|
+
* We store the frequency of the dmer in the first position of the group,
|
354
|
+
* which is dmerId.
|
355
|
+
*/
|
356
|
+
ctx->suffix[dmerId] = freq;
|
357
|
+
}
|
358
|
+
|
359
|
+
/**
|
360
|
+
* A segment is a range in the source as well as the score of the segment.
|
361
|
+
*/
|
362
|
+
typedef struct {
|
363
|
+
U32 begin;
|
364
|
+
U32 end;
|
365
|
+
double score;
|
366
|
+
} COVER_segment_t;
|
367
|
+
|
368
|
+
/**
|
369
|
+
* Selects the best segment in an epoch.
|
370
|
+
* Segments of are scored according to the function:
|
371
|
+
*
|
372
|
+
* Let F(d) be the frequency of dmer d.
|
373
|
+
* Let S_i be the dmer at position i of segment S which has length k.
|
374
|
+
*
|
375
|
+
* Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
|
376
|
+
*
|
377
|
+
* Once the dmer d is in the dictionay we set F(d) = 0.
|
378
|
+
*/
|
379
|
+
static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
|
380
|
+
COVER_map_t *activeDmers, U32 begin,
|
381
|
+
U32 end, COVER_params_t parameters) {
|
382
|
+
/* Constants */
|
383
|
+
const U32 k = parameters.k;
|
384
|
+
const U32 d = parameters.d;
|
385
|
+
const U32 dmersInK = k - d + 1;
|
386
|
+
/* Try each segment (activeSegment) and save the best (bestSegment) */
|
387
|
+
COVER_segment_t bestSegment = {0, 0, 0};
|
388
|
+
COVER_segment_t activeSegment;
|
389
|
+
/* Reset the activeDmers in the segment */
|
390
|
+
COVER_map_clear(activeDmers);
|
391
|
+
/* The activeSegment starts at the beginning of the epoch. */
|
392
|
+
activeSegment.begin = begin;
|
393
|
+
activeSegment.end = begin;
|
394
|
+
activeSegment.score = 0;
|
395
|
+
/* Slide the activeSegment through the whole epoch.
|
396
|
+
* Save the best segment in bestSegment.
|
397
|
+
*/
|
398
|
+
while (activeSegment.end < end) {
|
399
|
+
/* The dmerId for the dmer at the next position */
|
400
|
+
U32 newDmer = ctx->dmerAt[activeSegment.end];
|
401
|
+
/* The entry in activeDmers for this dmerId */
|
402
|
+
U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
|
403
|
+
/* If the dmer isn't already present in the segment add its score. */
|
404
|
+
if (*newDmerOcc == 0) {
|
405
|
+
/* The paper suggest using the L-0.5 norm, but experiments show that it
|
406
|
+
* doesn't help.
|
407
|
+
*/
|
408
|
+
activeSegment.score += freqs[newDmer];
|
409
|
+
}
|
410
|
+
/* Add the dmer to the segment */
|
411
|
+
activeSegment.end += 1;
|
412
|
+
*newDmerOcc += 1;
|
413
|
+
|
414
|
+
/* If the window is now too large, drop the first position */
|
415
|
+
if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
|
416
|
+
U32 delDmer = ctx->dmerAt[activeSegment.begin];
|
417
|
+
U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
|
418
|
+
activeSegment.begin += 1;
|
419
|
+
*delDmerOcc -= 1;
|
420
|
+
/* If this is the last occurence of the dmer, subtract its score */
|
421
|
+
if (*delDmerOcc == 0) {
|
422
|
+
COVER_map_remove(activeDmers, delDmer);
|
423
|
+
activeSegment.score -= freqs[delDmer];
|
424
|
+
}
|
425
|
+
}
|
426
|
+
|
427
|
+
/* If this segment is the best so far save it */
|
428
|
+
if (activeSegment.score > bestSegment.score) {
|
429
|
+
bestSegment = activeSegment;
|
430
|
+
}
|
431
|
+
}
|
432
|
+
{
|
433
|
+
/* Trim off the zero frequency head and tail from the segment. */
|
434
|
+
U32 newBegin = bestSegment.end;
|
435
|
+
U32 newEnd = bestSegment.begin;
|
436
|
+
U32 pos;
|
437
|
+
for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
|
438
|
+
U32 freq = freqs[ctx->dmerAt[pos]];
|
439
|
+
if (freq != 0) {
|
440
|
+
newBegin = MIN(newBegin, pos);
|
441
|
+
newEnd = pos + 1;
|
442
|
+
}
|
443
|
+
}
|
444
|
+
bestSegment.begin = newBegin;
|
445
|
+
bestSegment.end = newEnd;
|
446
|
+
}
|
447
|
+
{
|
448
|
+
/* Zero out the frequency of each dmer covered by the chosen segment. */
|
449
|
+
U32 pos;
|
450
|
+
for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
|
451
|
+
freqs[ctx->dmerAt[pos]] = 0;
|
452
|
+
}
|
453
|
+
}
|
454
|
+
return bestSegment;
|
455
|
+
}
|
456
|
+
|
457
|
+
/**
|
458
|
+
* Check the validity of the parameters.
|
459
|
+
* Returns non-zero if the parameters are valid and 0 otherwise.
|
460
|
+
*/
|
461
|
+
static int COVER_checkParameters(COVER_params_t parameters) {
|
462
|
+
/* k and d are required parameters */
|
463
|
+
if (parameters.d == 0 || parameters.k == 0) {
|
464
|
+
return 0;
|
465
|
+
}
|
466
|
+
/* d <= k */
|
467
|
+
if (parameters.d > parameters.k) {
|
468
|
+
return 0;
|
469
|
+
}
|
470
|
+
return 1;
|
471
|
+
}
|
472
|
+
|
473
|
+
/**
|
474
|
+
* Clean up a context initialized with `COVER_ctx_init()`.
|
475
|
+
*/
|
476
|
+
static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
|
477
|
+
if (!ctx) {
|
478
|
+
return;
|
479
|
+
}
|
480
|
+
if (ctx->suffix) {
|
481
|
+
free(ctx->suffix);
|
482
|
+
ctx->suffix = NULL;
|
483
|
+
}
|
484
|
+
if (ctx->freqs) {
|
485
|
+
free(ctx->freqs);
|
486
|
+
ctx->freqs = NULL;
|
487
|
+
}
|
488
|
+
if (ctx->dmerAt) {
|
489
|
+
free(ctx->dmerAt);
|
490
|
+
ctx->dmerAt = NULL;
|
491
|
+
}
|
492
|
+
if (ctx->offsets) {
|
493
|
+
free(ctx->offsets);
|
494
|
+
ctx->offsets = NULL;
|
495
|
+
}
|
496
|
+
}
|
497
|
+
|
498
|
+
/**
|
499
|
+
* Prepare a context for dictionary building.
|
500
|
+
* The context is only dependent on the parameter `d` and can used multiple
|
501
|
+
* times.
|
502
|
+
* Returns 1 on success or zero on error.
|
503
|
+
* The context must be destroyed with `COVER_ctx_destroy()`.
|
504
|
+
*/
|
505
|
+
static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
506
|
+
const size_t *samplesSizes, unsigned nbSamples,
|
507
|
+
unsigned d) {
|
508
|
+
const BYTE *const samples = (const BYTE *)samplesBuffer;
|
509
|
+
const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
|
510
|
+
/* Checks */
|
511
|
+
if (totalSamplesSize < d ||
|
512
|
+
totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
|
513
|
+
DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n",
|
514
|
+
(COVER_MAX_SAMPLES_SIZE >> 20));
|
515
|
+
return 0;
|
516
|
+
}
|
517
|
+
/* Zero the context */
|
518
|
+
memset(ctx, 0, sizeof(*ctx));
|
519
|
+
DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbSamples,
|
520
|
+
(U32)totalSamplesSize);
|
521
|
+
ctx->samples = samples;
|
522
|
+
ctx->samplesSizes = samplesSizes;
|
523
|
+
ctx->nbSamples = nbSamples;
|
524
|
+
/* Partial suffix array */
|
525
|
+
ctx->suffixSize = totalSamplesSize - d + 1;
|
526
|
+
ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
|
527
|
+
/* Maps index to the dmerID */
|
528
|
+
ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
|
529
|
+
/* The offsets of each file */
|
530
|
+
ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
|
531
|
+
if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
|
532
|
+
DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
|
533
|
+
COVER_ctx_destroy(ctx);
|
534
|
+
return 0;
|
535
|
+
}
|
536
|
+
ctx->freqs = NULL;
|
537
|
+
ctx->d = d;
|
538
|
+
|
539
|
+
/* Fill offsets from the samlesSizes */
|
540
|
+
{
|
541
|
+
U32 i;
|
542
|
+
ctx->offsets[0] = 0;
|
543
|
+
for (i = 1; i <= nbSamples; ++i) {
|
544
|
+
ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
|
545
|
+
}
|
546
|
+
}
|
547
|
+
DISPLAYLEVEL(2, "Constructing partial suffix array\n");
|
548
|
+
{
|
549
|
+
/* suffix is a partial suffix array.
|
550
|
+
* It only sorts suffixes by their first parameters.d bytes.
|
551
|
+
* The sort is stable, so each dmer group is sorted by position in input.
|
552
|
+
*/
|
553
|
+
U32 i;
|
554
|
+
for (i = 0; i < ctx->suffixSize; ++i) {
|
555
|
+
ctx->suffix[i] = i;
|
556
|
+
}
|
557
|
+
/* qsort doesn't take an opaque pointer, so pass as a global */
|
558
|
+
g_ctx = ctx;
|
559
|
+
qsort(ctx->suffix, ctx->suffixSize, sizeof(U32), &COVER_strict_cmp);
|
560
|
+
}
|
561
|
+
DISPLAYLEVEL(2, "Computing frequencies\n");
|
562
|
+
/* For each dmer group (group of positions with the same first d bytes):
|
563
|
+
* 1. For each position we set dmerAt[position] = dmerID. The dmerID is
|
564
|
+
* (groupBeginPtr - suffix). This allows us to go from position to
|
565
|
+
* dmerID so we can look up values in freq.
|
566
|
+
* 2. We calculate how many samples the dmer occurs in and save it in
|
567
|
+
* freqs[dmerId].
|
568
|
+
*/
|
569
|
+
COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx, &COVER_cmp,
|
570
|
+
&COVER_group);
|
571
|
+
ctx->freqs = ctx->suffix;
|
572
|
+
ctx->suffix = NULL;
|
573
|
+
return 1;
|
574
|
+
}
|
575
|
+
|
576
|
+
/**
|
577
|
+
* Given the prepared context build the dictionary.
|
578
|
+
*/
|
579
|
+
static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
|
580
|
+
COVER_map_t *activeDmers, void *dictBuffer,
|
581
|
+
size_t dictBufferCapacity,
|
582
|
+
COVER_params_t parameters) {
|
583
|
+
BYTE *const dict = (BYTE *)dictBuffer;
|
584
|
+
size_t tail = dictBufferCapacity;
|
585
|
+
/* Divide the data up into epochs of equal size.
|
586
|
+
* We will select at least one segment from each epoch.
|
587
|
+
*/
|
588
|
+
const U32 epochs = (U32)(dictBufferCapacity / parameters.k);
|
589
|
+
const U32 epochSize = (U32)(ctx->suffixSize / epochs);
|
590
|
+
size_t epoch;
|
591
|
+
DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
|
592
|
+
epochSize);
|
593
|
+
/* Loop through the epochs until there are no more segments or the dictionary
|
594
|
+
* is full.
|
595
|
+
*/
|
596
|
+
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
|
597
|
+
const U32 epochBegin = (U32)(epoch * epochSize);
|
598
|
+
const U32 epochEnd = epochBegin + epochSize;
|
599
|
+
size_t segmentSize;
|
600
|
+
/* Select a segment */
|
601
|
+
COVER_segment_t segment = COVER_selectSegment(
|
602
|
+
ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
|
603
|
+
/* Trim the segment if necessary and if it is empty then we are done */
|
604
|
+
segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
|
605
|
+
if (segmentSize == 0) {
|
606
|
+
break;
|
607
|
+
}
|
608
|
+
/* We fill the dictionary from the back to allow the best segments to be
|
609
|
+
* referenced with the smallest offsets.
|
610
|
+
*/
|
611
|
+
tail -= segmentSize;
|
612
|
+
memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
|
613
|
+
DISPLAYUPDATE(
|
614
|
+
2, "\r%u%% ",
|
615
|
+
(U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
|
616
|
+
}
|
617
|
+
DISPLAYLEVEL(2, "\r%79s\r", "");
|
618
|
+
return tail;
|
619
|
+
}
|
620
|
+
|
621
|
+
/**
|
622
|
+
* Translate from COVER_params_t to ZDICT_params_t required for finalizing the
|
623
|
+
* dictionary.
|
624
|
+
*/
|
625
|
+
static ZDICT_params_t COVER_translateParams(COVER_params_t parameters) {
|
626
|
+
ZDICT_params_t zdictParams;
|
627
|
+
memset(&zdictParams, 0, sizeof(zdictParams));
|
628
|
+
zdictParams.notificationLevel = 1;
|
629
|
+
zdictParams.dictID = parameters.dictID;
|
630
|
+
zdictParams.compressionLevel = parameters.compressionLevel;
|
631
|
+
return zdictParams;
|
632
|
+
}
|
633
|
+
|
634
|
+
ZDICTLIB_API size_t COVER_trainFromBuffer(
|
635
|
+
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
|
636
|
+
const size_t *samplesSizes, unsigned nbSamples, COVER_params_t parameters) {
|
637
|
+
BYTE *const dict = (BYTE *)dictBuffer;
|
638
|
+
COVER_ctx_t ctx;
|
639
|
+
COVER_map_t activeDmers;
|
640
|
+
/* Checks */
|
641
|
+
if (!COVER_checkParameters(parameters)) {
|
642
|
+
DISPLAYLEVEL(1, "Cover parameters incorrect\n");
|
643
|
+
return ERROR(GENERIC);
|
644
|
+
}
|
645
|
+
if (nbSamples == 0) {
|
646
|
+
DISPLAYLEVEL(1, "Cover must have at least one input file\n");
|
647
|
+
return ERROR(GENERIC);
|
648
|
+
}
|
649
|
+
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
650
|
+
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
|
651
|
+
ZDICT_DICTSIZE_MIN);
|
652
|
+
return ERROR(dstSize_tooSmall);
|
653
|
+
}
|
654
|
+
/* Initialize global data */
|
655
|
+
g_displayLevel = parameters.notificationLevel;
|
656
|
+
/* Initialize context and activeDmers */
|
657
|
+
if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
|
658
|
+
parameters.d)) {
|
659
|
+
return ERROR(GENERIC);
|
660
|
+
}
|
661
|
+
if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
|
662
|
+
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
|
663
|
+
COVER_ctx_destroy(&ctx);
|
664
|
+
return ERROR(GENERIC);
|
665
|
+
}
|
666
|
+
|
667
|
+
DISPLAYLEVEL(2, "Building dictionary\n");
|
668
|
+
{
|
669
|
+
const size_t tail =
|
670
|
+
COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
|
671
|
+
dictBufferCapacity, parameters);
|
672
|
+
ZDICT_params_t zdictParams = COVER_translateParams(parameters);
|
673
|
+
const size_t dictionarySize = ZDICT_finalizeDictionary(
|
674
|
+
dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
|
675
|
+
samplesBuffer, samplesSizes, nbSamples, zdictParams);
|
676
|
+
if (!ZSTD_isError(dictionarySize)) {
|
677
|
+
DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
|
678
|
+
(U32)dictionarySize);
|
679
|
+
}
|
680
|
+
COVER_ctx_destroy(&ctx);
|
681
|
+
COVER_map_destroy(&activeDmers);
|
682
|
+
return dictionarySize;
|
683
|
+
}
|
684
|
+
}
|
685
|
+
|
686
|
+
/**
|
687
|
+
* COVER_best_t is used for two purposes:
|
688
|
+
* 1. Synchronizing threads.
|
689
|
+
* 2. Saving the best parameters and dictionary.
|
690
|
+
*
|
691
|
+
* All of the methods except COVER_best_init() are thread safe if zstd is
|
692
|
+
* compiled with multithreaded support.
|
693
|
+
*/
|
694
|
+
typedef struct COVER_best_s {
|
695
|
+
pthread_mutex_t mutex;
|
696
|
+
pthread_cond_t cond;
|
697
|
+
size_t liveJobs;
|
698
|
+
void *dict;
|
699
|
+
size_t dictSize;
|
700
|
+
COVER_params_t parameters;
|
701
|
+
size_t compressedSize;
|
702
|
+
} COVER_best_t;
|
703
|
+
|
704
|
+
/**
|
705
|
+
* Initialize the `COVER_best_t`.
|
706
|
+
*/
|
707
|
+
static void COVER_best_init(COVER_best_t *best) {
|
708
|
+
if (!best) {
|
709
|
+
return;
|
710
|
+
}
|
711
|
+
pthread_mutex_init(&best->mutex, NULL);
|
712
|
+
pthread_cond_init(&best->cond, NULL);
|
713
|
+
best->liveJobs = 0;
|
714
|
+
best->dict = NULL;
|
715
|
+
best->dictSize = 0;
|
716
|
+
best->compressedSize = (size_t)-1;
|
717
|
+
memset(&best->parameters, 0, sizeof(best->parameters));
|
718
|
+
}
|
719
|
+
|
720
|
+
/**
|
721
|
+
* Wait until liveJobs == 0.
|
722
|
+
*/
|
723
|
+
static void COVER_best_wait(COVER_best_t *best) {
|
724
|
+
if (!best) {
|
725
|
+
return;
|
726
|
+
}
|
727
|
+
pthread_mutex_lock(&best->mutex);
|
728
|
+
while (best->liveJobs != 0) {
|
729
|
+
pthread_cond_wait(&best->cond, &best->mutex);
|
730
|
+
}
|
731
|
+
pthread_mutex_unlock(&best->mutex);
|
732
|
+
}
|
733
|
+
|
734
|
+
/**
|
735
|
+
* Call COVER_best_wait() and then destroy the COVER_best_t.
|
736
|
+
*/
|
737
|
+
static void COVER_best_destroy(COVER_best_t *best) {
|
738
|
+
if (!best) {
|
739
|
+
return;
|
740
|
+
}
|
741
|
+
COVER_best_wait(best);
|
742
|
+
if (best->dict) {
|
743
|
+
free(best->dict);
|
744
|
+
}
|
745
|
+
pthread_mutex_destroy(&best->mutex);
|
746
|
+
pthread_cond_destroy(&best->cond);
|
747
|
+
}
|
748
|
+
|
749
|
+
/**
|
750
|
+
* Called when a thread is about to be launched.
|
751
|
+
* Increments liveJobs.
|
752
|
+
*/
|
753
|
+
static void COVER_best_start(COVER_best_t *best) {
|
754
|
+
if (!best) {
|
755
|
+
return;
|
756
|
+
}
|
757
|
+
pthread_mutex_lock(&best->mutex);
|
758
|
+
++best->liveJobs;
|
759
|
+
pthread_mutex_unlock(&best->mutex);
|
760
|
+
}
|
761
|
+
|
762
|
+
/**
|
763
|
+
* Called when a thread finishes executing, both on error or success.
|
764
|
+
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
|
765
|
+
* If this dictionary is the best so far save it and its parameters.
|
766
|
+
*/
|
767
|
+
static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
|
768
|
+
COVER_params_t parameters, void *dict,
|
769
|
+
size_t dictSize) {
|
770
|
+
if (!best) {
|
771
|
+
return;
|
772
|
+
}
|
773
|
+
{
|
774
|
+
size_t liveJobs;
|
775
|
+
pthread_mutex_lock(&best->mutex);
|
776
|
+
--best->liveJobs;
|
777
|
+
liveJobs = best->liveJobs;
|
778
|
+
/* If the new dictionary is better */
|
779
|
+
if (compressedSize < best->compressedSize) {
|
780
|
+
/* Allocate space if necessary */
|
781
|
+
if (!best->dict || best->dictSize < dictSize) {
|
782
|
+
if (best->dict) {
|
783
|
+
free(best->dict);
|
784
|
+
}
|
785
|
+
best->dict = malloc(dictSize);
|
786
|
+
if (!best->dict) {
|
787
|
+
best->compressedSize = ERROR(GENERIC);
|
788
|
+
best->dictSize = 0;
|
789
|
+
return;
|
790
|
+
}
|
791
|
+
}
|
792
|
+
/* Save the dictionary, parameters, and size */
|
793
|
+
memcpy(best->dict, dict, dictSize);
|
794
|
+
best->dictSize = dictSize;
|
795
|
+
best->parameters = parameters;
|
796
|
+
best->compressedSize = compressedSize;
|
797
|
+
}
|
798
|
+
pthread_mutex_unlock(&best->mutex);
|
799
|
+
if (liveJobs == 0) {
|
800
|
+
pthread_cond_broadcast(&best->cond);
|
801
|
+
}
|
802
|
+
}
|
803
|
+
}
|
804
|
+
|
805
|
+
/**
 * Parameters for COVER_tryParameters().
 *
 * Heap-allocated by the caller and handed to COVER_tryParameters() as an
 * owning pointer (COVER_tryParameters() frees it). `ctx` and `best` are
 * borrowed and must outlive the job.
 */
typedef struct COVER_tryParameters_data_s {
  const COVER_ctx_t *ctx;      /* shared, read-only training context */
  COVER_best_t *best;          /* shared best-result accumulator */
  size_t dictBufferCapacity;   /* maximum dictionary size to build */
  COVER_params_t parameters;   /* the (k, d, ...) combination to try */
} COVER_tryParameters_data_t;
|
814
|
+
|
815
|
+
/**
 * Tries a set of parameters and updates the COVER_best_t with the results.
 * This function is thread safe if zstd is compiled with multithreaded support.
 * It takes its parameters as an *OWNING* opaque pointer to support threading:
 * `opaque` is a COVER_tryParameters_data_t that this function frees before
 * returning. The result (dictionary + total compressed size of all samples,
 * or ERROR(GENERIC) on failure) is reported through COVER_best_finish().
 */
static void COVER_tryParameters(void *opaque) {
  /* Save parameters as local variables */
  COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque;
  const COVER_ctx_t *const ctx = data->ctx;
  const COVER_params_t parameters = data->parameters;
  size_t dictBufferCapacity = data->dictBufferCapacity;
  /* Pessimistic default: any early exit reports failure to best. */
  size_t totalCompressedSize = ERROR(GENERIC);
  /* Allocate space for hash table, dict, and freqs */
  COVER_map_t activeDmers;
  BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
  U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
    /* NOTE(review): this jumps to _cleanup, which calls
     * COVER_map_destroy() on a map whose init just failed — presumably
     * COVER_map_init() leaves the map in a destroy-safe state; verify. */
    goto _cleanup;
  }
  if (!dict || !freqs) {
    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
    /* NOTE(review): on this path dict may be NULL and is still passed to
     * COVER_best_finish() below with totalCompressedSize = ERROR(GENERIC);
     * that is only safe while no smaller "best" has been recorded —
     * confirm COVER_best_finish never copies from a NULL dict. */
    goto _cleanup;
  }
  /* Copy the frequencies because we need to modify them */
  memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
  /* Build the dictionary */
  {
    /* COVER_buildDictionary fills dict from the tail forward and returns
     * the offset of the first used byte. */
    const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
                                              dictBufferCapacity, parameters);
    const ZDICT_params_t zdictParams = COVER_translateParams(parameters);
    /* ZDICT_finalizeDictionary prepends entropy tables / header; on success
     * dictBufferCapacity becomes the final dictionary size. */
    dictBufferCapacity = ZDICT_finalizeDictionary(
        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples, zdictParams);
    if (ZDICT_isError(dictBufferCapacity)) {
      DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
      goto _cleanup;
    }
  }
  /* Check total compressed size */
  {
    /* Pointers */
    ZSTD_CCtx *cctx;
    ZSTD_CDict *cdict;
    void *dst;
    /* Local variables */
    size_t dstCapacity;
    size_t i;
    /* Allocate dst with enough space to compress the maximum sized sample */
    {
      size_t maxSampleSize = 0;
      for (i = 0; i < ctx->nbSamples; ++i) {
        maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
      }
      dstCapacity = ZSTD_compressBound(maxSampleSize);
      dst = malloc(dstCapacity);
    }
    /* Create the cctx and cdict */
    cctx = ZSTD_createCCtx();
    cdict =
        ZSTD_createCDict(dict, dictBufferCapacity, parameters.compressionLevel);
    if (!dst || !cctx || !cdict) {
      /* totalCompressedSize is still ERROR(GENERIC) here, so the failure
       * propagates through COVER_best_finish(). */
      goto _compressCleanup;
    }
    /* Compress each sample and sum their sizes (or error) */
    totalCompressedSize = 0;
    for (i = 0; i < ctx->nbSamples; ++i) {
      const size_t size = ZSTD_compress_usingCDict(
          cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
          ctx->samplesSizes[i], cdict);
      if (ZSTD_isError(size)) {
        totalCompressedSize = ERROR(GENERIC);
        goto _compressCleanup;
      }
      totalCompressedSize += size;
    }
  _compressCleanup:
    ZSTD_freeCCtx(cctx);
    ZSTD_freeCDict(cdict);
    if (dst) {
      free(dst);
    }
  }

_cleanup:
  /* Report the (possibly failed) result; COVER_best_finish copies dict if
   * it is the best so far and decrements the live-job count. */
  COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
                    dictBufferCapacity);
  /* This function owns `data` — free it exactly once, after its fields are
   * no longer needed. */
  free(data);
  COVER_map_destroy(&activeDmers);
  if (dict) {
    free(dict);
  }
  if (freqs) {
    free(freqs);
  }
}
|
911
|
+
|
912
|
+
/*
 * Grid-searches COVER parameters (d over [kMinD, kMaxD] step 2, k over
 * [kMinK, kMaxK] step kStepSize), trying each combination — in parallel when
 * parameters->nbThreads > 1 — and writes the best dictionary found into
 * dictBuffer. On success returns the dictionary size and stores the winning
 * parameters back into *parameters; on failure returns a ZSTD error code.
 * Zero-valued d / k / steps in *parameters select the default search ranges.
 */
ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer,
                                                  size_t dictBufferCapacity,
                                                  const void *samplesBuffer,
                                                  const size_t *samplesSizes,
                                                  unsigned nbSamples,
                                                  COVER_params_t *parameters) {
  /* constants */
  const unsigned nbThreads = parameters->nbThreads;
  const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
  const unsigned kMaxD = parameters->d == 0 ? 16 : parameters->d;
  const unsigned kMinK = parameters->k == 0 ? kMaxD : parameters->k;
  const unsigned kMaxK = parameters->k == 0 ? 2048 : parameters->k;
  const unsigned kSteps = parameters->steps == 0 ? 32 : parameters->steps;
  const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
  /* Total number of (d, k) combinations, used only for progress display. */
  const unsigned kIterations =
      (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
  /* Local variables */
  const int displayLevel = parameters->notificationLevel;
  unsigned iteration = 1;
  unsigned d;
  unsigned k;
  COVER_best_t best;
  POOL_ctx *pool = NULL;
  /* Checks */
  if (kMinK < kMaxD || kMaxK < kMinK) {
    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
    return ERROR(GENERIC);
  }
  if (nbSamples == 0) {
    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
    return ERROR(GENERIC);
  }
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
                 ZDICT_DICTSIZE_MIN);
    return ERROR(dstSize_tooSmall);
  }
  if (nbThreads > 1) {
    pool = POOL_create(nbThreads, 1);
    if (!pool) {
      return ERROR(memory_allocation);
    }
  }
  /* Initialization */
  COVER_best_init(&best);
  /* Turn down global display level to clean up display at level 2 and below */
  g_displayLevel = parameters->notificationLevel - 1;
  /* Loop through d first because each new value needs a new context */
  LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
                    kIterations);
  for (d = kMinD; d <= kMaxD; d += 2) {
    /* Initialize the context for this value of d */
    COVER_ctx_t ctx;
    LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) {
      LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
      COVER_best_destroy(&best);
      POOL_free(pool);
      return ERROR(GENERIC);
    }
    /* Loop through k reusing the same context */
    for (k = kMinK; k <= kMaxK; k += kStepSize) {
      /* Prepare the arguments */
      COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
          sizeof(COVER_tryParameters_data_t));
      LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
      if (!data) {
        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
        /* COVER_best_destroy() waits for outstanding pool jobs (via
         * COVER_best_wait) before POOL_free, so in-flight workers cannot
         * outlive ctx / best. */
        COVER_best_destroy(&best);
        COVER_ctx_destroy(&ctx);
        POOL_free(pool);
        return ERROR(GENERIC);
      }
      data->ctx = &ctx;
      data->best = &best;
      data->dictBufferCapacity = dictBufferCapacity;
      data->parameters = *parameters;
      data->parameters.k = k;
      data->parameters.d = d;
      data->parameters.steps = kSteps;
      /* Check the parameters */
      if (!COVER_checkParameters(data->parameters)) {
        DISPLAYLEVEL(1, "Cover parameters incorrect\n");
        free(data);
        continue;
      }
      /* Call the function and pass ownership of data to it:
       * COVER_tryParameters frees `data` and calls COVER_best_finish,
       * balancing the COVER_best_start below. */
      COVER_best_start(&best);
      if (pool) {
        POOL_add(pool, &COVER_tryParameters, data);
      } else {
        COVER_tryParameters(data);
      }
      /* Print status */
      LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",
                         (U32)((iteration * 100) / kIterations));
      ++iteration;
    }
    /* Drain all jobs for this d before destroying the context they read. */
    COVER_best_wait(&best);
    COVER_ctx_destroy(&ctx);
  }
  LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
  /* Fill the output buffer and parameters with output of the best parameters */
  {
    const size_t dictSize = best.dictSize;
    if (ZSTD_isError(best.compressedSize)) {
      /* Every trial failed: propagate the stored error code. */
      const size_t compressedSize = best.compressedSize;
      COVER_best_destroy(&best);
      POOL_free(pool);
      return compressedSize;
    }
    *parameters = best.parameters;
    memcpy(dictBuffer, best.dict, dictSize);
    COVER_best_destroy(&best);
    POOL_free(pool);
    return dictSize;
  }
}
|