extzstd 0.1.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/HISTORY.ja.md +18 -0
- data/README.md +15 -50
- data/contrib/zstd/CONTRIBUTING.md +1 -1
- data/contrib/zstd/COPYING +339 -0
- data/contrib/zstd/Makefile +82 -51
- data/contrib/zstd/NEWS +92 -5
- data/contrib/zstd/README.md +50 -41
- data/contrib/zstd/appveyor.yml +164 -102
- data/contrib/zstd/circle.yml +10 -22
- data/contrib/zstd/lib/BUCK +31 -10
- data/contrib/zstd/lib/Makefile +57 -31
- data/contrib/zstd/lib/README.md +68 -37
- data/contrib/zstd/lib/common/bitstream.h +130 -76
- data/contrib/zstd/lib/common/compiler.h +86 -0
- data/contrib/zstd/lib/common/error_private.c +15 -11
- data/contrib/zstd/lib/common/error_private.h +8 -8
- data/contrib/zstd/lib/common/fse.h +19 -9
- data/contrib/zstd/lib/common/fse_decompress.c +3 -22
- data/contrib/zstd/lib/common/huf.h +68 -26
- data/contrib/zstd/lib/common/mem.h +23 -35
- data/contrib/zstd/lib/common/pool.c +123 -63
- data/contrib/zstd/lib/common/pool.h +19 -10
- data/contrib/zstd/lib/common/threading.c +11 -16
- data/contrib/zstd/lib/common/threading.h +52 -33
- data/contrib/zstd/lib/common/xxhash.c +28 -22
- data/contrib/zstd/lib/common/zstd_common.c +40 -27
- data/contrib/zstd/lib/common/zstd_errors.h +43 -34
- data/contrib/zstd/lib/common/zstd_internal.h +131 -123
- data/contrib/zstd/lib/compress/fse_compress.c +17 -33
- data/contrib/zstd/lib/compress/huf_compress.c +15 -9
- data/contrib/zstd/lib/compress/zstd_compress.c +2096 -2363
- data/contrib/zstd/lib/compress/zstd_compress_internal.h +462 -0
- data/contrib/zstd/lib/compress/zstd_double_fast.c +309 -0
- data/contrib/zstd/lib/compress/zstd_double_fast.h +29 -0
- data/contrib/zstd/lib/compress/zstd_fast.c +243 -0
- data/contrib/zstd/lib/compress/zstd_fast.h +31 -0
- data/contrib/zstd/lib/compress/zstd_lazy.c +765 -0
- data/contrib/zstd/lib/compress/zstd_lazy.h +39 -0
- data/contrib/zstd/lib/compress/zstd_ldm.c +707 -0
- data/contrib/zstd/lib/compress/zstd_ldm.h +68 -0
- data/contrib/zstd/lib/compress/zstd_opt.c +785 -0
- data/contrib/zstd/lib/compress/zstd_opt.h +19 -908
- data/contrib/zstd/lib/compress/zstdmt_compress.c +737 -327
- data/contrib/zstd/lib/compress/zstdmt_compress.h +88 -26
- data/contrib/zstd/lib/decompress/huf_decompress.c +158 -50
- data/contrib/zstd/lib/decompress/zstd_decompress.c +884 -699
- data/contrib/zstd/lib/deprecated/zbuff.h +5 -4
- data/contrib/zstd/lib/deprecated/zbuff_common.c +5 -5
- data/contrib/zstd/lib/deprecated/zbuff_compress.c +6 -4
- data/contrib/zstd/lib/deprecated/zbuff_decompress.c +5 -4
- data/contrib/zstd/lib/dictBuilder/cover.c +93 -77
- data/contrib/zstd/lib/dictBuilder/zdict.c +107 -92
- data/contrib/zstd/lib/dictBuilder/zdict.h +112 -102
- data/contrib/zstd/lib/legacy/zstd_legacy.h +9 -4
- data/contrib/zstd/lib/legacy/zstd_v01.c +7 -6
- data/contrib/zstd/lib/legacy/zstd_v01.h +5 -4
- data/contrib/zstd/lib/legacy/zstd_v02.c +27 -99
- data/contrib/zstd/lib/legacy/zstd_v02.h +5 -4
- data/contrib/zstd/lib/legacy/zstd_v03.c +26 -98
- data/contrib/zstd/lib/legacy/zstd_v03.h +5 -4
- data/contrib/zstd/lib/legacy/zstd_v04.c +22 -91
- data/contrib/zstd/lib/legacy/zstd_v04.h +5 -4
- data/contrib/zstd/lib/legacy/zstd_v05.c +23 -99
- data/contrib/zstd/lib/legacy/zstd_v05.h +5 -4
- data/contrib/zstd/lib/legacy/zstd_v06.c +22 -96
- data/contrib/zstd/lib/legacy/zstd_v06.h +5 -4
- data/contrib/zstd/lib/legacy/zstd_v07.c +19 -95
- data/contrib/zstd/lib/legacy/zstd_v07.h +5 -4
- data/contrib/zstd/lib/zstd.h +895 -271
- data/ext/extconf.rb +11 -2
- data/ext/extzstd.c +45 -128
- data/ext/extzstd.h +74 -31
- data/ext/extzstd_stream.c +401 -142
- data/ext/zstd_common.c +5 -0
- data/ext/zstd_compress.c +8 -0
- data/ext/zstd_decompress.c +1 -0
- data/ext/zstd_dictbuilder.c +2 -0
- data/lib/extzstd/version.rb +1 -1
- data/lib/extzstd.rb +48 -1
- data/test/test_basic.rb +9 -1
- metadata +17 -7
- data/HISTORY.ja +0 -10
- data/contrib/zstd/LICENSE-examples +0 -11
- data/contrib/zstd/PATENTS +0 -33
|
@@ -1,18 +1,20 @@
|
|
|
1
|
-
|
|
1
|
+
/*
|
|
2
2
|
* Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
|
|
3
3
|
* All rights reserved.
|
|
4
4
|
*
|
|
5
|
-
* This source code is licensed under the BSD-style license found in the
|
|
6
|
-
* LICENSE file in the root directory of this source tree
|
|
7
|
-
*
|
|
5
|
+
* This source code is licensed under both the BSD-style license (found in the
|
|
6
|
+
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
|
7
|
+
* in the COPYING file in the root directory of this source tree).
|
|
8
|
+
* You may select, at your option, one of the above-listed licenses.
|
|
8
9
|
*/
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
/*-**************************************
|
|
12
13
|
* Tuning parameters
|
|
13
14
|
****************************************/
|
|
15
|
+
#define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */
|
|
14
16
|
#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
|
|
15
|
-
#define ZDICT_MIN_SAMPLES_SIZE
|
|
17
|
+
#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
|
|
16
18
|
|
|
17
19
|
|
|
18
20
|
/*-**************************************
|
|
@@ -59,11 +61,8 @@
|
|
|
59
61
|
|
|
60
62
|
#define NOISELENGTH 32
|
|
61
63
|
|
|
62
|
-
|
|
63
|
-
static const int g_compressionLevel_default = 6;
|
|
64
|
+
static const int g_compressionLevel_default = 3;
|
|
64
65
|
static const U32 g_selectivity_default = 9;
|
|
65
|
-
static const size_t g_provision_entropySize = 200;
|
|
66
|
-
static const size_t g_min_fast_dictContent = 192;
|
|
67
66
|
|
|
68
67
|
|
|
69
68
|
/*-*************************************
|
|
@@ -96,7 +95,7 @@ const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(error
|
|
|
96
95
|
unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
|
|
97
96
|
{
|
|
98
97
|
if (dictSize < 8) return 0;
|
|
99
|
-
if (MEM_readLE32(dictBuffer) !=
|
|
98
|
+
if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
|
|
100
99
|
return MEM_readLE32((const char*)dictBuffer + 4);
|
|
101
100
|
}
|
|
102
101
|
|
|
@@ -104,7 +103,7 @@ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
|
|
|
104
103
|
/*-********************************************************
|
|
105
104
|
* Dictionary training functions
|
|
106
105
|
**********************************************************/
|
|
107
|
-
static unsigned ZDICT_NbCommonBytes (
|
|
106
|
+
static unsigned ZDICT_NbCommonBytes (size_t val)
|
|
108
107
|
{
|
|
109
108
|
if (MEM_isLittleEndian()) {
|
|
110
109
|
if (MEM_64bits()) {
|
|
@@ -308,10 +307,10 @@ static dictItem ZDICT_analyzePos(
|
|
|
308
307
|
/* look backward */
|
|
309
308
|
length = MINMATCHLENGTH;
|
|
310
309
|
while ((length >= MINMATCHLENGTH) & (start > 0)) {
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
310
|
+
length = ZDICT_count(b + pos, b + suffix[start - 1]);
|
|
311
|
+
if (length >= LLIMIT) length = LLIMIT - 1;
|
|
312
|
+
lengthList[length]++;
|
|
313
|
+
if (length >= MINMATCHLENGTH) start--;
|
|
315
314
|
}
|
|
316
315
|
|
|
317
316
|
/* largest useful length */
|
|
@@ -363,21 +362,35 @@ static dictItem ZDICT_analyzePos(
|
|
|
363
362
|
}
|
|
364
363
|
|
|
365
364
|
|
|
366
|
-
|
|
365
|
+
static int isIncluded(const void* in, const void* container, size_t length)
|
|
366
|
+
{
|
|
367
|
+
const char* const ip = (const char*) in;
|
|
368
|
+
const char* const into = (const char*) container;
|
|
369
|
+
size_t u;
|
|
370
|
+
|
|
371
|
+
for (u=0; u<length; u++) { /* works because end of buffer is a noisy guard band */
|
|
372
|
+
if (ip[u] != into[u]) break;
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
return u==length;
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
/*! ZDICT_tryMerge() :
|
|
367
379
|
check if dictItem can be merged, do it if possible
|
|
368
380
|
@return : id of destination elt, 0 if not merged
|
|
369
381
|
*/
|
|
370
|
-
static U32
|
|
382
|
+
static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
|
|
371
383
|
{
|
|
372
384
|
const U32 tableSize = table->pos;
|
|
373
385
|
const U32 eltEnd = elt.pos + elt.length;
|
|
386
|
+
const char* const buf = (const char*) buffer;
|
|
374
387
|
|
|
375
388
|
/* tail overlap */
|
|
376
389
|
U32 u; for (u=1; u<tableSize; u++) {
|
|
377
390
|
if (u==eltNbToSkip) continue;
|
|
378
391
|
if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */
|
|
379
392
|
/* append */
|
|
380
|
-
U32 addedLength = table[u].pos - elt.pos;
|
|
393
|
+
U32 const addedLength = table[u].pos - elt.pos;
|
|
381
394
|
table[u].length += addedLength;
|
|
382
395
|
table[u].pos = elt.pos;
|
|
383
396
|
table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
|
|
@@ -393,9 +406,10 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
|
|
393
406
|
/* front overlap */
|
|
394
407
|
for (u=1; u<tableSize; u++) {
|
|
395
408
|
if (u==eltNbToSkip) continue;
|
|
409
|
+
|
|
396
410
|
if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
|
|
397
411
|
/* append */
|
|
398
|
-
int addedLength = (int)eltEnd - (table[u].pos + table[u].length);
|
|
412
|
+
int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
|
|
399
413
|
table[u].savings += elt.length / 8; /* rough approx bonus */
|
|
400
414
|
if (addedLength > 0) { /* otherwise, elt fully included into existing */
|
|
401
415
|
table[u].length += addedLength;
|
|
@@ -407,7 +421,18 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
|
|
407
421
|
table[u] = table[u-1], u--;
|
|
408
422
|
table[u] = elt;
|
|
409
423
|
return u;
|
|
410
|
-
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
|
|
427
|
+
if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
|
|
428
|
+
size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
|
|
429
|
+
table[u].pos = elt.pos;
|
|
430
|
+
table[u].savings += (U32)(elt.savings * addedLength / elt.length);
|
|
431
|
+
table[u].length = MIN(elt.length, table[u].length + 1);
|
|
432
|
+
return u;
|
|
433
|
+
}
|
|
434
|
+
}
|
|
435
|
+
}
|
|
411
436
|
|
|
412
437
|
return 0;
|
|
413
438
|
}
|
|
@@ -415,8 +440,8 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
|
|
415
440
|
|
|
416
441
|
static void ZDICT_removeDictItem(dictItem* table, U32 id)
|
|
417
442
|
{
|
|
418
|
-
/* convention :
|
|
419
|
-
U32 const max = table
|
|
443
|
+
/* convention : table[0].pos stores nb of elts */
|
|
444
|
+
U32 const max = table[0].pos;
|
|
420
445
|
U32 u;
|
|
421
446
|
if (!id) return; /* protection, should never happen */
|
|
422
447
|
for (u=id; u<max-1; u++)
|
|
@@ -425,14 +450,14 @@ static void ZDICT_removeDictItem(dictItem* table, U32 id)
|
|
|
425
450
|
}
|
|
426
451
|
|
|
427
452
|
|
|
428
|
-
static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
|
|
453
|
+
static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
|
|
429
454
|
{
|
|
430
455
|
/* merge if possible */
|
|
431
|
-
U32 mergeId =
|
|
456
|
+
U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
|
|
432
457
|
if (mergeId) {
|
|
433
458
|
U32 newMerge = 1;
|
|
434
459
|
while (newMerge) {
|
|
435
|
-
newMerge =
|
|
460
|
+
newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
|
|
436
461
|
if (newMerge) ZDICT_removeDictItem(table, mergeId);
|
|
437
462
|
mergeId = newMerge;
|
|
438
463
|
}
|
|
@@ -463,7 +488,7 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
|
|
|
463
488
|
}
|
|
464
489
|
|
|
465
490
|
|
|
466
|
-
static size_t
|
|
491
|
+
static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
|
|
467
492
|
const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
|
|
468
493
|
const size_t* fileSizes, unsigned nbFiles,
|
|
469
494
|
U32 minRatio, U32 notificationLevel)
|
|
@@ -480,7 +505,7 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|
|
480
505
|
# define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
|
|
481
506
|
if (ZDICT_clockSpan(displayClock) > refreshRate) \
|
|
482
507
|
{ displayClock = clock(); DISPLAY(__VA_ARGS__); \
|
|
483
|
-
if (notificationLevel>=4) fflush(
|
|
508
|
+
if (notificationLevel>=4) fflush(stderr); } }
|
|
484
509
|
|
|
485
510
|
/* init */
|
|
486
511
|
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
|
@@ -521,7 +546,7 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|
|
521
546
|
if (doneMarks[cursor]) { cursor++; continue; }
|
|
522
547
|
solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
|
|
523
548
|
if (solution.length==0) { cursor++; continue; }
|
|
524
|
-
ZDICT_insertDictItem(dictList, dictListSize, solution);
|
|
549
|
+
ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
|
|
525
550
|
cursor += solution.length;
|
|
526
551
|
DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
|
|
527
552
|
} }
|
|
@@ -552,7 +577,7 @@ typedef struct
|
|
|
552
577
|
{
|
|
553
578
|
ZSTD_CCtx* ref;
|
|
554
579
|
ZSTD_CCtx* zc;
|
|
555
|
-
void* workPlace; /* must be
|
|
580
|
+
void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
|
|
556
581
|
} EStats_ress_t;
|
|
557
582
|
|
|
558
583
|
#define MAXREPOFFSET 1024
|
|
@@ -561,14 +586,14 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
|
|
561
586
|
U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
|
|
562
587
|
const void* src, size_t srcSize, U32 notificationLevel)
|
|
563
588
|
{
|
|
564
|
-
size_t const blockSizeMax = MIN (
|
|
589
|
+
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
|
|
565
590
|
size_t cSize;
|
|
566
591
|
|
|
567
592
|
if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
|
|
568
593
|
{ size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
|
|
569
594
|
if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
|
|
570
595
|
}
|
|
571
|
-
cSize = ZSTD_compressBlock(esr.zc, esr.workPlace,
|
|
596
|
+
cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
|
|
572
597
|
if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
|
|
573
598
|
|
|
574
599
|
if (cSize) { /* if == 0; block is not compressible */
|
|
@@ -610,17 +635,6 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
|
|
610
635
|
} } }
|
|
611
636
|
}
|
|
612
637
|
|
|
613
|
-
/*
|
|
614
|
-
static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
|
|
615
|
-
{
|
|
616
|
-
unsigned u;
|
|
617
|
-
size_t max=0;
|
|
618
|
-
for (u=0; u<nbFiles; u++)
|
|
619
|
-
if (max < fileSizes[u]) max = fileSizes[u];
|
|
620
|
-
return max;
|
|
621
|
-
}
|
|
622
|
-
*/
|
|
623
|
-
|
|
624
638
|
static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
|
|
625
639
|
{
|
|
626
640
|
size_t total=0;
|
|
@@ -676,26 +690,26 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
676
690
|
/* init */
|
|
677
691
|
esr.ref = ZSTD_createCCtx();
|
|
678
692
|
esr.zc = ZSTD_createCCtx();
|
|
679
|
-
esr.workPlace = malloc(
|
|
693
|
+
esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
|
|
680
694
|
if (!esr.ref || !esr.zc || !esr.workPlace) {
|
|
681
695
|
eSize = ERROR(memory_allocation);
|
|
682
696
|
DISPLAYLEVEL(1, "Not enough memory \n");
|
|
683
697
|
goto _cleanup;
|
|
684
698
|
}
|
|
685
|
-
if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(
|
|
686
|
-
for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
|
|
687
|
-
for (u=0; u<=offcodeMax; u++) offcodeCount[u]=1;
|
|
688
|
-
for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
|
|
689
|
-
for (u=0; u<=MaxLL; u++) litLengthCount[u]=1;
|
|
699
|
+
if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */
|
|
700
|
+
for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */
|
|
701
|
+
for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
|
|
702
|
+
for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
|
|
703
|
+
for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
|
|
690
704
|
memset(repOffset, 0, sizeof(repOffset));
|
|
691
705
|
repOffset[1] = repOffset[4] = repOffset[8] = 1;
|
|
692
706
|
memset(bestRepOffset, 0, sizeof(bestRepOffset));
|
|
693
|
-
if (compressionLevel
|
|
707
|
+
if (compressionLevel<=0) compressionLevel = g_compressionLevel_default;
|
|
694
708
|
params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
|
|
695
709
|
{ size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
|
|
696
|
-
|
|
710
|
+
if (ZSTD_isError(beginResult)) {
|
|
711
|
+
DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult));
|
|
697
712
|
eSize = ERROR(GENERIC);
|
|
698
|
-
DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n");
|
|
699
713
|
goto _cleanup;
|
|
700
714
|
} }
|
|
701
715
|
|
|
@@ -812,7 +826,6 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
|
812
826
|
MEM_writeLE32(dstPtr+4, repStartValue[1]);
|
|
813
827
|
MEM_writeLE32(dstPtr+8, repStartValue[2]);
|
|
814
828
|
#endif
|
|
815
|
-
//dstPtr += 12;
|
|
816
829
|
eSize += 12;
|
|
817
830
|
|
|
818
831
|
_cleanup:
|
|
@@ -831,7 +844,7 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
|
|
|
831
844
|
ZDICT_params_t params)
|
|
832
845
|
{
|
|
833
846
|
size_t hSize;
|
|
834
|
-
#define HBUFFSIZE 256
|
|
847
|
+
#define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
|
|
835
848
|
BYTE header[HBUFFSIZE];
|
|
836
849
|
int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
|
|
837
850
|
U32 const notificationLevel = params.notificationLevel;
|
|
@@ -842,7 +855,7 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
|
|
|
842
855
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
|
|
843
856
|
|
|
844
857
|
/* dictionary header */
|
|
845
|
-
MEM_writeLE32(header,
|
|
858
|
+
MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
|
|
846
859
|
{ U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
|
|
847
860
|
U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
|
|
848
861
|
U32 const dictID = params.dictID ? params.dictID : compliantID;
|
|
@@ -877,20 +890,11 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
|
|
|
877
890
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
|
878
891
|
ZDICT_params_t params)
|
|
879
892
|
{
|
|
880
|
-
size_t hSize;
|
|
881
893
|
int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
|
|
882
894
|
U32 const notificationLevel = params.notificationLevel;
|
|
895
|
+
size_t hSize = 8;
|
|
883
896
|
|
|
884
|
-
/*
|
|
885
|
-
MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
|
|
886
|
-
{ U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
|
|
887
|
-
U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
|
|
888
|
-
U32 const dictID = params.dictID ? params.dictID : compliantID;
|
|
889
|
-
MEM_writeLE32((char*)dictBuffer+4, dictID);
|
|
890
|
-
}
|
|
891
|
-
hSize = 8;
|
|
892
|
-
|
|
893
|
-
/* entropy tables */
|
|
897
|
+
/* calculate entropy tables */
|
|
894
898
|
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
|
895
899
|
DISPLAYLEVEL(2, "statistics ... \n");
|
|
896
900
|
{ size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
|
|
@@ -902,6 +906,13 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
|
|
|
902
906
|
hSize += eSize;
|
|
903
907
|
}
|
|
904
908
|
|
|
909
|
+
/* add dictionary header (after entropy tables) */
|
|
910
|
+
MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
|
|
911
|
+
{ U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
|
|
912
|
+
U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
|
|
913
|
+
U32 const dictID = params.dictID ? params.dictID : compliantID;
|
|
914
|
+
MEM_writeLE32((char*)dictBuffer+4, dictID);
|
|
915
|
+
}
|
|
905
916
|
|
|
906
917
|
if (hSize + dictContentSize < dictBufferCapacity)
|
|
907
918
|
memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
|
|
@@ -909,14 +920,14 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
|
|
|
909
920
|
}
|
|
910
921
|
|
|
911
922
|
|
|
912
|
-
/*!
|
|
923
|
+
/*! ZDICT_trainFromBuffer_unsafe_legacy() :
|
|
913
924
|
* Warning : `samplesBuffer` must be followed by noisy guard band.
|
|
914
925
|
* @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
|
|
915
926
|
*/
|
|
916
|
-
size_t
|
|
927
|
+
size_t ZDICT_trainFromBuffer_unsafe_legacy(
|
|
917
928
|
void* dictBuffer, size_t maxDictSize,
|
|
918
929
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
|
919
|
-
|
|
930
|
+
ZDICT_legacy_params_t params)
|
|
920
931
|
{
|
|
921
932
|
U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
|
|
922
933
|
dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
|
|
@@ -925,24 +936,24 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
|
|
925
936
|
size_t const targetDictSize = maxDictSize;
|
|
926
937
|
size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
|
|
927
938
|
size_t dictSize = 0;
|
|
928
|
-
U32 const notificationLevel = params.notificationLevel;
|
|
939
|
+
U32 const notificationLevel = params.zParams.notificationLevel;
|
|
929
940
|
|
|
930
941
|
/* checks */
|
|
931
942
|
if (!dictList) return ERROR(memory_allocation);
|
|
932
|
-
if (maxDictSize
|
|
933
|
-
if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return
|
|
943
|
+
if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */
|
|
944
|
+
if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */
|
|
934
945
|
|
|
935
946
|
/* init */
|
|
936
947
|
ZDICT_initDictItem(dictList);
|
|
937
948
|
|
|
938
949
|
/* build dictionary */
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
950
|
+
ZDICT_trainBuffer_legacy(dictList, dictListSize,
|
|
951
|
+
samplesBuffer, samplesBuffSize,
|
|
952
|
+
samplesSizes, nbSamples,
|
|
953
|
+
minRep, notificationLevel);
|
|
943
954
|
|
|
944
955
|
/* display best matches */
|
|
945
|
-
if (params.notificationLevel>= 3) {
|
|
956
|
+
if (params.zParams.notificationLevel>= 3) {
|
|
946
957
|
U32 const nb = MIN(25, dictList[0].pos);
|
|
947
958
|
U32 const dictContentSize = ZDICT_dictSize(dictList);
|
|
948
959
|
U32 u;
|
|
@@ -963,14 +974,15 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
|
|
963
974
|
|
|
964
975
|
/* create dictionary */
|
|
965
976
|
{ U32 dictContentSize = ZDICT_dictSize(dictList);
|
|
966
|
-
if (dictContentSize <
|
|
977
|
+
if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */
|
|
978
|
+
if (dictContentSize < targetDictSize/4) {
|
|
967
979
|
DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
|
|
980
|
+
if (samplesBuffSize < 10 * targetDictSize)
|
|
981
|
+
DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
|
|
968
982
|
if (minRep > MINRATIO) {
|
|
969
983
|
DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
|
|
970
984
|
DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
|
|
971
985
|
}
|
|
972
|
-
if (samplesBuffSize < 10 * targetDictSize)
|
|
973
|
-
DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
|
|
974
986
|
}
|
|
975
987
|
|
|
976
988
|
if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
|
|
@@ -978,7 +990,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
|
|
978
990
|
while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
|
|
979
991
|
DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
|
|
980
992
|
DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
|
|
981
|
-
DISPLAYLEVEL(2, "! always test dictionary efficiency on samples \n");
|
|
993
|
+
DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");
|
|
982
994
|
}
|
|
983
995
|
|
|
984
996
|
/* limit dictionary size */
|
|
@@ -1004,7 +1016,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
|
|
1004
1016
|
|
|
1005
1017
|
dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
|
|
1006
1018
|
samplesBuffer, samplesSizes, nbSamples,
|
|
1007
|
-
params);
|
|
1019
|
+
params.zParams);
|
|
1008
1020
|
}
|
|
1009
1021
|
|
|
1010
1022
|
/* clean up */
|
|
@@ -1015,9 +1027,9 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
|
|
1015
1027
|
|
|
1016
1028
|
/* issue : samplesBuffer need to be followed by a noisy guard band.
|
|
1017
1029
|
* work around : duplicate the buffer, and add the noise */
|
|
1018
|
-
size_t
|
|
1019
|
-
|
|
1020
|
-
|
|
1030
|
+
size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
|
|
1031
|
+
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
|
1032
|
+
ZDICT_legacy_params_t params)
|
|
1021
1033
|
{
|
|
1022
1034
|
size_t result;
|
|
1023
1035
|
void* newBuff;
|
|
@@ -1030,10 +1042,9 @@ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacit
|
|
|
1030
1042
|
memcpy(newBuff, samplesBuffer, sBuffSize);
|
|
1031
1043
|
ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
|
1032
1044
|
|
|
1033
|
-
result =
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
params);
|
|
1045
|
+
result =
|
|
1046
|
+
ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
|
|
1047
|
+
samplesSizes, nbSamples, params);
|
|
1037
1048
|
free(newBuff);
|
|
1038
1049
|
return result;
|
|
1039
1050
|
}
|
|
@@ -1042,11 +1053,15 @@ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacit
|
|
|
1042
1053
|
size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
|
|
1043
1054
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
|
|
1044
1055
|
{
|
|
1045
|
-
|
|
1056
|
+
ZDICT_cover_params_t params;
|
|
1046
1057
|
memset(&params, 0, sizeof(params));
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1058
|
+
params.d = 8;
|
|
1059
|
+
params.steps = 4;
|
|
1060
|
+
/* Default to level 6 since no compression level information is avaialble */
|
|
1061
|
+
params.zParams.compressionLevel = 6;
|
|
1062
|
+
return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
|
|
1063
|
+
samplesBuffer, samplesSizes,
|
|
1064
|
+
nbSamples, &params);
|
|
1050
1065
|
}
|
|
1051
1066
|
|
|
1052
1067
|
size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
|