extzstd 0.0.3.CONCEPT-x86-mingw32 → 0.1-x86-mingw32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/HISTORY.ja +5 -0
- data/LICENSE +6 -6
- data/README.md +35 -22
- data/contrib/zstd/LICENSE +13 -9
- data/contrib/zstd/README.md +37 -44
- data/contrib/zstd/common/entropy_common.c +33 -39
- data/contrib/zstd/common/error_private.c +43 -0
- data/contrib/zstd/common/error_private.h +11 -60
- data/contrib/zstd/common/fse.h +11 -5
- data/contrib/zstd/common/fse_decompress.c +14 -16
- data/contrib/zstd/common/huf.h +1 -1
- data/contrib/zstd/common/mem.h +36 -43
- data/contrib/zstd/common/xxhash.c +31 -18
- data/contrib/zstd/common/xxhash.h +71 -35
- data/contrib/zstd/common/zbuff.h +29 -35
- data/contrib/zstd/common/zstd_common.c +24 -32
- data/contrib/zstd/common/zstd_errors.h +60 -0
- data/contrib/zstd/common/zstd_internal.h +109 -80
- data/contrib/zstd/compress/fse_compress.c +9 -6
- data/contrib/zstd/compress/huf_compress.c +30 -74
- data/contrib/zstd/compress/zbuff_compress.c +43 -51
- data/contrib/zstd/compress/zstd_compress.c +953 -763
- data/contrib/zstd/compress/zstd_opt.h +115 -261
- data/contrib/zstd/decompress/huf_decompress.c +29 -40
- data/contrib/zstd/decompress/zbuff_decompress.c +36 -78
- data/contrib/zstd/decompress/zstd_decompress.c +976 -496
- data/contrib/zstd/dictBuilder/divsufsort.h +5 -5
- data/contrib/zstd/dictBuilder/zdict.c +194 -229
- data/contrib/zstd/dictBuilder/zdict.h +66 -68
- data/contrib/zstd/legacy/zstd_legacy.h +168 -49
- data/contrib/zstd/legacy/zstd_v01.c +95 -178
- data/contrib/zstd/legacy/zstd_v01.h +12 -32
- data/contrib/zstd/legacy/zstd_v02.c +48 -274
- data/contrib/zstd/legacy/zstd_v02.h +12 -32
- data/contrib/zstd/legacy/zstd_v03.c +48 -274
- data/contrib/zstd/legacy/zstd_v03.h +12 -32
- data/contrib/zstd/legacy/zstd_v04.c +63 -320
- data/contrib/zstd/legacy/zstd_v04.h +13 -33
- data/contrib/zstd/legacy/zstd_v05.c +80 -345
- data/contrib/zstd/legacy/zstd_v05.h +9 -31
- data/contrib/zstd/legacy/zstd_v06.c +48 -458
- data/contrib/zstd/legacy/zstd_v06.h +41 -67
- data/contrib/zstd/legacy/zstd_v07.c +4544 -0
- data/contrib/zstd/legacy/zstd_v07.h +173 -0
- data/contrib/zstd/zstd.h +640 -0
- data/ext/extconf.rb +7 -3
- data/ext/extzstd.c +263 -106
- data/ext/extzstd.h +8 -6
- data/ext/extzstd_nogvls.h +0 -117
- data/ext/extzstd_stream.c +347 -0
- data/ext/zstd_common.c +8 -0
- data/ext/zstd_compress.c +6 -0
- data/ext/zstd_decompress.c +5 -0
- data/ext/zstd_dictbuilder.c +5 -0
- data/ext/zstd_legacy_v07.c +1 -0
- data/gemstub.rb +18 -16
- data/lib/2.1/extzstd.so +0 -0
- data/lib/2.2/extzstd.so +0 -0
- data/lib/2.3/extzstd.so +0 -0
- data/lib/extzstd/version.rb +1 -1
- data/lib/extzstd.rb +77 -43
- data/test/test_basic.rb +11 -6
- metadata +23 -11
- data/contrib/zstd/common/error_public.h +0 -77
- data/contrib/zstd/common/zstd.h +0 -475
- data/ext/extzstd_buffered.c +0 -265
- data/ext/zstd_amalgam.c +0 -18
- data/lib/2.0/extzstd.so +0 -0
@@ -1,40 +1,18 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
* Redistributions of source code must retain the above copyright
|
12
|
-
notice, this list of conditions and the following disclaimer.
|
13
|
-
* Redistributions in binary form must reproduce the above
|
14
|
-
copyright notice, this list of conditions and the following disclaimer
|
15
|
-
in the documentation and/or other materials provided with the
|
16
|
-
distribution.
|
17
|
-
|
18
|
-
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
19
|
-
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
20
|
-
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
21
|
-
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
22
|
-
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
23
|
-
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
24
|
-
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
25
|
-
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
26
|
-
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
27
|
-
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
28
|
-
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
29
|
-
|
30
|
-
You can contact the author at :
|
31
|
-
- Zstd homepage : https://www.zstd.net
|
32
|
-
*/
|
1
|
+
/**
|
2
|
+
* Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
|
3
|
+
* All rights reserved.
|
4
|
+
*
|
5
|
+
* This source code is licensed under the BSD-style license found in the
|
6
|
+
* LICENSE file in the root directory of this source tree. An additional grant
|
7
|
+
* of patent rights can be found in the PATENTS file in the same directory.
|
8
|
+
*/
|
9
|
+
|
33
10
|
|
34
11
|
/*-**************************************
|
35
12
|
* Tuning parameters
|
36
13
|
****************************************/
|
37
14
|
#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
|
15
|
+
#define ZDICT_MIN_SAMPLES_SIZE 512
|
38
16
|
|
39
17
|
|
40
18
|
/*-**************************************
|
@@ -78,14 +56,12 @@
|
|
78
56
|
#define MB *(1 <<20)
|
79
57
|
#define GB *(1U<<30)
|
80
58
|
|
81
|
-
#define
|
59
|
+
#define DICTLISTSIZE_DEFAULT 10000
|
82
60
|
|
83
61
|
#define NOISELENGTH 32
|
84
|
-
#define PRIME1 2654435761U
|
85
|
-
#define PRIME2 2246822519U
|
86
62
|
|
87
63
|
#define MINRATIO 4
|
88
|
-
static const
|
64
|
+
static const int g_compressionLevel_default = 5;
|
89
65
|
static const U32 g_selectivity_default = 9;
|
90
66
|
static const size_t g_provision_entropySize = 200;
|
91
67
|
static const size_t g_min_fast_dictContent = 192;
|
@@ -95,26 +71,18 @@ static const size_t g_min_fast_dictContent = 192;
|
|
95
71
|
* Console display
|
96
72
|
***************************************/
|
97
73
|
#define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
|
98
|
-
#define DISPLAYLEVEL(l, ...) if (
|
99
|
-
static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
|
100
|
-
|
101
|
-
#define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
|
102
|
-
if (ZDICT_clockSpan(g_time) > refreshRate) \
|
103
|
-
{ g_time = clock(); DISPLAY(__VA_ARGS__); \
|
104
|
-
if (g_displayLevel>=4) fflush(stdout); } }
|
105
|
-
static const clock_t refreshRate = CLOCKS_PER_SEC * 3 / 10;
|
106
|
-
static clock_t g_time = 0;
|
74
|
+
#define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
|
107
75
|
|
108
76
|
static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
|
109
77
|
|
110
|
-
static void ZDICT_printHex(
|
78
|
+
static void ZDICT_printHex(const void* ptr, size_t length)
|
111
79
|
{
|
112
80
|
const BYTE* const b = (const BYTE*)ptr;
|
113
81
|
size_t u;
|
114
82
|
for (u=0; u<length; u++) {
|
115
83
|
BYTE c = b[u];
|
116
84
|
if (c<32 || c>126) c = '.'; /* non-printable char */
|
117
|
-
|
85
|
+
DISPLAY("%c", c);
|
118
86
|
}
|
119
87
|
}
|
120
88
|
|
@@ -126,6 +94,13 @@ unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
|
|
126
94
|
|
127
95
|
const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
|
128
96
|
|
97
|
+
unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
|
98
|
+
{
|
99
|
+
if (dictSize < 8) return 0;
|
100
|
+
if (MEM_readLE32(dictBuffer) != ZSTD_DICT_MAGIC) return 0;
|
101
|
+
return MEM_readLE32((const char*)dictBuffer + 4);
|
102
|
+
}
|
103
|
+
|
129
104
|
|
130
105
|
/*-********************************************************
|
131
106
|
* Dictionary training functions
|
@@ -228,7 +203,7 @@ static void ZDICT_initDictItem(dictItem* d)
|
|
228
203
|
static dictItem ZDICT_analyzePos(
|
229
204
|
BYTE* doneMarks,
|
230
205
|
const int* suffix, U32 start,
|
231
|
-
const void* buffer, U32 minRatio)
|
206
|
+
const void* buffer, U32 minRatio, U32 notificationLevel)
|
232
207
|
{
|
233
208
|
U32 lengthList[LLIMIT] = {0};
|
234
209
|
U32 cumulLength[LLIMIT] = {0};
|
@@ -332,12 +307,13 @@ static dictItem ZDICT_analyzePos(
|
|
332
307
|
} while (length >=MINMATCHLENGTH);
|
333
308
|
|
334
309
|
/* look backward */
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
310
|
+
length = MINMATCHLENGTH;
|
311
|
+
while ((length >= MINMATCHLENGTH) & (start > 0)) {
|
312
|
+
length = ZDICT_count(b + pos, b + suffix[start - 1]);
|
313
|
+
if (length >= LLIMIT) length = LLIMIT - 1;
|
314
|
+
lengthList[length]++;
|
315
|
+
if (length >= MINMATCHLENGTH) start--;
|
316
|
+
}
|
341
317
|
|
342
318
|
/* largest useful length */
|
343
319
|
memset(cumulLength, 0, sizeof(cumulLength));
|
@@ -395,21 +371,22 @@ static dictItem ZDICT_analyzePos(
|
|
395
371
|
static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
396
372
|
{
|
397
373
|
const U32 tableSize = table->pos;
|
398
|
-
const U32
|
374
|
+
const U32 eltEnd = elt.pos + elt.length;
|
399
375
|
|
400
376
|
/* tail overlap */
|
401
377
|
U32 u; for (u=1; u<tableSize; u++) {
|
402
378
|
if (u==eltNbToSkip) continue;
|
403
|
-
if ((table[u].pos > elt.pos) && (table[u].pos
|
379
|
+
if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */
|
404
380
|
/* append */
|
405
381
|
U32 addedLength = table[u].pos - elt.pos;
|
406
382
|
table[u].length += addedLength;
|
407
383
|
table[u].pos = elt.pos;
|
408
384
|
table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
|
409
|
-
table[u].savings += elt.length / 8; /* rough approx */
|
385
|
+
table[u].savings += elt.length / 8; /* rough approx bonus */
|
410
386
|
elt = table[u];
|
387
|
+
/* sort : improve rank */
|
411
388
|
while ((u>1) && (table[u-1].savings < elt.savings))
|
412
|
-
|
389
|
+
table[u] = table[u-1], u--;
|
413
390
|
table[u] = elt;
|
414
391
|
return u;
|
415
392
|
} }
|
@@ -417,14 +394,15 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
|
417
394
|
/* front overlap */
|
418
395
|
for (u=1; u<tableSize; u++) {
|
419
396
|
if (u==eltNbToSkip) continue;
|
420
|
-
if ((table[u].pos + table[u].length
|
397
|
+
if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
|
421
398
|
/* append */
|
422
|
-
int addedLength = (
|
423
|
-
table[u].savings += elt.length / 8; /* rough approx */
|
424
|
-
if (addedLength > 0) { /* otherwise,
|
399
|
+
int addedLength = (int)eltEnd - (table[u].pos + table[u].length);
|
400
|
+
table[u].savings += elt.length / 8; /* rough approx bonus */
|
401
|
+
if (addedLength > 0) { /* otherwise, elt fully included into existing */
|
425
402
|
table[u].length += addedLength;
|
426
403
|
table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
|
427
404
|
}
|
405
|
+
/* sort : improve rank */
|
428
406
|
elt = table[u];
|
429
407
|
while ((u>1) && (table[u-1].savings < elt.savings))
|
430
408
|
table[u] = table[u-1], u--;
|
@@ -489,15 +467,21 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
|
|
489
467
|
static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
490
468
|
const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
|
491
469
|
const size_t* fileSizes, unsigned nbFiles,
|
492
|
-
U32
|
470
|
+
U32 minRatio, U32 notificationLevel)
|
493
471
|
{
|
494
472
|
int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
|
495
473
|
int* const suffix = suffix0+1;
|
496
474
|
U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
|
497
475
|
BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
|
498
476
|
U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
|
499
|
-
U32 minRatio = nbFiles >> shiftRatio;
|
500
477
|
size_t result = 0;
|
478
|
+
clock_t displayClock = 0;
|
479
|
+
clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
|
480
|
+
|
481
|
+
# define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
|
482
|
+
if (ZDICT_clockSpan(displayClock) > refreshRate) \
|
483
|
+
{ displayClock = clock(); DISPLAY(__VA_ARGS__); \
|
484
|
+
if (notificationLevel>=4) fflush(stdout); } }
|
501
485
|
|
502
486
|
/* init */
|
503
487
|
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
@@ -523,7 +507,8 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|
523
507
|
{ size_t pos;
|
524
508
|
for (pos=0; pos < bufferSize; pos++)
|
525
509
|
reverseSuffix[suffix[pos]] = (U32)pos;
|
526
|
-
/*
|
510
|
+
/* note filePos tracks borders between samples.
|
511
|
+
It's not used at this stage, but planned to become useful in a later update */
|
527
512
|
filePos[0] = 0;
|
528
513
|
for (pos=1; pos<nbFiles; pos++)
|
529
514
|
filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
|
@@ -535,23 +520,13 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|
535
520
|
{ U32 cursor; for (cursor=0; cursor < bufferSize; ) {
|
536
521
|
dictItem solution;
|
537
522
|
if (doneMarks[cursor]) { cursor++; continue; }
|
538
|
-
solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
|
523
|
+
solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
|
539
524
|
if (solution.length==0) { cursor++; continue; }
|
540
525
|
ZDICT_insertDictItem(dictList, dictListSize, solution);
|
541
526
|
cursor += solution.length;
|
542
527
|
DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
|
543
528
|
} }
|
544
529
|
|
545
|
-
/* limit dictionary size */
|
546
|
-
{ U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
|
547
|
-
U32 currentSize = 0;
|
548
|
-
U32 n; for (n=1; n<max; n++) {
|
549
|
-
currentSize += dictList[n].length;
|
550
|
-
if (currentSize > maxDictSize) break;
|
551
|
-
}
|
552
|
-
dictList->pos = n;
|
553
|
-
}
|
554
|
-
|
555
530
|
_cleanup:
|
556
531
|
free(suffix0);
|
557
532
|
free(reverseSuffix);
|
@@ -563,10 +538,12 @@ _cleanup:
|
|
563
538
|
|
564
539
|
static void ZDICT_fillNoise(void* buffer, size_t length)
|
565
540
|
{
|
566
|
-
unsigned
|
541
|
+
unsigned const prime1 = 2654435761U;
|
542
|
+
unsigned const prime2 = 2246822519U;
|
543
|
+
unsigned acc = prime1;
|
567
544
|
size_t p=0;;
|
568
545
|
for (p=0; p<length; p++) {
|
569
|
-
acc *=
|
546
|
+
acc *= prime2;
|
570
547
|
((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
|
571
548
|
}
|
572
549
|
}
|
@@ -576,23 +553,23 @@ typedef struct
|
|
576
553
|
{
|
577
554
|
ZSTD_CCtx* ref;
|
578
555
|
ZSTD_CCtx* zc;
|
579
|
-
void* workPlace; /* must be
|
556
|
+
void* workPlace; /* must be ZSTD_BLOCKSIZE_ABSOLUTEMAX allocated */
|
580
557
|
} EStats_ress_t;
|
581
558
|
|
582
559
|
#define MAXREPOFFSET 1024
|
583
560
|
|
584
561
|
static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
585
562
|
U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
|
586
|
-
const void* src, size_t srcSize)
|
563
|
+
const void* src, size_t srcSize, U32 notificationLevel)
|
587
564
|
{
|
588
|
-
size_t const blockSizeMax = MIN (
|
565
|
+
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << params.cParams.windowLog);
|
589
566
|
size_t cSize;
|
590
567
|
|
591
568
|
if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
cSize = ZSTD_compressBlock(esr.zc, esr.workPlace,
|
569
|
+
{ size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
|
570
|
+
if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
|
571
|
+
}
|
572
|
+
cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_ABSOLUTEMAX, src, srcSize);
|
596
573
|
if (ZSTD_isError(cSize)) { DISPLAYLEVEL(1, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
|
597
574
|
|
598
575
|
if (cSize) { /* if == 0; block is not compressible */
|
@@ -605,34 +582,33 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
|
605
582
|
}
|
606
583
|
|
607
584
|
/* seqStats */
|
608
|
-
{
|
609
|
-
ZSTD_seqToCodes(seqStorePtr
|
585
|
+
{ U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
|
586
|
+
ZSTD_seqToCodes(seqStorePtr);
|
610
587
|
|
611
|
-
{ const BYTE* codePtr = seqStorePtr->
|
612
|
-
|
588
|
+
{ const BYTE* codePtr = seqStorePtr->ofCode;
|
589
|
+
U32 u;
|
613
590
|
for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
|
614
591
|
}
|
615
592
|
|
616
|
-
{ const BYTE* codePtr = seqStorePtr->
|
617
|
-
|
593
|
+
{ const BYTE* codePtr = seqStorePtr->mlCode;
|
594
|
+
U32 u;
|
618
595
|
for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
|
619
596
|
}
|
620
597
|
|
621
|
-
{ const BYTE* codePtr = seqStorePtr->
|
622
|
-
|
598
|
+
{ const BYTE* codePtr = seqStorePtr->llCode;
|
599
|
+
U32 u;
|
623
600
|
for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
|
624
|
-
|
601
|
+
}
|
625
602
|
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
}
|
603
|
+
if (nbSeq >= 2) { /* rep offsets */
|
604
|
+
const seqDef* const seq = seqStorePtr->sequencesStart;
|
605
|
+
U32 offset1 = seq[0].offset - 3;
|
606
|
+
U32 offset2 = seq[1].offset - 3;
|
607
|
+
if (offset1 >= MAXREPOFFSET) offset1 = 0;
|
608
|
+
if (offset2 >= MAXREPOFFSET) offset2 = 0;
|
609
|
+
repOffsets[offset1] += 3;
|
610
|
+
repOffsets[offset2] += 1;
|
611
|
+
} } }
|
636
612
|
}
|
637
613
|
|
638
614
|
/*
|
@@ -671,60 +647,65 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
|
|
671
647
|
}
|
672
648
|
|
673
649
|
|
674
|
-
#define OFFCODE_MAX
|
650
|
+
#define OFFCODE_MAX 30 /* only applicable to first block */
|
675
651
|
static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
676
|
-
|
677
|
-
|
678
|
-
|
652
|
+
unsigned compressionLevel,
|
653
|
+
const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
|
654
|
+
const void* dictBuffer, size_t dictBufferSize,
|
655
|
+
unsigned notificationLevel)
|
679
656
|
{
|
680
657
|
U32 countLit[256];
|
681
658
|
HUF_CREATE_STATIC_CTABLE(hufTable, 255);
|
682
659
|
U32 offcodeCount[OFFCODE_MAX+1];
|
683
660
|
short offcodeNCount[OFFCODE_MAX+1];
|
661
|
+
U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
|
684
662
|
U32 matchLengthCount[MaxML+1];
|
685
663
|
short matchLengthNCount[MaxML+1];
|
686
664
|
U32 litLengthCount[MaxLL+1];
|
687
665
|
short litLengthNCount[MaxLL+1];
|
688
|
-
U32 repOffset[MAXREPOFFSET]
|
666
|
+
U32 repOffset[MAXREPOFFSET];
|
689
667
|
offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
|
690
668
|
EStats_ress_t esr;
|
691
669
|
ZSTD_parameters params;
|
692
|
-
U32 u, huffLog =
|
670
|
+
U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
|
693
671
|
size_t pos = 0, errorCode;
|
694
672
|
size_t eSize = 0;
|
695
673
|
size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
|
696
|
-
size_t const averageSampleSize = totalSrcSize / nbFiles;
|
674
|
+
size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
|
697
675
|
BYTE* dstPtr = (BYTE*)dstBuffer;
|
698
676
|
|
699
677
|
/* init */
|
678
|
+
esr.ref = ZSTD_createCCtx();
|
679
|
+
esr.zc = ZSTD_createCCtx();
|
680
|
+
esr.workPlace = malloc(ZSTD_BLOCKSIZE_ABSOLUTEMAX);
|
681
|
+
if (!esr.ref || !esr.zc || !esr.workPlace) {
|
682
|
+
eSize = ERROR(memory_allocation);
|
683
|
+
DISPLAYLEVEL(1, "Not enough memory \n");
|
684
|
+
goto _cleanup;
|
685
|
+
}
|
686
|
+
if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionary_wrong); goto _cleanup; } /* too large dictionary */
|
700
687
|
for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
|
701
|
-
for (u=0; u<=
|
688
|
+
for (u=0; u<=offcodeMax; u++) offcodeCount[u]=1;
|
702
689
|
for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
|
703
690
|
for (u=0; u<=MaxLL; u++) litLengthCount[u]=1;
|
691
|
+
memset(repOffset, 0, sizeof(repOffset));
|
704
692
|
repOffset[1] = repOffset[4] = repOffset[8] = 1;
|
705
693
|
memset(bestRepOffset, 0, sizeof(bestRepOffset));
|
706
|
-
esr.ref = ZSTD_createCCtx();
|
707
|
-
esr.zc = ZSTD_createCCtx();
|
708
|
-
esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
|
709
|
-
if (!esr.ref || !esr.zc || !esr.workPlace) {
|
710
|
-
eSize = ERROR(memory_allocation);
|
711
|
-
DISPLAYLEVEL(1, "Not enough memory");
|
712
|
-
goto _cleanup;
|
713
|
-
}
|
714
694
|
if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
|
715
695
|
params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
|
716
|
-
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
696
|
+
{ size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
|
697
|
+
if (ZSTD_isError(beginResult)) {
|
698
|
+
eSize = ERROR(GENERIC);
|
699
|
+
DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n");
|
700
|
+
goto _cleanup;
|
701
|
+
} }
|
722
702
|
|
723
703
|
/* collect stats on all files */
|
724
704
|
for (u=0; u<nbFiles; u++) {
|
725
705
|
ZDICT_countEStats(esr, params,
|
726
|
-
|
727
|
-
|
706
|
+
countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
|
707
|
+
(const char*)srcBuffer + pos, fileSizes[u],
|
708
|
+
notificationLevel);
|
728
709
|
pos += fileSizes[u];
|
729
710
|
}
|
730
711
|
|
@@ -732,7 +713,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
732
713
|
errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
|
733
714
|
if (HUF_isError(errorCode)) {
|
734
715
|
eSize = ERROR(GENERIC);
|
735
|
-
DISPLAYLEVEL(1, "HUF_buildCTable error");
|
716
|
+
DISPLAYLEVEL(1, "HUF_buildCTable error \n");
|
736
717
|
goto _cleanup;
|
737
718
|
}
|
738
719
|
huffLog = (U32)errorCode;
|
@@ -744,11 +725,11 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
744
725
|
}
|
745
726
|
/* note : the result of this phase should be used to better appreciate the impact on statistics */
|
746
727
|
|
747
|
-
total=0; for (u=0; u<=
|
748
|
-
errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total,
|
728
|
+
total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
|
729
|
+
errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
|
749
730
|
if (FSE_isError(errorCode)) {
|
750
731
|
eSize = ERROR(GENERIC);
|
751
|
-
DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount");
|
732
|
+
DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
|
752
733
|
goto _cleanup;
|
753
734
|
}
|
754
735
|
Offlog = (U32)errorCode;
|
@@ -757,7 +738,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
757
738
|
errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
|
758
739
|
if (FSE_isError(errorCode)) {
|
759
740
|
eSize = ERROR(GENERIC);
|
760
|
-
DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount");
|
741
|
+
DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
|
761
742
|
goto _cleanup;
|
762
743
|
}
|
763
744
|
mlLog = (U32)errorCode;
|
@@ -766,17 +747,16 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
766
747
|
errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
|
767
748
|
if (FSE_isError(errorCode)) {
|
768
749
|
eSize = ERROR(GENERIC);
|
769
|
-
DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount");
|
750
|
+
DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
|
770
751
|
goto _cleanup;
|
771
752
|
}
|
772
753
|
llLog = (U32)errorCode;
|
773
754
|
|
774
|
-
|
775
755
|
/* write result to buffer */
|
776
756
|
{ size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
|
777
757
|
if (HUF_isError(hhSize)) {
|
778
758
|
eSize = ERROR(GENERIC);
|
779
|
-
DISPLAYLEVEL(1, "HUF_writeCTable error");
|
759
|
+
DISPLAYLEVEL(1, "HUF_writeCTable error \n");
|
780
760
|
goto _cleanup;
|
781
761
|
}
|
782
762
|
dstPtr += hhSize;
|
@@ -787,7 +767,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
787
767
|
{ size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
|
788
768
|
if (FSE_isError(ohSize)) {
|
789
769
|
eSize = ERROR(GENERIC);
|
790
|
-
DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount");
|
770
|
+
DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
|
791
771
|
goto _cleanup;
|
792
772
|
}
|
793
773
|
dstPtr += ohSize;
|
@@ -798,7 +778,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
798
778
|
{ size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
|
799
779
|
if (FSE_isError(mhSize)) {
|
800
780
|
eSize = ERROR(GENERIC);
|
801
|
-
DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount");
|
781
|
+
DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
|
802
782
|
goto _cleanup;
|
803
783
|
}
|
804
784
|
dstPtr += mhSize;
|
@@ -809,7 +789,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
809
789
|
{ size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
|
810
790
|
if (FSE_isError(lhSize)) {
|
811
791
|
eSize = ERROR(GENERIC);
|
812
|
-
DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount");
|
792
|
+
DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
|
813
793
|
goto _cleanup;
|
814
794
|
}
|
815
795
|
dstPtr += lhSize;
|
@@ -819,7 +799,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
819
799
|
|
820
800
|
if (maxDstSize<12) {
|
821
801
|
eSize = ERROR(GENERIC);
|
822
|
-
DISPLAYLEVEL(1, "not enough space to write RepOffsets");
|
802
|
+
DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
|
823
803
|
goto _cleanup;
|
824
804
|
}
|
825
805
|
# if 0
|
@@ -833,7 +813,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
833
813
|
MEM_writeLE32(dstPtr+4, repStartValue[1]);
|
834
814
|
MEM_writeLE32(dstPtr+8, repStartValue[2]);
|
835
815
|
#endif
|
836
|
-
dstPtr += 12;
|
816
|
+
//dstPtr += 12;
|
837
817
|
eSize += 12;
|
838
818
|
|
839
819
|
_cleanup:
|
@@ -845,51 +825,13 @@ _cleanup:
|
|
845
825
|
}
|
846
826
|
|
847
827
|
|
848
|
-
#define DIB_FASTSEGMENTSIZE 64
|
849
|
-
/*! ZDICT_fastSampling() (based on an idea proposed by Giuseppe Ottaviano) :
|
850
|
-
Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer`,
|
851
|
-
up to `dictSize`.
|
852
|
-
Filling starts from the end of `dictBuffer`, down to maximum possible.
|
853
|
-
if `dictSize` is not a multiply of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used.
|
854
|
-
@return : amount of data written into `dictBuffer`,
|
855
|
-
or an error code
|
856
|
-
*/
|
857
|
-
static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize,
|
858
|
-
const void* samplesBuffer, size_t samplesSize)
|
859
|
-
{
|
860
|
-
char* dstPtr = (char*)dictBuffer + dictSize;
|
861
|
-
const char* srcPtr = (const char*)samplesBuffer;
|
862
|
-
size_t const nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
|
863
|
-
size_t segNb, interSize;
|
864
|
-
|
865
|
-
if (nbSegments <= 2) return ERROR(srcSize_wrong);
|
866
|
-
if (samplesSize < dictSize) return ERROR(srcSize_wrong);
|
867
|
-
|
868
|
-
/* first and last segments are part of dictionary, in case they contain interesting header/footer */
|
869
|
-
dstPtr -= DIB_FASTSEGMENTSIZE;
|
870
|
-
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
|
871
|
-
dstPtr -= DIB_FASTSEGMENTSIZE;
|
872
|
-
memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
|
873
|
-
|
874
|
-
/* regularly copy a segment */
|
875
|
-
interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);
|
876
|
-
srcPtr += DIB_FASTSEGMENTSIZE;
|
877
|
-
for (segNb=2; segNb < nbSegments; segNb++) {
|
878
|
-
srcPtr += interSize;
|
879
|
-
dstPtr -= DIB_FASTSEGMENTSIZE;
|
880
|
-
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
|
881
|
-
srcPtr += DIB_FASTSEGMENTSIZE;
|
882
|
-
}
|
883
|
-
|
884
|
-
return nbSegments * DIB_FASTSEGMENTSIZE;
|
885
|
-
}
|
886
|
-
|
887
828
|
size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
|
888
829
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
889
830
|
ZDICT_params_t params)
|
890
831
|
{
|
891
832
|
size_t hSize;
|
892
|
-
|
833
|
+
int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
|
834
|
+
U32 const notificationLevel = params.notificationLevel;
|
893
835
|
|
894
836
|
/* dictionary header */
|
895
837
|
MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
|
@@ -903,10 +845,15 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
|
|
903
845
|
/* entropy tables */
|
904
846
|
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
905
847
|
DISPLAYLEVEL(2, "statistics ... \n");
|
906
|
-
|
848
|
+
{ size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
|
907
849
|
compressionLevel,
|
908
850
|
samplesBuffer, samplesSizes, nbSamples,
|
909
|
-
(char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize
|
851
|
+
(char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,
|
852
|
+
notificationLevel);
|
853
|
+
if (ZDICT_isError(eSize)) return eSize;
|
854
|
+
hSize += eSize;
|
855
|
+
}
|
856
|
+
|
910
857
|
|
911
858
|
if (hSize + dictContentSize < dictBufferCapacity)
|
912
859
|
memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
|
@@ -914,60 +861,86 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
|
|
914
861
|
}
|
915
862
|
|
916
863
|
|
917
|
-
#define DIB_MINSAMPLESSIZE (DIB_FASTSEGMENTSIZE*3)
|
918
864
|
/*! ZDICT_trainFromBuffer_unsafe() :
|
919
|
-
* `samplesBuffer` must be followed by noisy guard band.
|
920
|
-
* @return : size of dictionary
|
865
|
+
* Warning : `samplesBuffer` must be followed by noisy guard band.
|
866
|
+
* @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
|
921
867
|
*/
|
922
868
|
size_t ZDICT_trainFromBuffer_unsafe(
|
923
869
|
void* dictBuffer, size_t maxDictSize,
|
924
870
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
925
871
|
ZDICT_params_t params)
|
926
872
|
{
|
927
|
-
U32 const dictListSize = MAX(
|
873
|
+
U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
|
928
874
|
dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
|
929
|
-
unsigned selectivity = params.selectivityLevel;
|
875
|
+
unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
|
876
|
+
unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
|
930
877
|
size_t const targetDictSize = maxDictSize;
|
931
|
-
size_t
|
878
|
+
size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
|
932
879
|
size_t dictSize = 0;
|
880
|
+
U32 const notificationLevel = params.notificationLevel;
|
933
881
|
|
934
882
|
/* checks */
|
935
883
|
if (!dictList) return ERROR(memory_allocation);
|
936
884
|
if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
|
885
|
+
if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
|
937
886
|
|
938
887
|
/* init */
|
939
|
-
{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
|
940
|
-
if (sBuffSize < DIB_MINSAMPLESSIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
|
941
888
|
ZDICT_initDictItem(dictList);
|
942
|
-
g_displayLevel = params.notificationLevel;
|
943
|
-
if (selectivity==0) selectivity = g_selectivity_default;
|
944
889
|
|
945
890
|
/* build dictionary */
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
959
|
-
|
960
|
-
|
961
|
-
|
962
|
-
|
963
|
-
|
964
|
-
|
965
|
-
|
966
|
-
|
967
|
-
|
891
|
+
ZDICT_trainBuffer(dictList, dictListSize,
|
892
|
+
samplesBuffer, samplesBuffSize,
|
893
|
+
samplesSizes, nbSamples,
|
894
|
+
minRep, notificationLevel);
|
895
|
+
|
896
|
+
/* display best matches */
|
897
|
+
if (params.notificationLevel>= 3) {
|
898
|
+
U32 const nb = MIN(25, dictList[0].pos);
|
899
|
+
U32 const dictContentSize = ZDICT_dictSize(dictList);
|
900
|
+
U32 u;
|
901
|
+
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
|
902
|
+
DISPLAYLEVEL(3, "list %u best segments \n", nb);
|
903
|
+
for (u=1; u<=nb; u++) {
|
904
|
+
U32 pos = dictList[u].pos;
|
905
|
+
U32 length = dictList[u].length;
|
906
|
+
U32 printedLength = MIN(40, length);
|
907
|
+
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
|
908
|
+
u, length, pos, dictList[u].savings);
|
909
|
+
ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
|
910
|
+
DISPLAYLEVEL(3, "| \n");
|
911
|
+
} }
|
912
|
+
|
968
913
|
|
969
914
|
/* create dictionary */
|
970
915
|
{ U32 dictContentSize = ZDICT_dictSize(dictList);
|
916
|
+
if (dictContentSize < targetDictSize/3) {
|
917
|
+
DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
|
918
|
+
if (minRep > MINRATIO) {
|
919
|
+
DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
|
920
|
+
DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
|
921
|
+
}
|
922
|
+
if (samplesBuffSize < 10 * targetDictSize)
|
923
|
+
DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
|
924
|
+
}
|
925
|
+
|
926
|
+
if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
|
927
|
+
U32 proposedSelectivity = selectivity-1;
|
928
|
+
while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
|
929
|
+
DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
|
930
|
+
DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
|
931
|
+
DISPLAYLEVEL(2, "! always test dictionary efficiency on samples \n");
|
932
|
+
}
|
933
|
+
|
934
|
+
/* limit dictionary size */
|
935
|
+
{ U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
|
936
|
+
U32 currentSize = 0;
|
937
|
+
U32 n; for (n=1; n<max; n++) {
|
938
|
+
currentSize += dictList[n].length;
|
939
|
+
if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }
|
940
|
+
}
|
941
|
+
dictList->pos = n;
|
942
|
+
dictContentSize = currentSize;
|
943
|
+
}
|
971
944
|
|
972
945
|
/* build dict content */
|
973
946
|
{ U32 u;
|
@@ -979,14 +952,6 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
|
979
952
|
memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
|
980
953
|
} }
|
981
954
|
|
982
|
-
/* fast mode dict content */
|
983
|
-
if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */
|
984
|
-
DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */
|
985
|
-
DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10));
|
986
|
-
dictContentSize = (U32)ZDICT_fastSampling(dictBuffer, targetDictSize,
|
987
|
-
samplesBuffer, sBuffSize);
|
988
|
-
}
|
989
|
-
|
990
955
|
dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
|
991
956
|
samplesBuffer, samplesSizes, nbSamples,
|
992
957
|
params);
|
@@ -1004,23 +969,23 @@ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacit
|
|
1004
969
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
1005
970
|
ZDICT_params_t params)
|
1006
971
|
{
|
972
|
+
size_t result;
|
1007
973
|
void* newBuff;
|
1008
|
-
size_t sBuffSize;
|
974
|
+
size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
|
975
|
+
if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */
|
1009
976
|
|
1010
|
-
{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
|
1011
|
-
if (sBuffSize==0) return 0; /* empty content => no dictionary */
|
1012
977
|
newBuff = malloc(sBuffSize + NOISELENGTH);
|
1013
978
|
if (!newBuff) return ERROR(memory_allocation);
|
1014
979
|
|
1015
980
|
memcpy(newBuff, samplesBuffer, sBuffSize);
|
1016
981
|
ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
1017
982
|
|
1018
|
-
|
983
|
+
result = ZDICT_trainFromBuffer_unsafe(
|
1019
984
|
dictBuffer, dictBufferCapacity,
|
1020
985
|
newBuff, samplesSizes, nbSamples,
|
1021
986
|
params);
|
1022
|
-
|
1023
|
-
|
987
|
+
free(newBuff);
|
988
|
+
return result;
|
1024
989
|
}
|
1025
990
|
|
1026
991
|
|