extzstd 0.0.3.CONCEPT-x86-mingw32 → 0.1-x86-mingw32
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/HISTORY.ja +5 -0
- data/LICENSE +6 -6
- data/README.md +35 -22
- data/contrib/zstd/LICENSE +13 -9
- data/contrib/zstd/README.md +37 -44
- data/contrib/zstd/common/entropy_common.c +33 -39
- data/contrib/zstd/common/error_private.c +43 -0
- data/contrib/zstd/common/error_private.h +11 -60
- data/contrib/zstd/common/fse.h +11 -5
- data/contrib/zstd/common/fse_decompress.c +14 -16
- data/contrib/zstd/common/huf.h +1 -1
- data/contrib/zstd/common/mem.h +36 -43
- data/contrib/zstd/common/xxhash.c +31 -18
- data/contrib/zstd/common/xxhash.h +71 -35
- data/contrib/zstd/common/zbuff.h +29 -35
- data/contrib/zstd/common/zstd_common.c +24 -32
- data/contrib/zstd/common/zstd_errors.h +60 -0
- data/contrib/zstd/common/zstd_internal.h +109 -80
- data/contrib/zstd/compress/fse_compress.c +9 -6
- data/contrib/zstd/compress/huf_compress.c +30 -74
- data/contrib/zstd/compress/zbuff_compress.c +43 -51
- data/contrib/zstd/compress/zstd_compress.c +953 -763
- data/contrib/zstd/compress/zstd_opt.h +115 -261
- data/contrib/zstd/decompress/huf_decompress.c +29 -40
- data/contrib/zstd/decompress/zbuff_decompress.c +36 -78
- data/contrib/zstd/decompress/zstd_decompress.c +976 -496
- data/contrib/zstd/dictBuilder/divsufsort.h +5 -5
- data/contrib/zstd/dictBuilder/zdict.c +194 -229
- data/contrib/zstd/dictBuilder/zdict.h +66 -68
- data/contrib/zstd/legacy/zstd_legacy.h +168 -49
- data/contrib/zstd/legacy/zstd_v01.c +95 -178
- data/contrib/zstd/legacy/zstd_v01.h +12 -32
- data/contrib/zstd/legacy/zstd_v02.c +48 -274
- data/contrib/zstd/legacy/zstd_v02.h +12 -32
- data/contrib/zstd/legacy/zstd_v03.c +48 -274
- data/contrib/zstd/legacy/zstd_v03.h +12 -32
- data/contrib/zstd/legacy/zstd_v04.c +63 -320
- data/contrib/zstd/legacy/zstd_v04.h +13 -33
- data/contrib/zstd/legacy/zstd_v05.c +80 -345
- data/contrib/zstd/legacy/zstd_v05.h +9 -31
- data/contrib/zstd/legacy/zstd_v06.c +48 -458
- data/contrib/zstd/legacy/zstd_v06.h +41 -67
- data/contrib/zstd/legacy/zstd_v07.c +4544 -0
- data/contrib/zstd/legacy/zstd_v07.h +173 -0
- data/contrib/zstd/zstd.h +640 -0
- data/ext/extconf.rb +7 -3
- data/ext/extzstd.c +263 -106
- data/ext/extzstd.h +8 -6
- data/ext/extzstd_nogvls.h +0 -117
- data/ext/extzstd_stream.c +347 -0
- data/ext/zstd_common.c +8 -0
- data/ext/zstd_compress.c +6 -0
- data/ext/zstd_decompress.c +5 -0
- data/ext/zstd_dictbuilder.c +5 -0
- data/ext/zstd_legacy_v07.c +1 -0
- data/gemstub.rb +18 -16
- data/lib/2.1/extzstd.so +0 -0
- data/lib/2.2/extzstd.so +0 -0
- data/lib/2.3/extzstd.so +0 -0
- data/lib/extzstd/version.rb +1 -1
- data/lib/extzstd.rb +77 -43
- data/test/test_basic.rb +11 -6
- metadata +23 -11
- data/contrib/zstd/common/error_public.h +0 -77
- data/contrib/zstd/common/zstd.h +0 -475
- data/ext/extzstd_buffered.c +0 -265
- data/ext/zstd_amalgam.c +0 -18
- data/lib/2.0/extzstd.so +0 -0
@@ -1,40 +1,18 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
* Redistributions of source code must retain the above copyright
|
12
|
-
notice, this list of conditions and the following disclaimer.
|
13
|
-
* Redistributions in binary form must reproduce the above
|
14
|
-
copyright notice, this list of conditions and the following disclaimer
|
15
|
-
in the documentation and/or other materials provided with the
|
16
|
-
distribution.
|
17
|
-
|
18
|
-
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
19
|
-
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
20
|
-
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
21
|
-
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
22
|
-
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
23
|
-
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
24
|
-
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
25
|
-
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
26
|
-
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
27
|
-
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
28
|
-
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
29
|
-
|
30
|
-
You can contact the author at :
|
31
|
-
- Zstd homepage : https://www.zstd.net
|
32
|
-
*/
|
1
|
+
/**
|
2
|
+
* Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
|
3
|
+
* All rights reserved.
|
4
|
+
*
|
5
|
+
* This source code is licensed under the BSD-style license found in the
|
6
|
+
* LICENSE file in the root directory of this source tree. An additional grant
|
7
|
+
* of patent rights can be found in the PATENTS file in the same directory.
|
8
|
+
*/
|
9
|
+
|
33
10
|
|
34
11
|
/*-**************************************
|
35
12
|
* Tuning parameters
|
36
13
|
****************************************/
|
37
14
|
#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
|
15
|
+
#define ZDICT_MIN_SAMPLES_SIZE 512
|
38
16
|
|
39
17
|
|
40
18
|
/*-**************************************
|
@@ -78,14 +56,12 @@
|
|
78
56
|
#define MB *(1 <<20)
|
79
57
|
#define GB *(1U<<30)
|
80
58
|
|
81
|
-
#define
|
59
|
+
#define DICTLISTSIZE_DEFAULT 10000
|
82
60
|
|
83
61
|
#define NOISELENGTH 32
|
84
|
-
#define PRIME1 2654435761U
|
85
|
-
#define PRIME2 2246822519U
|
86
62
|
|
87
63
|
#define MINRATIO 4
|
88
|
-
static const
|
64
|
+
static const int g_compressionLevel_default = 5;
|
89
65
|
static const U32 g_selectivity_default = 9;
|
90
66
|
static const size_t g_provision_entropySize = 200;
|
91
67
|
static const size_t g_min_fast_dictContent = 192;
|
@@ -95,26 +71,18 @@ static const size_t g_min_fast_dictContent = 192;
|
|
95
71
|
* Console display
|
96
72
|
***************************************/
|
97
73
|
#define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
|
98
|
-
#define DISPLAYLEVEL(l, ...) if (
|
99
|
-
static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
|
100
|
-
|
101
|
-
#define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
|
102
|
-
if (ZDICT_clockSpan(g_time) > refreshRate) \
|
103
|
-
{ g_time = clock(); DISPLAY(__VA_ARGS__); \
|
104
|
-
if (g_displayLevel>=4) fflush(stdout); } }
|
105
|
-
static const clock_t refreshRate = CLOCKS_PER_SEC * 3 / 10;
|
106
|
-
static clock_t g_time = 0;
|
74
|
+
#define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
|
107
75
|
|
108
76
|
static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
|
109
77
|
|
110
|
-
static void ZDICT_printHex(
|
78
|
+
static void ZDICT_printHex(const void* ptr, size_t length)
|
111
79
|
{
|
112
80
|
const BYTE* const b = (const BYTE*)ptr;
|
113
81
|
size_t u;
|
114
82
|
for (u=0; u<length; u++) {
|
115
83
|
BYTE c = b[u];
|
116
84
|
if (c<32 || c>126) c = '.'; /* non-printable char */
|
117
|
-
|
85
|
+
DISPLAY("%c", c);
|
118
86
|
}
|
119
87
|
}
|
120
88
|
|
@@ -126,6 +94,13 @@ unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
|
|
126
94
|
|
127
95
|
const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
|
128
96
|
|
97
|
+
unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
|
98
|
+
{
|
99
|
+
if (dictSize < 8) return 0;
|
100
|
+
if (MEM_readLE32(dictBuffer) != ZSTD_DICT_MAGIC) return 0;
|
101
|
+
return MEM_readLE32((const char*)dictBuffer + 4);
|
102
|
+
}
|
103
|
+
|
129
104
|
|
130
105
|
/*-********************************************************
|
131
106
|
* Dictionary training functions
|
@@ -228,7 +203,7 @@ static void ZDICT_initDictItem(dictItem* d)
|
|
228
203
|
static dictItem ZDICT_analyzePos(
|
229
204
|
BYTE* doneMarks,
|
230
205
|
const int* suffix, U32 start,
|
231
|
-
const void* buffer, U32 minRatio)
|
206
|
+
const void* buffer, U32 minRatio, U32 notificationLevel)
|
232
207
|
{
|
233
208
|
U32 lengthList[LLIMIT] = {0};
|
234
209
|
U32 cumulLength[LLIMIT] = {0};
|
@@ -332,12 +307,13 @@ static dictItem ZDICT_analyzePos(
|
|
332
307
|
} while (length >=MINMATCHLENGTH);
|
333
308
|
|
334
309
|
/* look backward */
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
310
|
+
length = MINMATCHLENGTH;
|
311
|
+
while ((length >= MINMATCHLENGTH) & (start > 0)) {
|
312
|
+
length = ZDICT_count(b + pos, b + suffix[start - 1]);
|
313
|
+
if (length >= LLIMIT) length = LLIMIT - 1;
|
314
|
+
lengthList[length]++;
|
315
|
+
if (length >= MINMATCHLENGTH) start--;
|
316
|
+
}
|
341
317
|
|
342
318
|
/* largest useful length */
|
343
319
|
memset(cumulLength, 0, sizeof(cumulLength));
|
@@ -395,21 +371,22 @@ static dictItem ZDICT_analyzePos(
|
|
395
371
|
static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
396
372
|
{
|
397
373
|
const U32 tableSize = table->pos;
|
398
|
-
const U32
|
374
|
+
const U32 eltEnd = elt.pos + elt.length;
|
399
375
|
|
400
376
|
/* tail overlap */
|
401
377
|
U32 u; for (u=1; u<tableSize; u++) {
|
402
378
|
if (u==eltNbToSkip) continue;
|
403
|
-
if ((table[u].pos > elt.pos) && (table[u].pos
|
379
|
+
if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */
|
404
380
|
/* append */
|
405
381
|
U32 addedLength = table[u].pos - elt.pos;
|
406
382
|
table[u].length += addedLength;
|
407
383
|
table[u].pos = elt.pos;
|
408
384
|
table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
|
409
|
-
table[u].savings += elt.length / 8; /* rough approx */
|
385
|
+
table[u].savings += elt.length / 8; /* rough approx bonus */
|
410
386
|
elt = table[u];
|
387
|
+
/* sort : improve rank */
|
411
388
|
while ((u>1) && (table[u-1].savings < elt.savings))
|
412
|
-
|
389
|
+
table[u] = table[u-1], u--;
|
413
390
|
table[u] = elt;
|
414
391
|
return u;
|
415
392
|
} }
|
@@ -417,14 +394,15 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
|
417
394
|
/* front overlap */
|
418
395
|
for (u=1; u<tableSize; u++) {
|
419
396
|
if (u==eltNbToSkip) continue;
|
420
|
-
if ((table[u].pos + table[u].length
|
397
|
+
if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
|
421
398
|
/* append */
|
422
|
-
int addedLength = (
|
423
|
-
table[u].savings += elt.length / 8; /* rough approx */
|
424
|
-
if (addedLength > 0) { /* otherwise,
|
399
|
+
int addedLength = (int)eltEnd - (table[u].pos + table[u].length);
|
400
|
+
table[u].savings += elt.length / 8; /* rough approx bonus */
|
401
|
+
if (addedLength > 0) { /* otherwise, elt fully included into existing */
|
425
402
|
table[u].length += addedLength;
|
426
403
|
table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
|
427
404
|
}
|
405
|
+
/* sort : improve rank */
|
428
406
|
elt = table[u];
|
429
407
|
while ((u>1) && (table[u-1].savings < elt.savings))
|
430
408
|
table[u] = table[u-1], u--;
|
@@ -489,15 +467,21 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
|
|
489
467
|
static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
490
468
|
const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
|
491
469
|
const size_t* fileSizes, unsigned nbFiles,
|
492
|
-
U32
|
470
|
+
U32 minRatio, U32 notificationLevel)
|
493
471
|
{
|
494
472
|
int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
|
495
473
|
int* const suffix = suffix0+1;
|
496
474
|
U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
|
497
475
|
BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
|
498
476
|
U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
|
499
|
-
U32 minRatio = nbFiles >> shiftRatio;
|
500
477
|
size_t result = 0;
|
478
|
+
clock_t displayClock = 0;
|
479
|
+
clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
|
480
|
+
|
481
|
+
# define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
|
482
|
+
if (ZDICT_clockSpan(displayClock) > refreshRate) \
|
483
|
+
{ displayClock = clock(); DISPLAY(__VA_ARGS__); \
|
484
|
+
if (notificationLevel>=4) fflush(stdout); } }
|
501
485
|
|
502
486
|
/* init */
|
503
487
|
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
@@ -523,7 +507,8 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|
523
507
|
{ size_t pos;
|
524
508
|
for (pos=0; pos < bufferSize; pos++)
|
525
509
|
reverseSuffix[suffix[pos]] = (U32)pos;
|
526
|
-
/*
|
510
|
+
/* note filePos tracks borders between samples.
|
511
|
+
It's not used at this stage, but planned to become useful in a later update */
|
527
512
|
filePos[0] = 0;
|
528
513
|
for (pos=1; pos<nbFiles; pos++)
|
529
514
|
filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
|
@@ -535,23 +520,13 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|
535
520
|
{ U32 cursor; for (cursor=0; cursor < bufferSize; ) {
|
536
521
|
dictItem solution;
|
537
522
|
if (doneMarks[cursor]) { cursor++; continue; }
|
538
|
-
solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
|
523
|
+
solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
|
539
524
|
if (solution.length==0) { cursor++; continue; }
|
540
525
|
ZDICT_insertDictItem(dictList, dictListSize, solution);
|
541
526
|
cursor += solution.length;
|
542
527
|
DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
|
543
528
|
} }
|
544
529
|
|
545
|
-
/* limit dictionary size */
|
546
|
-
{ U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
|
547
|
-
U32 currentSize = 0;
|
548
|
-
U32 n; for (n=1; n<max; n++) {
|
549
|
-
currentSize += dictList[n].length;
|
550
|
-
if (currentSize > maxDictSize) break;
|
551
|
-
}
|
552
|
-
dictList->pos = n;
|
553
|
-
}
|
554
|
-
|
555
530
|
_cleanup:
|
556
531
|
free(suffix0);
|
557
532
|
free(reverseSuffix);
|
@@ -563,10 +538,12 @@ _cleanup:
|
|
563
538
|
|
564
539
|
static void ZDICT_fillNoise(void* buffer, size_t length)
|
565
540
|
{
|
566
|
-
unsigned
|
541
|
+
unsigned const prime1 = 2654435761U;
|
542
|
+
unsigned const prime2 = 2246822519U;
|
543
|
+
unsigned acc = prime1;
|
567
544
|
size_t p=0;;
|
568
545
|
for (p=0; p<length; p++) {
|
569
|
-
acc *=
|
546
|
+
acc *= prime2;
|
570
547
|
((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
|
571
548
|
}
|
572
549
|
}
|
@@ -576,23 +553,23 @@ typedef struct
|
|
576
553
|
{
|
577
554
|
ZSTD_CCtx* ref;
|
578
555
|
ZSTD_CCtx* zc;
|
579
|
-
void* workPlace; /* must be
|
556
|
+
void* workPlace; /* must be ZSTD_BLOCKSIZE_ABSOLUTEMAX allocated */
|
580
557
|
} EStats_ress_t;
|
581
558
|
|
582
559
|
#define MAXREPOFFSET 1024
|
583
560
|
|
584
561
|
static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
585
562
|
U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
|
586
|
-
const void* src, size_t srcSize)
|
563
|
+
const void* src, size_t srcSize, U32 notificationLevel)
|
587
564
|
{
|
588
|
-
size_t const blockSizeMax = MIN (
|
565
|
+
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << params.cParams.windowLog);
|
589
566
|
size_t cSize;
|
590
567
|
|
591
568
|
if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
cSize = ZSTD_compressBlock(esr.zc, esr.workPlace,
|
569
|
+
{ size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
|
570
|
+
if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
|
571
|
+
}
|
572
|
+
cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_ABSOLUTEMAX, src, srcSize);
|
596
573
|
if (ZSTD_isError(cSize)) { DISPLAYLEVEL(1, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
|
597
574
|
|
598
575
|
if (cSize) { /* if == 0; block is not compressible */
|
@@ -605,34 +582,33 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
|
|
605
582
|
}
|
606
583
|
|
607
584
|
/* seqStats */
|
608
|
-
{
|
609
|
-
ZSTD_seqToCodes(seqStorePtr
|
585
|
+
{ U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
|
586
|
+
ZSTD_seqToCodes(seqStorePtr);
|
610
587
|
|
611
|
-
{ const BYTE* codePtr = seqStorePtr->
|
612
|
-
|
588
|
+
{ const BYTE* codePtr = seqStorePtr->ofCode;
|
589
|
+
U32 u;
|
613
590
|
for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
|
614
591
|
}
|
615
592
|
|
616
|
-
{ const BYTE* codePtr = seqStorePtr->
|
617
|
-
|
593
|
+
{ const BYTE* codePtr = seqStorePtr->mlCode;
|
594
|
+
U32 u;
|
618
595
|
for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
|
619
596
|
}
|
620
597
|
|
621
|
-
{ const BYTE* codePtr = seqStorePtr->
|
622
|
-
|
598
|
+
{ const BYTE* codePtr = seqStorePtr->llCode;
|
599
|
+
U32 u;
|
623
600
|
for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
|
624
|
-
|
601
|
+
}
|
625
602
|
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
}
|
603
|
+
if (nbSeq >= 2) { /* rep offsets */
|
604
|
+
const seqDef* const seq = seqStorePtr->sequencesStart;
|
605
|
+
U32 offset1 = seq[0].offset - 3;
|
606
|
+
U32 offset2 = seq[1].offset - 3;
|
607
|
+
if (offset1 >= MAXREPOFFSET) offset1 = 0;
|
608
|
+
if (offset2 >= MAXREPOFFSET) offset2 = 0;
|
609
|
+
repOffsets[offset1] += 3;
|
610
|
+
repOffsets[offset2] += 1;
|
611
|
+
} } }
|
636
612
|
}
|
637
613
|
|
638
614
|
/*
|
@@ -671,60 +647,65 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
|
|
671
647
|
}
|
672
648
|
|
673
649
|
|
674
|
-
#define OFFCODE_MAX
|
650
|
+
#define OFFCODE_MAX 30 /* only applicable to first block */
|
675
651
|
static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
676
|
-
|
677
|
-
|
678
|
-
|
652
|
+
unsigned compressionLevel,
|
653
|
+
const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
|
654
|
+
const void* dictBuffer, size_t dictBufferSize,
|
655
|
+
unsigned notificationLevel)
|
679
656
|
{
|
680
657
|
U32 countLit[256];
|
681
658
|
HUF_CREATE_STATIC_CTABLE(hufTable, 255);
|
682
659
|
U32 offcodeCount[OFFCODE_MAX+1];
|
683
660
|
short offcodeNCount[OFFCODE_MAX+1];
|
661
|
+
U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
|
684
662
|
U32 matchLengthCount[MaxML+1];
|
685
663
|
short matchLengthNCount[MaxML+1];
|
686
664
|
U32 litLengthCount[MaxLL+1];
|
687
665
|
short litLengthNCount[MaxLL+1];
|
688
|
-
U32 repOffset[MAXREPOFFSET]
|
666
|
+
U32 repOffset[MAXREPOFFSET];
|
689
667
|
offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
|
690
668
|
EStats_ress_t esr;
|
691
669
|
ZSTD_parameters params;
|
692
|
-
U32 u, huffLog =
|
670
|
+
U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
|
693
671
|
size_t pos = 0, errorCode;
|
694
672
|
size_t eSize = 0;
|
695
673
|
size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
|
696
|
-
size_t const averageSampleSize = totalSrcSize / nbFiles;
|
674
|
+
size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
|
697
675
|
BYTE* dstPtr = (BYTE*)dstBuffer;
|
698
676
|
|
699
677
|
/* init */
|
678
|
+
esr.ref = ZSTD_createCCtx();
|
679
|
+
esr.zc = ZSTD_createCCtx();
|
680
|
+
esr.workPlace = malloc(ZSTD_BLOCKSIZE_ABSOLUTEMAX);
|
681
|
+
if (!esr.ref || !esr.zc || !esr.workPlace) {
|
682
|
+
eSize = ERROR(memory_allocation);
|
683
|
+
DISPLAYLEVEL(1, "Not enough memory \n");
|
684
|
+
goto _cleanup;
|
685
|
+
}
|
686
|
+
if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionary_wrong); goto _cleanup; } /* too large dictionary */
|
700
687
|
for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
|
701
|
-
for (u=0; u<=
|
688
|
+
for (u=0; u<=offcodeMax; u++) offcodeCount[u]=1;
|
702
689
|
for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
|
703
690
|
for (u=0; u<=MaxLL; u++) litLengthCount[u]=1;
|
691
|
+
memset(repOffset, 0, sizeof(repOffset));
|
704
692
|
repOffset[1] = repOffset[4] = repOffset[8] = 1;
|
705
693
|
memset(bestRepOffset, 0, sizeof(bestRepOffset));
|
706
|
-
esr.ref = ZSTD_createCCtx();
|
707
|
-
esr.zc = ZSTD_createCCtx();
|
708
|
-
esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
|
709
|
-
if (!esr.ref || !esr.zc || !esr.workPlace) {
|
710
|
-
eSize = ERROR(memory_allocation);
|
711
|
-
DISPLAYLEVEL(1, "Not enough memory");
|
712
|
-
goto _cleanup;
|
713
|
-
}
|
714
694
|
if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
|
715
695
|
params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
|
716
|
-
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
696
|
+
{ size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
|
697
|
+
if (ZSTD_isError(beginResult)) {
|
698
|
+
eSize = ERROR(GENERIC);
|
699
|
+
DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n");
|
700
|
+
goto _cleanup;
|
701
|
+
} }
|
722
702
|
|
723
703
|
/* collect stats on all files */
|
724
704
|
for (u=0; u<nbFiles; u++) {
|
725
705
|
ZDICT_countEStats(esr, params,
|
726
|
-
|
727
|
-
|
706
|
+
countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
|
707
|
+
(const char*)srcBuffer + pos, fileSizes[u],
|
708
|
+
notificationLevel);
|
728
709
|
pos += fileSizes[u];
|
729
710
|
}
|
730
711
|
|
@@ -732,7 +713,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
732
713
|
errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
|
733
714
|
if (HUF_isError(errorCode)) {
|
734
715
|
eSize = ERROR(GENERIC);
|
735
|
-
DISPLAYLEVEL(1, "HUF_buildCTable error");
|
716
|
+
DISPLAYLEVEL(1, "HUF_buildCTable error \n");
|
736
717
|
goto _cleanup;
|
737
718
|
}
|
738
719
|
huffLog = (U32)errorCode;
|
@@ -744,11 +725,11 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
744
725
|
}
|
745
726
|
/* note : the result of this phase should be used to better appreciate the impact on statistics */
|
746
727
|
|
747
|
-
total=0; for (u=0; u<=
|
748
|
-
errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total,
|
728
|
+
total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
|
729
|
+
errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
|
749
730
|
if (FSE_isError(errorCode)) {
|
750
731
|
eSize = ERROR(GENERIC);
|
751
|
-
DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount");
|
732
|
+
DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
|
752
733
|
goto _cleanup;
|
753
734
|
}
|
754
735
|
Offlog = (U32)errorCode;
|
@@ -757,7 +738,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
757
738
|
errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
|
758
739
|
if (FSE_isError(errorCode)) {
|
759
740
|
eSize = ERROR(GENERIC);
|
760
|
-
DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount");
|
741
|
+
DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
|
761
742
|
goto _cleanup;
|
762
743
|
}
|
763
744
|
mlLog = (U32)errorCode;
|
@@ -766,17 +747,16 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
766
747
|
errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
|
767
748
|
if (FSE_isError(errorCode)) {
|
768
749
|
eSize = ERROR(GENERIC);
|
769
|
-
DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount");
|
750
|
+
DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
|
770
751
|
goto _cleanup;
|
771
752
|
}
|
772
753
|
llLog = (U32)errorCode;
|
773
754
|
|
774
|
-
|
775
755
|
/* write result to buffer */
|
776
756
|
{ size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
|
777
757
|
if (HUF_isError(hhSize)) {
|
778
758
|
eSize = ERROR(GENERIC);
|
779
|
-
DISPLAYLEVEL(1, "HUF_writeCTable error");
|
759
|
+
DISPLAYLEVEL(1, "HUF_writeCTable error \n");
|
780
760
|
goto _cleanup;
|
781
761
|
}
|
782
762
|
dstPtr += hhSize;
|
@@ -787,7 +767,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
787
767
|
{ size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
|
788
768
|
if (FSE_isError(ohSize)) {
|
789
769
|
eSize = ERROR(GENERIC);
|
790
|
-
DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount");
|
770
|
+
DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
|
791
771
|
goto _cleanup;
|
792
772
|
}
|
793
773
|
dstPtr += ohSize;
|
@@ -798,7 +778,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
798
778
|
{ size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
|
799
779
|
if (FSE_isError(mhSize)) {
|
800
780
|
eSize = ERROR(GENERIC);
|
801
|
-
DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount");
|
781
|
+
DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
|
802
782
|
goto _cleanup;
|
803
783
|
}
|
804
784
|
dstPtr += mhSize;
|
@@ -809,7 +789,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
809
789
|
{ size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
|
810
790
|
if (FSE_isError(lhSize)) {
|
811
791
|
eSize = ERROR(GENERIC);
|
812
|
-
DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount");
|
792
|
+
DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
|
813
793
|
goto _cleanup;
|
814
794
|
}
|
815
795
|
dstPtr += lhSize;
|
@@ -819,7 +799,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
819
799
|
|
820
800
|
if (maxDstSize<12) {
|
821
801
|
eSize = ERROR(GENERIC);
|
822
|
-
DISPLAYLEVEL(1, "not enough space to write RepOffsets");
|
802
|
+
DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
|
823
803
|
goto _cleanup;
|
824
804
|
}
|
825
805
|
# if 0
|
@@ -833,7 +813,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|
833
813
|
MEM_writeLE32(dstPtr+4, repStartValue[1]);
|
834
814
|
MEM_writeLE32(dstPtr+8, repStartValue[2]);
|
835
815
|
#endif
|
836
|
-
dstPtr += 12;
|
816
|
+
//dstPtr += 12;
|
837
817
|
eSize += 12;
|
838
818
|
|
839
819
|
_cleanup:
|
@@ -845,51 +825,13 @@ _cleanup:
|
|
845
825
|
}
|
846
826
|
|
847
827
|
|
848
|
-
#define DIB_FASTSEGMENTSIZE 64
|
849
|
-
/*! ZDICT_fastSampling() (based on an idea proposed by Giuseppe Ottaviano) :
|
850
|
-
Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer`,
|
851
|
-
up to `dictSize`.
|
852
|
-
Filling starts from the end of `dictBuffer`, down to maximum possible.
|
853
|
-
if `dictSize` is not a multiply of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used.
|
854
|
-
@return : amount of data written into `dictBuffer`,
|
855
|
-
or an error code
|
856
|
-
*/
|
857
|
-
static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize,
|
858
|
-
const void* samplesBuffer, size_t samplesSize)
|
859
|
-
{
|
860
|
-
char* dstPtr = (char*)dictBuffer + dictSize;
|
861
|
-
const char* srcPtr = (const char*)samplesBuffer;
|
862
|
-
size_t const nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
|
863
|
-
size_t segNb, interSize;
|
864
|
-
|
865
|
-
if (nbSegments <= 2) return ERROR(srcSize_wrong);
|
866
|
-
if (samplesSize < dictSize) return ERROR(srcSize_wrong);
|
867
|
-
|
868
|
-
/* first and last segments are part of dictionary, in case they contain interesting header/footer */
|
869
|
-
dstPtr -= DIB_FASTSEGMENTSIZE;
|
870
|
-
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
|
871
|
-
dstPtr -= DIB_FASTSEGMENTSIZE;
|
872
|
-
memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
|
873
|
-
|
874
|
-
/* regularly copy a segment */
|
875
|
-
interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);
|
876
|
-
srcPtr += DIB_FASTSEGMENTSIZE;
|
877
|
-
for (segNb=2; segNb < nbSegments; segNb++) {
|
878
|
-
srcPtr += interSize;
|
879
|
-
dstPtr -= DIB_FASTSEGMENTSIZE;
|
880
|
-
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
|
881
|
-
srcPtr += DIB_FASTSEGMENTSIZE;
|
882
|
-
}
|
883
|
-
|
884
|
-
return nbSegments * DIB_FASTSEGMENTSIZE;
|
885
|
-
}
|
886
|
-
|
887
828
|
size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
|
888
829
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
889
830
|
ZDICT_params_t params)
|
890
831
|
{
|
891
832
|
size_t hSize;
|
892
|
-
|
833
|
+
int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
|
834
|
+
U32 const notificationLevel = params.notificationLevel;
|
893
835
|
|
894
836
|
/* dictionary header */
|
895
837
|
MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
|
@@ -903,10 +845,15 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
|
|
903
845
|
/* entropy tables */
|
904
846
|
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
905
847
|
DISPLAYLEVEL(2, "statistics ... \n");
|
906
|
-
|
848
|
+
{ size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
|
907
849
|
compressionLevel,
|
908
850
|
samplesBuffer, samplesSizes, nbSamples,
|
909
|
-
(char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize
|
851
|
+
(char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,
|
852
|
+
notificationLevel);
|
853
|
+
if (ZDICT_isError(eSize)) return eSize;
|
854
|
+
hSize += eSize;
|
855
|
+
}
|
856
|
+
|
910
857
|
|
911
858
|
if (hSize + dictContentSize < dictBufferCapacity)
|
912
859
|
memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
|
@@ -914,60 +861,86 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
|
|
914
861
|
}
|
915
862
|
|
916
863
|
|
917
|
-
#define DIB_MINSAMPLESSIZE (DIB_FASTSEGMENTSIZE*3)
|
918
864
|
/*! ZDICT_trainFromBuffer_unsafe() :
|
919
|
-
* `samplesBuffer` must be followed by noisy guard band.
|
920
|
-
* @return : size of dictionary
|
865
|
+
* Warning : `samplesBuffer` must be followed by noisy guard band.
|
866
|
+
* @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
|
921
867
|
*/
|
922
868
|
size_t ZDICT_trainFromBuffer_unsafe(
|
923
869
|
void* dictBuffer, size_t maxDictSize,
|
924
870
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
925
871
|
ZDICT_params_t params)
|
926
872
|
{
|
927
|
-
U32 const dictListSize = MAX(
|
873
|
+
U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
|
928
874
|
dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
|
929
|
-
unsigned selectivity = params.selectivityLevel;
|
875
|
+
unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
|
876
|
+
unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
|
930
877
|
size_t const targetDictSize = maxDictSize;
|
931
|
-
size_t
|
878
|
+
size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
|
932
879
|
size_t dictSize = 0;
|
880
|
+
U32 const notificationLevel = params.notificationLevel;
|
933
881
|
|
934
882
|
/* checks */
|
935
883
|
if (!dictList) return ERROR(memory_allocation);
|
936
884
|
if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
|
885
|
+
if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
|
937
886
|
|
938
887
|
/* init */
|
939
|
-
{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
|
940
|
-
if (sBuffSize < DIB_MINSAMPLESSIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
|
941
888
|
ZDICT_initDictItem(dictList);
|
942
|
-
g_displayLevel = params.notificationLevel;
|
943
|
-
if (selectivity==0) selectivity = g_selectivity_default;
|
944
889
|
|
945
890
|
/* build dictionary */
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
959
|
-
|
960
|
-
|
961
|
-
|
962
|
-
|
963
|
-
|
964
|
-
|
965
|
-
|
966
|
-
|
967
|
-
|
891
|
+
ZDICT_trainBuffer(dictList, dictListSize,
|
892
|
+
samplesBuffer, samplesBuffSize,
|
893
|
+
samplesSizes, nbSamples,
|
894
|
+
minRep, notificationLevel);
|
895
|
+
|
896
|
+
/* display best matches */
|
897
|
+
if (params.notificationLevel>= 3) {
|
898
|
+
U32 const nb = MIN(25, dictList[0].pos);
|
899
|
+
U32 const dictContentSize = ZDICT_dictSize(dictList);
|
900
|
+
U32 u;
|
901
|
+
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
|
902
|
+
DISPLAYLEVEL(3, "list %u best segments \n", nb);
|
903
|
+
for (u=1; u<=nb; u++) {
|
904
|
+
U32 pos = dictList[u].pos;
|
905
|
+
U32 length = dictList[u].length;
|
906
|
+
U32 printedLength = MIN(40, length);
|
907
|
+
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
|
908
|
+
u, length, pos, dictList[u].savings);
|
909
|
+
ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
|
910
|
+
DISPLAYLEVEL(3, "| \n");
|
911
|
+
} }
|
912
|
+
|
968
913
|
|
969
914
|
/* create dictionary */
|
970
915
|
{ U32 dictContentSize = ZDICT_dictSize(dictList);
|
916
|
+
if (dictContentSize < targetDictSize/3) {
|
917
|
+
DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
|
918
|
+
if (minRep > MINRATIO) {
|
919
|
+
DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
|
920
|
+
DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
|
921
|
+
}
|
922
|
+
if (samplesBuffSize < 10 * targetDictSize)
|
923
|
+
DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
|
924
|
+
}
|
925
|
+
|
926
|
+
if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
|
927
|
+
U32 proposedSelectivity = selectivity-1;
|
928
|
+
while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
|
929
|
+
DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
|
930
|
+
DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
|
931
|
+
DISPLAYLEVEL(2, "! always test dictionary efficiency on samples \n");
|
932
|
+
}
|
933
|
+
|
934
|
+
/* limit dictionary size */
|
935
|
+
{ U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
|
936
|
+
U32 currentSize = 0;
|
937
|
+
U32 n; for (n=1; n<max; n++) {
|
938
|
+
currentSize += dictList[n].length;
|
939
|
+
if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }
|
940
|
+
}
|
941
|
+
dictList->pos = n;
|
942
|
+
dictContentSize = currentSize;
|
943
|
+
}
|
971
944
|
|
972
945
|
/* build dict content */
|
973
946
|
{ U32 u;
|
@@ -979,14 +952,6 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
|
979
952
|
memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
|
980
953
|
} }
|
981
954
|
|
982
|
-
/* fast mode dict content */
|
983
|
-
if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */
|
984
|
-
DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */
|
985
|
-
DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10));
|
986
|
-
dictContentSize = (U32)ZDICT_fastSampling(dictBuffer, targetDictSize,
|
987
|
-
samplesBuffer, sBuffSize);
|
988
|
-
}
|
989
|
-
|
990
955
|
dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
|
991
956
|
samplesBuffer, samplesSizes, nbSamples,
|
992
957
|
params);
|
@@ -1004,23 +969,23 @@ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacit
|
|
1004
969
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
1005
970
|
ZDICT_params_t params)
|
1006
971
|
{
|
972
|
+
size_t result;
|
1007
973
|
void* newBuff;
|
1008
|
-
size_t sBuffSize;
|
974
|
+
size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
|
975
|
+
if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */
|
1009
976
|
|
1010
|
-
{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
|
1011
|
-
if (sBuffSize==0) return 0; /* empty content => no dictionary */
|
1012
977
|
newBuff = malloc(sBuffSize + NOISELENGTH);
|
1013
978
|
if (!newBuff) return ERROR(memory_allocation);
|
1014
979
|
|
1015
980
|
memcpy(newBuff, samplesBuffer, sBuffSize);
|
1016
981
|
ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
1017
982
|
|
1018
|
-
|
983
|
+
result = ZDICT_trainFromBuffer_unsafe(
|
1019
984
|
dictBuffer, dictBufferCapacity,
|
1020
985
|
newBuff, samplesSizes, nbSamples,
|
1021
986
|
params);
|
1022
|
-
|
1023
|
-
|
987
|
+
free(newBuff);
|
988
|
+
return result;
|
1024
989
|
}
|
1025
990
|
|
1026
991
|
|