extzstd 0.0.3.CONCEPT → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. checksums.yaml +5 -5
  2. data/HISTORY.ja.md +39 -0
  3. data/LICENSE +6 -6
  4. data/README.md +26 -45
  5. data/contrib/zstd/CHANGELOG +555 -0
  6. data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
  7. data/contrib/zstd/CONTRIBUTING.md +392 -0
  8. data/contrib/zstd/COPYING +339 -0
  9. data/contrib/zstd/LICENSE +13 -9
  10. data/contrib/zstd/Makefile +414 -0
  11. data/contrib/zstd/README.md +170 -45
  12. data/contrib/zstd/TESTING.md +44 -0
  13. data/contrib/zstd/appveyor.yml +289 -0
  14. data/contrib/zstd/lib/BUCK +234 -0
  15. data/contrib/zstd/lib/Makefile +354 -0
  16. data/contrib/zstd/lib/README.md +179 -0
  17. data/contrib/zstd/{common → lib/common}/bitstream.h +170 -130
  18. data/contrib/zstd/lib/common/compiler.h +175 -0
  19. data/contrib/zstd/lib/common/cpu.h +215 -0
  20. data/contrib/zstd/lib/common/debug.c +24 -0
  21. data/contrib/zstd/lib/common/debug.h +114 -0
  22. data/contrib/zstd/{common → lib/common}/entropy_common.c +79 -94
  23. data/contrib/zstd/lib/common/error_private.c +55 -0
  24. data/contrib/zstd/lib/common/error_private.h +80 -0
  25. data/contrib/zstd/{common → lib/common}/fse.h +153 -93
  26. data/contrib/zstd/{common → lib/common}/fse_decompress.c +37 -82
  27. data/contrib/zstd/lib/common/huf.h +340 -0
  28. data/contrib/zstd/{common → lib/common}/mem.h +154 -78
  29. data/contrib/zstd/lib/common/pool.c +344 -0
  30. data/contrib/zstd/lib/common/pool.h +84 -0
  31. data/contrib/zstd/lib/common/threading.c +121 -0
  32. data/contrib/zstd/lib/common/threading.h +155 -0
  33. data/contrib/zstd/{common → lib/common}/xxhash.c +85 -75
  34. data/contrib/zstd/{common → lib/common}/xxhash.h +85 -73
  35. data/contrib/zstd/lib/common/zstd_common.c +83 -0
  36. data/contrib/zstd/lib/common/zstd_errors.h +94 -0
  37. data/contrib/zstd/lib/common/zstd_internal.h +447 -0
  38. data/contrib/zstd/{compress → lib/compress}/fse_compress.c +194 -303
  39. data/contrib/zstd/lib/compress/hist.c +183 -0
  40. data/contrib/zstd/lib/compress/hist.h +75 -0
  41. data/contrib/zstd/lib/compress/huf_compress.c +798 -0
  42. data/contrib/zstd/lib/compress/zstd_compress.c +4278 -0
  43. data/contrib/zstd/lib/compress/zstd_compress_internal.h +1125 -0
  44. data/contrib/zstd/lib/compress/zstd_compress_literals.c +158 -0
  45. data/contrib/zstd/lib/compress/zstd_compress_literals.h +29 -0
  46. data/contrib/zstd/lib/compress/zstd_compress_sequences.c +419 -0
  47. data/contrib/zstd/lib/compress/zstd_compress_sequences.h +54 -0
  48. data/contrib/zstd/lib/compress/zstd_compress_superblock.c +845 -0
  49. data/contrib/zstd/lib/compress/zstd_compress_superblock.h +32 -0
  50. data/contrib/zstd/lib/compress/zstd_cwksp.h +525 -0
  51. data/contrib/zstd/lib/compress/zstd_double_fast.c +521 -0
  52. data/contrib/zstd/lib/compress/zstd_double_fast.h +38 -0
  53. data/contrib/zstd/lib/compress/zstd_fast.c +496 -0
  54. data/contrib/zstd/lib/compress/zstd_fast.h +37 -0
  55. data/contrib/zstd/lib/compress/zstd_lazy.c +1138 -0
  56. data/contrib/zstd/lib/compress/zstd_lazy.h +67 -0
  57. data/contrib/zstd/lib/compress/zstd_ldm.c +619 -0
  58. data/contrib/zstd/lib/compress/zstd_ldm.h +110 -0
  59. data/contrib/zstd/lib/compress/zstd_opt.c +1200 -0
  60. data/contrib/zstd/lib/compress/zstd_opt.h +56 -0
  61. data/contrib/zstd/lib/compress/zstdmt_compress.c +2143 -0
  62. data/contrib/zstd/lib/compress/zstdmt_compress.h +192 -0
  63. data/contrib/zstd/lib/decompress/huf_decompress.c +1248 -0
  64. data/contrib/zstd/lib/decompress/zstd_ddict.c +244 -0
  65. data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
  66. data/contrib/zstd/lib/decompress/zstd_decompress.c +1885 -0
  67. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1432 -0
  68. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +59 -0
  69. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +189 -0
  70. data/contrib/zstd/{common → lib/deprecated}/zbuff.h +86 -69
  71. data/contrib/zstd/lib/deprecated/zbuff_common.c +26 -0
  72. data/contrib/zstd/lib/deprecated/zbuff_compress.c +147 -0
  73. data/contrib/zstd/lib/deprecated/zbuff_decompress.c +75 -0
  74. data/contrib/zstd/lib/dictBuilder/cover.c +1236 -0
  75. data/contrib/zstd/lib/dictBuilder/cover.h +157 -0
  76. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.c +3 -3
  77. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.h +5 -5
  78. data/contrib/zstd/lib/dictBuilder/fastcover.c +757 -0
  79. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/zdict.c +437 -347
  80. data/contrib/zstd/lib/dictBuilder/zdict.h +305 -0
  81. data/contrib/zstd/lib/legacy/zstd_legacy.h +415 -0
  82. data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.c +272 -292
  83. data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.h +26 -32
  84. data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.c +162 -392
  85. data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.h +26 -32
  86. data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.c +162 -391
  87. data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.h +27 -33
  88. data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.c +195 -604
  89. data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.h +26 -32
  90. data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.c +300 -575
  91. data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.h +22 -31
  92. data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.c +165 -592
  93. data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.h +54 -67
  94. data/contrib/zstd/lib/legacy/zstd_v07.c +4541 -0
  95. data/contrib/zstd/lib/legacy/zstd_v07.h +187 -0
  96. data/contrib/zstd/lib/libzstd.pc.in +15 -0
  97. data/contrib/zstd/lib/zstd.h +2090 -0
  98. data/ext/depend +2 -0
  99. data/ext/extconf.rb +18 -5
  100. data/ext/extzstd.c +296 -214
  101. data/ext/extzstd.h +81 -36
  102. data/ext/extzstd_nogvls.h +0 -117
  103. data/ext/extzstd_stream.c +622 -0
  104. data/ext/libzstd_conf.h +8 -0
  105. data/ext/zstd_common.c +11 -0
  106. data/ext/zstd_compress.c +15 -0
  107. data/ext/zstd_decompress.c +6 -0
  108. data/ext/zstd_dictbuilder.c +10 -0
  109. data/ext/zstd_dictbuilder_fastcover.c +3 -0
  110. data/ext/zstd_legacy_v01.c +3 -1
  111. data/ext/zstd_legacy_v02.c +3 -1
  112. data/ext/zstd_legacy_v03.c +3 -1
  113. data/ext/zstd_legacy_v04.c +3 -1
  114. data/ext/zstd_legacy_v05.c +3 -1
  115. data/ext/zstd_legacy_v06.c +3 -1
  116. data/ext/zstd_legacy_v07.c +3 -0
  117. data/gemstub.rb +27 -21
  118. data/lib/extzstd.rb +82 -161
  119. data/lib/extzstd/version.rb +1 -1
  120. data/test/test_basic.rb +19 -6
  121. metadata +127 -59
  122. data/contrib/zstd/common/error_private.h +0 -125
  123. data/contrib/zstd/common/error_public.h +0 -77
  124. data/contrib/zstd/common/huf.h +0 -228
  125. data/contrib/zstd/common/zstd.h +0 -475
  126. data/contrib/zstd/common/zstd_common.c +0 -91
  127. data/contrib/zstd/common/zstd_internal.h +0 -238
  128. data/contrib/zstd/compress/huf_compress.c +0 -577
  129. data/contrib/zstd/compress/zbuff_compress.c +0 -327
  130. data/contrib/zstd/compress/zstd_compress.c +0 -3074
  131. data/contrib/zstd/compress/zstd_opt.h +0 -1046
  132. data/contrib/zstd/decompress/huf_decompress.c +0 -894
  133. data/contrib/zstd/decompress/zbuff_decompress.c +0 -294
  134. data/contrib/zstd/decompress/zstd_decompress.c +0 -1362
  135. data/contrib/zstd/dictBuilder/zdict.h +0 -113
  136. data/contrib/zstd/legacy/zstd_legacy.h +0 -140
  137. data/ext/extzstd_buffered.c +0 -265
  138. data/ext/zstd_amalgam.c +0 -18
@@ -1,40 +1,20 @@
1
1
  /*
2
- dictBuilder - dictionary builder for zstd
3
- Copyright (C) Yann Collet 2016
4
-
5
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
6
-
7
- Redistribution and use in source and binary forms, with or without
8
- modification, are permitted provided that the following conditions are
9
- met:
10
-
11
- * Redistributions of source code must retain the above copyright
12
- notice, this list of conditions and the following disclaimer.
13
- * Redistributions in binary form must reproduce the above
14
- copyright notice, this list of conditions and the following disclaimer
15
- in the documentation and/or other materials provided with the
16
- distribution.
17
-
18
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
-
30
- You can contact the author at :
31
- - Zstd homepage : https://www.zstd.net
32
- */
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
33
11
 
34
12
  /*-**************************************
35
13
  * Tuning parameters
36
14
  ****************************************/
15
+ #define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */
37
16
  #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
17
+ #define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
38
18
 
39
19
 
40
20
  /*-**************************************
@@ -57,18 +37,18 @@
57
37
  #include <stdio.h> /* fprintf, fopen, ftello64 */
58
38
  #include <time.h> /* clock */
59
39
 
60
- #include "mem.h" /* read */
61
- #include "error_private.h"
62
- #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
40
+ #include "../common/mem.h" /* read */
41
+ #include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */
63
42
  #define HUF_STATIC_LINKING_ONLY
64
- #include "huf.h"
65
- #include "zstd_internal.h" /* includes zstd.h */
66
- #include "xxhash.h"
43
+ #include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */
44
+ #include "../common/zstd_internal.h" /* includes zstd.h */
45
+ #include "../common/xxhash.h" /* XXH64 */
67
46
  #include "divsufsort.h"
68
47
  #ifndef ZDICT_STATIC_LINKING_ONLY
69
48
  # define ZDICT_STATIC_LINKING_ONLY
70
49
  #endif
71
50
  #include "zdict.h"
51
+ #include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
72
52
 
73
53
 
74
54
  /*-*************************************
@@ -78,43 +58,30 @@
78
58
  #define MB *(1 <<20)
79
59
  #define GB *(1U<<30)
80
60
 
81
- #define DICTLISTSIZE 10000
61
+ #define DICTLISTSIZE_DEFAULT 10000
82
62
 
83
63
  #define NOISELENGTH 32
84
- #define PRIME1 2654435761U
85
- #define PRIME2 2246822519U
86
64
 
87
- #define MINRATIO 4
88
- static const U32 g_compressionLevel_default = 5;
65
+ static const int g_compressionLevel_default = 3;
89
66
  static const U32 g_selectivity_default = 9;
90
- static const size_t g_provision_entropySize = 200;
91
- static const size_t g_min_fast_dictContent = 192;
92
67
 
93
68
 
94
69
  /*-*************************************
95
70
  * Console display
96
71
  ***************************************/
97
72
  #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
98
- #define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
99
- static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
100
-
101
- #define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
102
- if (ZDICT_clockSpan(g_time) > refreshRate) \
103
- { g_time = clock(); DISPLAY(__VA_ARGS__); \
104
- if (g_displayLevel>=4) fflush(stdout); } }
105
- static const clock_t refreshRate = CLOCKS_PER_SEC * 3 / 10;
106
- static clock_t g_time = 0;
73
+ #define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
107
74
 
108
75
  static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
109
76
 
110
- static void ZDICT_printHex(U32 dlevel, const void* ptr, size_t length)
77
+ static void ZDICT_printHex(const void* ptr, size_t length)
111
78
  {
112
79
  const BYTE* const b = (const BYTE*)ptr;
113
80
  size_t u;
114
81
  for (u=0; u<length; u++) {
115
82
  BYTE c = b[u];
116
83
  if (c<32 || c>126) c = '.'; /* non-printable char */
117
- DISPLAYLEVEL(dlevel, "%c", c);
84
+ DISPLAY("%c", c);
118
85
  }
119
86
  }
120
87
 
@@ -126,11 +93,41 @@ unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
126
93
 
127
94
  const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
128
95
 
96
+ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
97
+ {
98
+ if (dictSize < 8) return 0;
99
+ if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
100
+ return MEM_readLE32((const char*)dictBuffer + 4);
101
+ }
102
+
103
+ size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
104
+ {
105
+ size_t headerSize;
106
+ if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
107
+
108
+ { unsigned offcodeMaxValue = MaxOff;
109
+ ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
110
+ U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
111
+ short* offcodeNCount = (short*)malloc((MaxOff+1)*sizeof(short));
112
+ if (!bs || !wksp || !offcodeNCount) {
113
+ headerSize = ERROR(memory_allocation);
114
+ } else {
115
+ ZSTD_reset_compressedBlockState(bs);
116
+ headerSize = ZSTD_loadCEntropy(bs, wksp, offcodeNCount, &offcodeMaxValue, dictBuffer, dictSize);
117
+ }
118
+
119
+ free(bs);
120
+ free(wksp);
121
+ free(offcodeNCount);
122
+ }
123
+
124
+ return headerSize;
125
+ }
129
126
 
130
127
  /*-********************************************************
131
128
  * Dictionary training functions
132
129
  **********************************************************/
133
- static unsigned ZDICT_NbCommonBytes (register size_t val)
130
+ static unsigned ZDICT_NbCommonBytes (size_t val)
134
131
  {
135
132
  if (MEM_isLittleEndian()) {
136
133
  if (MEM_64bits()) {
@@ -228,13 +225,12 @@ static void ZDICT_initDictItem(dictItem* d)
228
225
  static dictItem ZDICT_analyzePos(
229
226
  BYTE* doneMarks,
230
227
  const int* suffix, U32 start,
231
- const void* buffer, U32 minRatio)
228
+ const void* buffer, U32 minRatio, U32 notificationLevel)
232
229
  {
233
230
  U32 lengthList[LLIMIT] = {0};
234
231
  U32 cumulLength[LLIMIT] = {0};
235
232
  U32 savings[LLIMIT] = {0};
236
233
  const BYTE* b = (const BYTE*)buffer;
237
- size_t length;
238
234
  size_t maxLength = LLIMIT;
239
235
  size_t pos = suffix[start];
240
236
  U32 end = start;
@@ -249,26 +245,30 @@ static dictItem ZDICT_analyzePos(
249
245
  ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
250
246
  ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
251
247
  /* skip and mark segment */
252
- U16 u16 = MEM_read16(b+pos+4);
253
- U32 u, e = 6;
254
- while (MEM_read16(b+pos+e) == u16) e+=2 ;
255
- if (b[pos+e] == b[pos+e-1]) e++;
256
- for (u=1; u<e; u++)
248
+ U16 const pattern16 = MEM_read16(b+pos+4);
249
+ U32 u, patternEnd = 6;
250
+ while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
251
+ if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
252
+ for (u=1; u<patternEnd; u++)
257
253
  doneMarks[pos+u] = 1;
258
254
  return solution;
259
255
  }
260
256
 
261
257
  /* look forward */
262
- do {
263
- end++;
264
- length = ZDICT_count(b + pos, b + suffix[end]);
265
- } while (length >=MINMATCHLENGTH);
258
+ { size_t length;
259
+ do {
260
+ end++;
261
+ length = ZDICT_count(b + pos, b + suffix[end]);
262
+ } while (length >= MINMATCHLENGTH);
263
+ }
266
264
 
267
265
  /* look backward */
268
- do {
269
- length = ZDICT_count(b + pos, b + *(suffix+start-1));
270
- if (length >=MINMATCHLENGTH) start--;
271
- } while(length >= MINMATCHLENGTH);
266
+ { size_t length;
267
+ do {
268
+ length = ZDICT_count(b + pos, b + *(suffix+start-1));
269
+ if (length >=MINMATCHLENGTH) start--;
270
+ } while(length >= MINMATCHLENGTH);
271
+ }
272
272
 
273
273
  /* exit if not found a minimum nb of repetitions */
274
274
  if (end-start < minRatio) {
@@ -279,15 +279,15 @@ static dictItem ZDICT_analyzePos(
279
279
  }
280
280
 
281
281
  { int i;
282
- U32 searchLength;
282
+ U32 mml;
283
283
  U32 refinedStart = start;
284
284
  U32 refinedEnd = end;
285
285
 
286
286
  DISPLAYLEVEL(4, "\n");
287
- DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
287
+ DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos);
288
288
  DISPLAYLEVEL(4, "\n");
289
289
 
290
- for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
290
+ for (mml = MINMATCHLENGTH ; ; mml++) {
291
291
  BYTE currentChar = 0;
292
292
  U32 currentCount = 0;
293
293
  U32 currentID = refinedStart;
@@ -295,13 +295,13 @@ static dictItem ZDICT_analyzePos(
295
295
  U32 selectedCount = 0;
296
296
  U32 selectedID = currentID;
297
297
  for (id =refinedStart; id < refinedEnd; id++) {
298
- if (b[ suffix[id] + searchLength] != currentChar) {
298
+ if (b[suffix[id] + mml] != currentChar) {
299
299
  if (currentCount > selectedCount) {
300
300
  selectedCount = currentCount;
301
301
  selectedID = currentID;
302
302
  }
303
303
  currentID = id;
304
- currentChar = b[ suffix[id] + searchLength];
304
+ currentChar = b[ suffix[id] + mml];
305
305
  currentCount = 0;
306
306
  }
307
307
  currentCount ++;
@@ -317,27 +317,31 @@ static dictItem ZDICT_analyzePos(
317
317
  refinedEnd = refinedStart + selectedCount;
318
318
  }
319
319
 
320
- /* evaluate gain based on new ref */
320
+ /* evaluate gain based on new dict */
321
321
  start = refinedStart;
322
322
  pos = suffix[refinedStart];
323
323
  end = start;
324
324
  memset(lengthList, 0, sizeof(lengthList));
325
325
 
326
326
  /* look forward */
327
- do {
328
- end++;
329
- length = ZDICT_count(b + pos, b + suffix[end]);
330
- if (length >= LLIMIT) length = LLIMIT-1;
331
- lengthList[length]++;
332
- } while (length >=MINMATCHLENGTH);
327
+ { size_t length;
328
+ do {
329
+ end++;
330
+ length = ZDICT_count(b + pos, b + suffix[end]);
331
+ if (length >= LLIMIT) length = LLIMIT-1;
332
+ lengthList[length]++;
333
+ } while (length >=MINMATCHLENGTH);
334
+ }
333
335
 
334
336
  /* look backward */
335
- do {
336
- length = ZDICT_count(b + pos, b + suffix[start-1]);
337
- if (length >= LLIMIT) length = LLIMIT-1;
338
- lengthList[length]++;
339
- if (length >=MINMATCHLENGTH) start--;
340
- } while(length >= MINMATCHLENGTH);
337
+ { size_t length = MINMATCHLENGTH;
338
+ while ((length >= MINMATCHLENGTH) & (start > 0)) {
339
+ length = ZDICT_count(b + pos, b + suffix[start - 1]);
340
+ if (length >= LLIMIT) length = LLIMIT - 1;
341
+ lengthList[length]++;
342
+ if (length >= MINMATCHLENGTH) start--;
343
+ }
344
+ }
341
345
 
342
346
  /* largest useful length */
343
347
  memset(cumulLength, 0, sizeof(cumulLength));
@@ -361,8 +365,8 @@ static dictItem ZDICT_analyzePos(
361
365
  for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
362
366
  savings[i] = savings[i-1] + (lengthList[i] * (i-3));
363
367
 
364
- DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
365
- (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
368
+ DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
369
+ (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / maxLength);
366
370
 
367
371
  solution.pos = (U32)pos;
368
372
  solution.length = (U32)maxLength;
@@ -371,12 +375,12 @@ static dictItem ZDICT_analyzePos(
371
375
  /* mark positions done */
372
376
  { U32 id;
373
377
  for (id=start; id<end; id++) {
374
- U32 p, pEnd;
378
+ U32 p, pEnd, length;
375
379
  U32 const testedPos = suffix[id];
376
380
  if (testedPos == pos)
377
381
  length = solution.length;
378
382
  else {
379
- length = ZDICT_count(b+pos, b+testedPos);
383
+ length = (U32)ZDICT_count(b+pos, b+testedPos);
380
384
  if (length > solution.length) length = solution.length;
381
385
  }
382
386
  pEnd = (U32)(testedPos + length);
@@ -388,28 +392,43 @@ static dictItem ZDICT_analyzePos(
388
392
  }
389
393
 
390
394
 
391
- /*! ZDICT_checkMerge
395
+ static int isIncluded(const void* in, const void* container, size_t length)
396
+ {
397
+ const char* const ip = (const char*) in;
398
+ const char* const into = (const char*) container;
399
+ size_t u;
400
+
401
+ for (u=0; u<length; u++) { /* works because end of buffer is a noisy guard band */
402
+ if (ip[u] != into[u]) break;
403
+ }
404
+
405
+ return u==length;
406
+ }
407
+
408
+ /*! ZDICT_tryMerge() :
392
409
  check if dictItem can be merged, do it if possible
393
410
  @return : id of destination elt, 0 if not merged
394
411
  */
395
- static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
412
+ static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
396
413
  {
397
414
  const U32 tableSize = table->pos;
398
- const U32 max = elt.pos + (elt.length-1);
415
+ const U32 eltEnd = elt.pos + elt.length;
416
+ const char* const buf = (const char*) buffer;
399
417
 
400
418
  /* tail overlap */
401
419
  U32 u; for (u=1; u<tableSize; u++) {
402
420
  if (u==eltNbToSkip) continue;
403
- if ((table[u].pos > elt.pos) && (table[u].pos < max)) { /* overlap */
421
+ if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */
404
422
  /* append */
405
- U32 addedLength = table[u].pos - elt.pos;
423
+ U32 const addedLength = table[u].pos - elt.pos;
406
424
  table[u].length += addedLength;
407
425
  table[u].pos = elt.pos;
408
426
  table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
409
- table[u].savings += elt.length / 8; /* rough approx */
427
+ table[u].savings += elt.length / 8; /* rough approx bonus */
410
428
  elt = table[u];
429
+ /* sort : improve rank */
411
430
  while ((u>1) && (table[u-1].savings < elt.savings))
412
- table[u] = table[u-1], u--;
431
+ table[u] = table[u-1], u--;
413
432
  table[u] = elt;
414
433
  return u;
415
434
  } }
@@ -417,20 +436,33 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
417
436
  /* front overlap */
418
437
  for (u=1; u<tableSize; u++) {
419
438
  if (u==eltNbToSkip) continue;
420
- if ((table[u].pos + table[u].length > elt.pos) && (table[u].pos < elt.pos)) { /* overlap */
439
+
440
+ if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
421
441
  /* append */
422
- int addedLength = (elt.pos + elt.length) - (table[u].pos + table[u].length);
423
- table[u].savings += elt.length / 8; /* rough approx */
424
- if (addedLength > 0) { /* otherwise, already included */
442
+ int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
443
+ table[u].savings += elt.length / 8; /* rough approx bonus */
444
+ if (addedLength > 0) { /* otherwise, elt fully included into existing */
425
445
  table[u].length += addedLength;
426
446
  table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
427
447
  }
448
+ /* sort : improve rank */
428
449
  elt = table[u];
429
450
  while ((u>1) && (table[u-1].savings < elt.savings))
430
451
  table[u] = table[u-1], u--;
431
452
  table[u] = elt;
432
453
  return u;
433
- } }
454
+ }
455
+
456
+ if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
457
+ if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
458
+ size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
459
+ table[u].pos = elt.pos;
460
+ table[u].savings += (U32)(elt.savings * addedLength / elt.length);
461
+ table[u].length = MIN(elt.length, table[u].length + 1);
462
+ return u;
463
+ }
464
+ }
465
+ }
434
466
 
435
467
  return 0;
436
468
  }
@@ -438,8 +470,8 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
438
470
 
439
471
  static void ZDICT_removeDictItem(dictItem* table, U32 id)
440
472
  {
441
- /* convention : first element is nb of elts */
442
- U32 const max = table->pos;
473
+ /* convention : table[0].pos stores nb of elts */
474
+ U32 const max = table[0].pos;
443
475
  U32 u;
444
476
  if (!id) return; /* protection, should never happen */
445
477
  for (u=id; u<max-1; u++)
@@ -448,14 +480,14 @@ static void ZDICT_removeDictItem(dictItem* table, U32 id)
448
480
  }
449
481
 
450
482
 
451
- static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
483
+ static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
452
484
  {
453
485
  /* merge if possible */
454
- U32 mergeId = ZDICT_checkMerge(table, elt, 0);
486
+ U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
455
487
  if (mergeId) {
456
488
  U32 newMerge = 1;
457
489
  while (newMerge) {
458
- newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
490
+ newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
459
491
  if (newMerge) ZDICT_removeDictItem(table, mergeId);
460
492
  mergeId = newMerge;
461
493
  }
@@ -486,18 +518,24 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
486
518
  }
487
519
 
488
520
 
489
- static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
521
+ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
490
522
  const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
491
523
  const size_t* fileSizes, unsigned nbFiles,
492
- U32 shiftRatio, unsigned maxDictSize)
524
+ unsigned minRatio, U32 notificationLevel)
493
525
  {
494
526
  int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
495
527
  int* const suffix = suffix0+1;
496
528
  U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
497
529
  BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
498
530
  U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
499
- U32 minRatio = nbFiles >> shiftRatio;
500
531
  size_t result = 0;
532
+ clock_t displayClock = 0;
533
+ clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
534
+
535
+ # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
536
+ if (ZDICT_clockSpan(displayClock) > refreshRate) \
537
+ { displayClock = clock(); DISPLAY(__VA_ARGS__); \
538
+ if (notificationLevel>=4) fflush(stderr); } }
501
539
 
502
540
  /* init */
503
541
  DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
@@ -509,11 +547,11 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
509
547
  memset(doneMarks, 0, bufferSize+16);
510
548
 
511
549
  /* limit sample set size (divsufsort limitation)*/
512
- if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20));
550
+ if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20));
513
551
  while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
514
552
 
515
553
  /* sort */
516
- DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
554
+ DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20));
517
555
  { int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
518
556
  if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
519
557
  }
@@ -523,7 +561,8 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
523
561
  { size_t pos;
524
562
  for (pos=0; pos < bufferSize; pos++)
525
563
  reverseSuffix[suffix[pos]] = (U32)pos;
526
- /* build file pos */
564
+ /* note filePos tracks borders between samples.
565
+ It's not used at this stage, but planned to become useful in a later update */
527
566
  filePos[0] = 0;
528
567
  for (pos=1; pos<nbFiles; pos++)
529
568
  filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
@@ -535,23 +574,13 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
535
574
  { U32 cursor; for (cursor=0; cursor < bufferSize; ) {
536
575
  dictItem solution;
537
576
  if (doneMarks[cursor]) { cursor++; continue; }
538
- solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
577
+ solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
539
578
  if (solution.length==0) { cursor++; continue; }
540
- ZDICT_insertDictItem(dictList, dictListSize, solution);
579
+ ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
541
580
  cursor += solution.length;
542
581
  DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
543
582
  } }
544
583
 
545
- /* limit dictionary size */
546
- { U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
547
- U32 currentSize = 0;
548
- U32 n; for (n=1; n<max; n++) {
549
- currentSize += dictList[n].length;
550
- if (currentSize > maxDictSize) break;
551
- }
552
- dictList->pos = n;
553
- }
554
-
555
584
  _cleanup:
556
585
  free(suffix0);
557
586
  free(reverseSuffix);
@@ -563,10 +592,12 @@ _cleanup:
563
592
 
564
593
  static void ZDICT_fillNoise(void* buffer, size_t length)
565
594
  {
566
- unsigned acc = PRIME1;
567
- size_t p=0;;
595
+ unsigned const prime1 = 2654435761U;
596
+ unsigned const prime2 = 2246822519U;
597
+ unsigned acc = prime1;
598
+ size_t p=0;
568
599
  for (p=0; p<length; p++) {
569
- acc *= PRIME2;
600
+ acc *= prime2;
570
601
  ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
571
602
  }
572
603
  }
@@ -574,29 +605,31 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
574
605
 
575
606
  typedef struct
576
607
  {
577
- ZSTD_CCtx* ref;
578
- ZSTD_CCtx* zc;
608
+ ZSTD_CDict* dict; /* dictionary */
609
+ ZSTD_CCtx* zc; /* working context */
579
610
  void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
580
611
  } EStats_ress_t;
581
612
 
582
613
  #define MAXREPOFFSET 1024
583
614
 
584
- static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
585
- U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
586
- const void* src, size_t srcSize)
615
+ static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
616
+ unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
617
+ const void* src, size_t srcSize,
618
+ U32 notificationLevel)
587
619
  {
588
- size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
620
+ size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
589
621
  size_t cSize;
590
622
 
591
623
  if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
592
- { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref);
593
- if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
594
- }
624
+ { size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
625
+ if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
626
+
627
+ }
595
628
  cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
596
- if (ZSTD_isError(cSize)) { DISPLAYLEVEL(1, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
629
+ if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
597
630
 
598
631
  if (cSize) { /* if == 0; block is not compressible */
599
- const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
632
+ const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
600
633
 
601
634
  /* literals stats */
602
635
  { const BYTE* bytePtr;
@@ -605,46 +638,34 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
605
638
  }
606
639
 
607
640
  /* seqStats */
608
- { size_t const nbSeq = (size_t)(seqStorePtr->offset - seqStorePtr->offsetStart);
609
- ZSTD_seqToCodes(seqStorePtr, nbSeq);
641
+ { U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
642
+ ZSTD_seqToCodes(seqStorePtr);
610
643
 
611
- { const BYTE* codePtr = seqStorePtr->offCodeStart;
612
- size_t u;
644
+ { const BYTE* codePtr = seqStorePtr->ofCode;
645
+ U32 u;
613
646
  for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
614
647
  }
615
648
 
616
- { const BYTE* codePtr = seqStorePtr->mlCodeStart;
617
- size_t u;
649
+ { const BYTE* codePtr = seqStorePtr->mlCode;
650
+ U32 u;
618
651
  for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
619
652
  }
620
653
 
621
- { const BYTE* codePtr = seqStorePtr->llCodeStart;
622
- size_t u;
654
+ { const BYTE* codePtr = seqStorePtr->llCode;
655
+ U32 u;
623
656
  for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
624
- } }
625
-
626
- /* rep offsets */
627
- { const U32* const offsetPtr = seqStorePtr->offsetStart;
628
- U32 offset1 = offsetPtr[0] - 3;
629
- U32 offset2 = offsetPtr[1] - 3;
630
- if (offset1 >= MAXREPOFFSET) offset1 = 0;
631
- if (offset2 >= MAXREPOFFSET) offset2 = 0;
632
- repOffsets[offset1] += 3;
633
- repOffsets[offset2] += 1;
634
- }
635
- }
636
- }
657
+ }
637
658
 
638
- /*
639
- static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
640
- {
641
- unsigned u;
642
- size_t max=0;
643
- for (u=0; u<nbFiles; u++)
644
- if (max < fileSizes[u]) max = fileSizes[u];
645
- return max;
659
+ if (nbSeq >= 2) { /* rep offsets */
660
+ const seqDef* const seq = seqStorePtr->sequencesStart;
661
+ U32 offset1 = seq[0].offset - 3;
662
+ U32 offset2 = seq[1].offset - 3;
663
+ if (offset1 >= MAXREPOFFSET) offset1 = 0;
664
+ if (offset2 >= MAXREPOFFSET) offset2 = 0;
665
+ repOffsets[offset1] += 3;
666
+ repOffsets[offset2] += 1;
667
+ } } }
646
668
  }
647
- */
648
669
 
649
670
  static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
650
671
  {
@@ -670,72 +691,92 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
670
691
  }
671
692
  }
672
693
 
694
+ /* ZDICT_flatLit() :
695
+ * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
696
+ * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
697
+ */
698
+ static void ZDICT_flatLit(unsigned* countLit)
699
+ {
700
+ int u;
701
+ for (u=1; u<256; u++) countLit[u] = 2;
702
+ countLit[0] = 4;
703
+ countLit[253] = 1;
704
+ countLit[254] = 1;
705
+ }
673
706
 
674
- #define OFFCODE_MAX 18 /* only applicable to first block */
707
+ #define OFFCODE_MAX 30 /* only applicable to first block */
675
708
  static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
676
- unsigned compressionLevel,
677
- const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
678
- const void* dictBuffer, size_t dictBufferSize)
709
+ unsigned compressionLevel,
710
+ const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
711
+ const void* dictBuffer, size_t dictBufferSize,
712
+ unsigned notificationLevel)
679
713
  {
680
- U32 countLit[256];
714
+ unsigned countLit[256];
681
715
  HUF_CREATE_STATIC_CTABLE(hufTable, 255);
682
- U32 offcodeCount[OFFCODE_MAX+1];
716
+ unsigned offcodeCount[OFFCODE_MAX+1];
683
717
  short offcodeNCount[OFFCODE_MAX+1];
684
- U32 matchLengthCount[MaxML+1];
718
+ U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
719
+ unsigned matchLengthCount[MaxML+1];
685
720
  short matchLengthNCount[MaxML+1];
686
- U32 litLengthCount[MaxLL+1];
721
+ unsigned litLengthCount[MaxLL+1];
687
722
  short litLengthNCount[MaxLL+1];
688
- U32 repOffset[MAXREPOFFSET] = { 0 };
723
+ U32 repOffset[MAXREPOFFSET];
689
724
  offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
690
- EStats_ress_t esr;
725
+ EStats_ress_t esr = { NULL, NULL, NULL };
691
726
  ZSTD_parameters params;
692
- U32 u, huffLog = 12, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
727
+ U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
693
728
  size_t pos = 0, errorCode;
694
729
  size_t eSize = 0;
695
730
  size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
696
- size_t const averageSampleSize = totalSrcSize / nbFiles;
731
+ size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
697
732
  BYTE* dstPtr = (BYTE*)dstBuffer;
698
733
 
699
734
  /* init */
700
- for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
701
- for (u=0; u<=OFFCODE_MAX; u++) offcodeCount[u]=1;
702
- for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
703
- for (u=0; u<=MaxLL; u++) litLengthCount[u]=1;
735
+ DEBUGLOG(4, "ZDICT_analyzeEntropy");
736
+ if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */
737
+ for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */
738
+ for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
739
+ for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
740
+ for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
741
+ memset(repOffset, 0, sizeof(repOffset));
704
742
  repOffset[1] = repOffset[4] = repOffset[8] = 1;
705
743
  memset(bestRepOffset, 0, sizeof(bestRepOffset));
706
- esr.ref = ZSTD_createCCtx();
744
+ if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
745
+ params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
746
+
747
+ esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
707
748
  esr.zc = ZSTD_createCCtx();
708
749
  esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
709
- if (!esr.ref || !esr.zc || !esr.workPlace) {
710
- eSize = ERROR(memory_allocation);
711
- DISPLAYLEVEL(1, "Not enough memory");
712
- goto _cleanup;
750
+ if (!esr.dict || !esr.zc || !esr.workPlace) {
751
+ eSize = ERROR(memory_allocation);
752
+ DISPLAYLEVEL(1, "Not enough memory \n");
753
+ goto _cleanup;
713
754
  }
714
- if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
715
- params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
716
- { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
717
- if (ZSTD_isError(beginResult)) {
718
- eSize = ERROR(GENERIC);
719
- DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed ");
720
- goto _cleanup;
721
- } }
722
-
723
- /* collect stats on all files */
755
+
756
+ /* collect stats on all samples */
724
757
  for (u=0; u<nbFiles; u++) {
725
- ZDICT_countEStats(esr, params,
726
- countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
727
- (const char*)srcBuffer + pos, fileSizes[u]);
758
+ ZDICT_countEStats(esr, &params,
759
+ countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
760
+ (const char*)srcBuffer + pos, fileSizes[u],
761
+ notificationLevel);
728
762
  pos += fileSizes[u];
729
763
  }
730
764
 
731
- /* analyze */
732
- errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
733
- if (HUF_isError(errorCode)) {
734
- eSize = ERROR(GENERIC);
735
- DISPLAYLEVEL(1, "HUF_buildCTable error");
736
- goto _cleanup;
765
+ /* analyze, build stats, starting with literals */
766
+ { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
767
+ if (HUF_isError(maxNbBits)) {
768
+ eSize = maxNbBits;
769
+ DISPLAYLEVEL(1, " HUF_buildCTable error \n");
770
+ goto _cleanup;
771
+ }
772
+ if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
773
+ DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
774
+ ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
775
+ maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
776
+ assert(maxNbBits==9);
777
+ }
778
+ huffLog = (U32)maxNbBits;
737
779
  }
738
- huffLog = (U32)errorCode;
739
780
 
740
781
  /* looking for most common first offsets */
741
782
  { U32 offset;
@@ -744,11 +785,11 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
744
785
  }
745
786
  /* note : the result of this phase should be used to better appreciate the impact on statistics */
746
787
 
747
- total=0; for (u=0; u<=OFFCODE_MAX; u++) total+=offcodeCount[u];
748
- errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, OFFCODE_MAX);
788
+ total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
789
+ errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
749
790
  if (FSE_isError(errorCode)) {
750
- eSize = ERROR(GENERIC);
751
- DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount");
791
+ eSize = errorCode;
792
+ DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
752
793
  goto _cleanup;
753
794
  }
754
795
  Offlog = (U32)errorCode;
@@ -756,8 +797,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
756
797
  total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
757
798
  errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
758
799
  if (FSE_isError(errorCode)) {
759
- eSize = ERROR(GENERIC);
760
- DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount");
800
+ eSize = errorCode;
801
+ DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
761
802
  goto _cleanup;
762
803
  }
763
804
  mlLog = (U32)errorCode;
@@ -765,18 +806,17 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
765
806
  total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
766
807
  errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
767
808
  if (FSE_isError(errorCode)) {
768
- eSize = ERROR(GENERIC);
769
- DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount");
809
+ eSize = errorCode;
810
+ DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
770
811
  goto _cleanup;
771
812
  }
772
813
  llLog = (U32)errorCode;
773
814
 
774
-
775
815
  /* write result to buffer */
776
816
  { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
777
817
  if (HUF_isError(hhSize)) {
778
- eSize = ERROR(GENERIC);
779
- DISPLAYLEVEL(1, "HUF_writeCTable error");
818
+ eSize = hhSize;
819
+ DISPLAYLEVEL(1, "HUF_writeCTable error \n");
780
820
  goto _cleanup;
781
821
  }
782
822
  dstPtr += hhSize;
@@ -786,8 +826,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
786
826
 
787
827
  { size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
788
828
  if (FSE_isError(ohSize)) {
789
- eSize = ERROR(GENERIC);
790
- DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount");
829
+ eSize = ohSize;
830
+ DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
791
831
  goto _cleanup;
792
832
  }
793
833
  dstPtr += ohSize;
@@ -797,8 +837,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
797
837
 
798
838
  { size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
799
839
  if (FSE_isError(mhSize)) {
800
- eSize = ERROR(GENERIC);
801
- DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount");
840
+ eSize = mhSize;
841
+ DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
802
842
  goto _cleanup;
803
843
  }
804
844
  dstPtr += mhSize;
@@ -808,8 +848,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
808
848
 
809
849
  { size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
810
850
  if (FSE_isError(lhSize)) {
811
- eSize = ERROR(GENERIC);
812
- DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount");
851
+ eSize = lhSize;
852
+ DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
813
853
  goto _cleanup;
814
854
  }
815
855
  dstPtr += lhSize;
@@ -818,8 +858,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
818
858
  }
819
859
 
820
860
  if (maxDstSize<12) {
821
- eSize = ERROR(GENERIC);
822
- DISPLAYLEVEL(1, "not enough space to write RepOffsets");
861
+ eSize = ERROR(dstSize_tooSmall);
862
+ DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
823
863
  goto _cleanup;
824
864
  }
825
865
  # if 0
@@ -833,11 +873,10 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
833
873
  MEM_writeLE32(dstPtr+4, repStartValue[1]);
834
874
  MEM_writeLE32(dstPtr+8, repStartValue[2]);
835
875
  #endif
836
- dstPtr += 12;
837
876
  eSize += 12;
838
877
 
839
878
  _cleanup:
840
- ZSTD_freeCCtx(esr.ref);
879
+ ZSTD_freeCDict(esr.dict);
841
880
  ZSTD_freeCCtx(esr.zc);
842
881
  free(esr.workPlace);
843
882
 
@@ -845,129 +884,180 @@ _cleanup:
845
884
  }
846
885
 
847
886
 
848
- #define DIB_FASTSEGMENTSIZE 64
849
- /*! ZDICT_fastSampling() (based on an idea proposed by Giuseppe Ottaviano) :
850
- Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer`,
851
- up to `dictSize`.
852
- Filling starts from the end of `dictBuffer`, down to maximum possible.
853
- if `dictSize` is not a multiply of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used.
854
- @return : amount of data written into `dictBuffer`,
855
- or an error code
856
- */
857
- static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize,
858
- const void* samplesBuffer, size_t samplesSize)
859
- {
860
- char* dstPtr = (char*)dictBuffer + dictSize;
861
- const char* srcPtr = (const char*)samplesBuffer;
862
- size_t const nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
863
- size_t segNb, interSize;
864
-
865
- if (nbSegments <= 2) return ERROR(srcSize_wrong);
866
- if (samplesSize < dictSize) return ERROR(srcSize_wrong);
867
-
868
- /* first and last segments are part of dictionary, in case they contain interesting header/footer */
869
- dstPtr -= DIB_FASTSEGMENTSIZE;
870
- memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
871
- dstPtr -= DIB_FASTSEGMENTSIZE;
872
- memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
873
-
874
- /* regularly copy a segment */
875
- interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);
876
- srcPtr += DIB_FASTSEGMENTSIZE;
877
- for (segNb=2; segNb < nbSegments; segNb++) {
878
- srcPtr += interSize;
879
- dstPtr -= DIB_FASTSEGMENTSIZE;
880
- memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
881
- srcPtr += DIB_FASTSEGMENTSIZE;
882
- }
883
-
884
- return nbSegments * DIB_FASTSEGMENTSIZE;
885
- }
886
887
 
887
- size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
888
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
889
- ZDICT_params_t params)
888
+ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
889
+ const void* customDictContent, size_t dictContentSize,
890
+ const void* samplesBuffer, const size_t* samplesSizes,
891
+ unsigned nbSamples, ZDICT_params_t params)
890
892
  {
891
893
  size_t hSize;
892
- unsigned const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
894
+ #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
895
+ BYTE header[HBUFFSIZE];
896
+ int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
897
+ U32 const notificationLevel = params.notificationLevel;
898
+
899
+ /* check conditions */
900
+ DEBUGLOG(4, "ZDICT_finalizeDictionary");
901
+ if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
902
+ if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
903
+ if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
893
904
 
894
905
  /* dictionary header */
895
- MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
896
- { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
906
+ MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
907
+ { U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
897
908
  U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
898
909
  U32 const dictID = params.dictID ? params.dictID : compliantID;
899
- MEM_writeLE32((char*)dictBuffer+4, dictID);
910
+ MEM_writeLE32(header+4, dictID);
900
911
  }
901
912
  hSize = 8;
902
913
 
903
914
  /* entropy tables */
904
915
  DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
905
916
  DISPLAYLEVEL(2, "statistics ... \n");
906
- hSize += ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
917
+ { size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize,
918
+ compressionLevel,
919
+ samplesBuffer, samplesSizes, nbSamples,
920
+ customDictContent, dictContentSize,
921
+ notificationLevel);
922
+ if (ZDICT_isError(eSize)) return eSize;
923
+ hSize += eSize;
924
+ }
925
+
926
+ /* copy elements in final buffer ; note : src and dst buffer can overlap */
927
+ if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize;
928
+ { size_t const dictSize = hSize + dictContentSize;
929
+ char* dictEnd = (char*)dictBuffer + dictSize;
930
+ memmove(dictEnd - dictContentSize, customDictContent, dictContentSize);
931
+ memcpy(dictBuffer, header, hSize);
932
+ return dictSize;
933
+ }
934
+ }
935
+
936
+
937
+ static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
938
+ void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
939
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
940
+ ZDICT_params_t params)
941
+ {
942
+ int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
943
+ U32 const notificationLevel = params.notificationLevel;
944
+ size_t hSize = 8;
945
+
946
+ /* calculate entropy tables */
947
+ DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
948
+ DISPLAYLEVEL(2, "statistics ... \n");
949
+ { size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
907
950
  compressionLevel,
908
951
  samplesBuffer, samplesSizes, nbSamples,
909
- (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
952
+ (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,
953
+ notificationLevel);
954
+ if (ZDICT_isError(eSize)) return eSize;
955
+ hSize += eSize;
956
+ }
957
+
958
+ /* add dictionary header (after entropy tables) */
959
+ MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
960
+ { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
961
+ U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
962
+ U32 const dictID = params.dictID ? params.dictID : compliantID;
963
+ MEM_writeLE32((char*)dictBuffer+4, dictID);
964
+ }
910
965
 
911
966
  if (hSize + dictContentSize < dictBufferCapacity)
912
967
  memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
913
968
  return MIN(dictBufferCapacity, hSize+dictContentSize);
914
969
  }
915
970
 
916
-
917
- #define DIB_MINSAMPLESSIZE (DIB_FASTSEGMENTSIZE*3)
918
- /*! ZDICT_trainFromBuffer_unsafe() :
919
- * `samplesBuffer` must be followed by noisy guard band.
920
- * @return : size of dictionary.
971
+ /* Hidden declaration for dbio.c */
972
+ size_t ZDICT_trainFromBuffer_unsafe_legacy(
973
+ void* dictBuffer, size_t maxDictSize,
974
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
975
+ ZDICT_legacy_params_t params);
976
+ /*! ZDICT_trainFromBuffer_unsafe_legacy() :
977
+ * Warning : `samplesBuffer` must be followed by noisy guard band.
978
+ * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
921
979
  */
922
- size_t ZDICT_trainFromBuffer_unsafe(
980
+ size_t ZDICT_trainFromBuffer_unsafe_legacy(
923
981
  void* dictBuffer, size_t maxDictSize,
924
982
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
925
- ZDICT_params_t params)
983
+ ZDICT_legacy_params_t params)
926
984
  {
927
- U32 const dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16));
985
+ U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
928
986
  dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
929
- unsigned selectivity = params.selectivityLevel;
987
+ unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
988
+ unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
930
989
  size_t const targetDictSize = maxDictSize;
931
- size_t sBuffSize;
990
+ size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
932
991
  size_t dictSize = 0;
992
+ U32 const notificationLevel = params.zParams.notificationLevel;
933
993
 
934
994
  /* checks */
935
995
  if (!dictList) return ERROR(memory_allocation);
936
- if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
996
+ if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */
997
+ if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */
937
998
 
938
999
  /* init */
939
- { unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
940
- if (sBuffSize < DIB_MINSAMPLESSIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
941
1000
  ZDICT_initDictItem(dictList);
942
- g_displayLevel = params.notificationLevel;
943
- if (selectivity==0) selectivity = g_selectivity_default;
944
1001
 
945
1002
  /* build dictionary */
946
- if (selectivity>1) { /* selectivity == 1 => fast mode */
947
- ZDICT_trainBuffer(dictList, dictListSize,
948
- samplesBuffer, sBuffSize,
949
- samplesSizes, nbSamples,
950
- selectivity, (U32)targetDictSize);
951
-
952
- /* display best matches */
953
- if (g_displayLevel>= 3) {
954
- U32 const nb = 25;
955
- U32 const dictContentSize = ZDICT_dictSize(dictList);
956
- U32 u;
957
- DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
958
- DISPLAYLEVEL(3, "list %u best segments \n", nb);
959
- for (u=1; u<=nb; u++) {
960
- U32 p = dictList[u].pos;
961
- U32 l = dictList[u].length;
962
- U32 d = MIN(40, l);
963
- DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
964
- u, l, p, dictList[u].savings);
965
- ZDICT_printHex(3, (const char*)samplesBuffer+p, d);
966
- DISPLAYLEVEL(3, "| \n");
967
- } } }
1003
+ ZDICT_trainBuffer_legacy(dictList, dictListSize,
1004
+ samplesBuffer, samplesBuffSize,
1005
+ samplesSizes, nbSamples,
1006
+ minRep, notificationLevel);
1007
+
1008
+ /* display best matches */
1009
+ if (params.zParams.notificationLevel>= 3) {
1010
+ unsigned const nb = MIN(25, dictList[0].pos);
1011
+ unsigned const dictContentSize = ZDICT_dictSize(dictList);
1012
+ unsigned u;
1013
+ DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize);
1014
+ DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
1015
+ for (u=1; u<nb; u++) {
1016
+ unsigned const pos = dictList[u].pos;
1017
+ unsigned const length = dictList[u].length;
1018
+ U32 const printedLength = MIN(40, length);
1019
+ if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
1020
+ free(dictList);
1021
+ return ERROR(GENERIC); /* should never happen */
1022
+ }
1023
+ DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
1024
+ u, length, pos, (unsigned)dictList[u].savings);
1025
+ ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
1026
+ DISPLAYLEVEL(3, "| \n");
1027
+ } }
1028
+
968
1029
 
969
1030
  /* create dictionary */
970
- { U32 dictContentSize = ZDICT_dictSize(dictList);
1031
+ { unsigned dictContentSize = ZDICT_dictSize(dictList);
1032
+ if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */
1033
+ if (dictContentSize < targetDictSize/4) {
1034
+ DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize);
1035
+ if (samplesBuffSize < 10 * targetDictSize)
1036
+ DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20));
1037
+ if (minRep > MINRATIO) {
1038
+ DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
1039
+ DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
1040
+ }
1041
+ }
1042
+
1043
+ if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
1044
+ unsigned proposedSelectivity = selectivity-1;
1045
+ while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
1046
+ DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize);
1047
+ DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
1048
+ DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");
1049
+ }
1050
+
1051
+ /* limit dictionary size */
1052
+ { U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
1053
+ U32 currentSize = 0;
1054
+ U32 n; for (n=1; n<max; n++) {
1055
+ currentSize += dictList[n].length;
1056
+ if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }
1057
+ }
1058
+ dictList->pos = n;
1059
+ dictContentSize = currentSize;
1060
+ }
971
1061
 
972
1062
  /* build dict content */
973
1063
  { U32 u;
@@ -979,17 +1069,9 @@ size_t ZDICT_trainFromBuffer_unsafe(
979
1069
  memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
980
1070
  } }
981
1071
 
982
- /* fast mode dict content */
983
- if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */
984
- DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */
985
- DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10));
986
- dictContentSize = (U32)ZDICT_fastSampling(dictBuffer, targetDictSize,
987
- samplesBuffer, sBuffSize);
988
- }
989
-
990
1072
  dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
991
1073
  samplesBuffer, samplesSizes, nbSamples,
992
- params);
1074
+ params.zParams);
993
1075
  }
994
1076
 
995
1077
  /* clean up */
@@ -998,44 +1080,52 @@ size_t ZDICT_trainFromBuffer_unsafe(
998
1080
  }
999
1081
 
1000
1082
 
1001
- /* issue : samplesBuffer need to be followed by a noisy guard band.
1002
- * work around : duplicate the buffer, and add the noise */
1003
- size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
1004
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
1005
- ZDICT_params_t params)
1083
+ /* ZDICT_trainFromBuffer_legacy() :
1084
+ * issue : samplesBuffer needs to be followed by a noisy guard band.
1085
+ * work around : duplicate the buffer, and add the noise */
1086
+ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
1087
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
1088
+ ZDICT_legacy_params_t params)
1006
1089
  {
1090
+ size_t result;
1007
1091
  void* newBuff;
1008
- size_t sBuffSize;
1092
+ size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
1093
+ if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */
1009
1094
 
1010
- { unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
1011
- if (sBuffSize==0) return 0; /* empty content => no dictionary */
1012
1095
  newBuff = malloc(sBuffSize + NOISELENGTH);
1013
1096
  if (!newBuff) return ERROR(memory_allocation);
1014
1097
 
1015
1098
  memcpy(newBuff, samplesBuffer, sBuffSize);
1016
1099
  ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
1017
1100
 
1018
- { size_t const result = ZDICT_trainFromBuffer_unsafe(
1019
- dictBuffer, dictBufferCapacity,
1020
- newBuff, samplesSizes, nbSamples,
1021
- params);
1022
- free(newBuff);
1023
- return result; }
1101
+ result =
1102
+ ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
1103
+ samplesSizes, nbSamples, params);
1104
+ free(newBuff);
1105
+ return result;
1024
1106
  }
1025
1107
 
1026
1108
 
1027
1109
  size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
1028
1110
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1029
1111
  {
1030
- ZDICT_params_t params;
1112
+ ZDICT_fastCover_params_t params;
1113
+ DEBUGLOG(3, "ZDICT_trainFromBuffer");
1031
1114
  memset(&params, 0, sizeof(params));
1032
- return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
1033
- samplesBuffer, samplesSizes, nbSamples,
1034
- params);
1115
+ params.d = 8;
1116
+ params.steps = 4;
1117
+ /* Default to level 3 since no compression level information is available */
1118
+ params.zParams.compressionLevel = 3;
1119
+ #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
1120
+ params.zParams.notificationLevel = DEBUGLEVEL;
1121
+ #endif
1122
+ return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
1123
+ samplesBuffer, samplesSizes, nbSamples,
1124
+ &params);
1035
1125
  }
1036
1126
 
1037
1127
  size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
1038
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1128
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1039
1129
  {
1040
1130
  ZDICT_params_t params;
1041
1131
  memset(&params, 0, sizeof(params));