extzstd 0.0.3.CONCEPT → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. checksums.yaml +5 -5
  2. data/HISTORY.ja.md +39 -0
  3. data/LICENSE +6 -6
  4. data/README.md +26 -45
  5. data/contrib/zstd/CHANGELOG +555 -0
  6. data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
  7. data/contrib/zstd/CONTRIBUTING.md +392 -0
  8. data/contrib/zstd/COPYING +339 -0
  9. data/contrib/zstd/LICENSE +13 -9
  10. data/contrib/zstd/Makefile +414 -0
  11. data/contrib/zstd/README.md +170 -45
  12. data/contrib/zstd/TESTING.md +44 -0
  13. data/contrib/zstd/appveyor.yml +289 -0
  14. data/contrib/zstd/lib/BUCK +234 -0
  15. data/contrib/zstd/lib/Makefile +354 -0
  16. data/contrib/zstd/lib/README.md +179 -0
  17. data/contrib/zstd/{common → lib/common}/bitstream.h +170 -130
  18. data/contrib/zstd/lib/common/compiler.h +175 -0
  19. data/contrib/zstd/lib/common/cpu.h +215 -0
  20. data/contrib/zstd/lib/common/debug.c +24 -0
  21. data/contrib/zstd/lib/common/debug.h +114 -0
  22. data/contrib/zstd/{common → lib/common}/entropy_common.c +79 -94
  23. data/contrib/zstd/lib/common/error_private.c +55 -0
  24. data/contrib/zstd/lib/common/error_private.h +80 -0
  25. data/contrib/zstd/{common → lib/common}/fse.h +153 -93
  26. data/contrib/zstd/{common → lib/common}/fse_decompress.c +37 -82
  27. data/contrib/zstd/lib/common/huf.h +340 -0
  28. data/contrib/zstd/{common → lib/common}/mem.h +154 -78
  29. data/contrib/zstd/lib/common/pool.c +344 -0
  30. data/contrib/zstd/lib/common/pool.h +84 -0
  31. data/contrib/zstd/lib/common/threading.c +121 -0
  32. data/contrib/zstd/lib/common/threading.h +155 -0
  33. data/contrib/zstd/{common → lib/common}/xxhash.c +85 -75
  34. data/contrib/zstd/{common → lib/common}/xxhash.h +85 -73
  35. data/contrib/zstd/lib/common/zstd_common.c +83 -0
  36. data/contrib/zstd/lib/common/zstd_errors.h +94 -0
  37. data/contrib/zstd/lib/common/zstd_internal.h +447 -0
  38. data/contrib/zstd/{compress → lib/compress}/fse_compress.c +194 -303
  39. data/contrib/zstd/lib/compress/hist.c +183 -0
  40. data/contrib/zstd/lib/compress/hist.h +75 -0
  41. data/contrib/zstd/lib/compress/huf_compress.c +798 -0
  42. data/contrib/zstd/lib/compress/zstd_compress.c +4278 -0
  43. data/contrib/zstd/lib/compress/zstd_compress_internal.h +1125 -0
  44. data/contrib/zstd/lib/compress/zstd_compress_literals.c +158 -0
  45. data/contrib/zstd/lib/compress/zstd_compress_literals.h +29 -0
  46. data/contrib/zstd/lib/compress/zstd_compress_sequences.c +419 -0
  47. data/contrib/zstd/lib/compress/zstd_compress_sequences.h +54 -0
  48. data/contrib/zstd/lib/compress/zstd_compress_superblock.c +845 -0
  49. data/contrib/zstd/lib/compress/zstd_compress_superblock.h +32 -0
  50. data/contrib/zstd/lib/compress/zstd_cwksp.h +525 -0
  51. data/contrib/zstd/lib/compress/zstd_double_fast.c +521 -0
  52. data/contrib/zstd/lib/compress/zstd_double_fast.h +38 -0
  53. data/contrib/zstd/lib/compress/zstd_fast.c +496 -0
  54. data/contrib/zstd/lib/compress/zstd_fast.h +37 -0
  55. data/contrib/zstd/lib/compress/zstd_lazy.c +1138 -0
  56. data/contrib/zstd/lib/compress/zstd_lazy.h +67 -0
  57. data/contrib/zstd/lib/compress/zstd_ldm.c +619 -0
  58. data/contrib/zstd/lib/compress/zstd_ldm.h +110 -0
  59. data/contrib/zstd/lib/compress/zstd_opt.c +1200 -0
  60. data/contrib/zstd/lib/compress/zstd_opt.h +56 -0
  61. data/contrib/zstd/lib/compress/zstdmt_compress.c +2143 -0
  62. data/contrib/zstd/lib/compress/zstdmt_compress.h +192 -0
  63. data/contrib/zstd/lib/decompress/huf_decompress.c +1248 -0
  64. data/contrib/zstd/lib/decompress/zstd_ddict.c +244 -0
  65. data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
  66. data/contrib/zstd/lib/decompress/zstd_decompress.c +1885 -0
  67. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1432 -0
  68. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +59 -0
  69. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +189 -0
  70. data/contrib/zstd/{common → lib/deprecated}/zbuff.h +86 -69
  71. data/contrib/zstd/lib/deprecated/zbuff_common.c +26 -0
  72. data/contrib/zstd/lib/deprecated/zbuff_compress.c +147 -0
  73. data/contrib/zstd/lib/deprecated/zbuff_decompress.c +75 -0
  74. data/contrib/zstd/lib/dictBuilder/cover.c +1236 -0
  75. data/contrib/zstd/lib/dictBuilder/cover.h +157 -0
  76. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.c +3 -3
  77. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.h +5 -5
  78. data/contrib/zstd/lib/dictBuilder/fastcover.c +757 -0
  79. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/zdict.c +437 -347
  80. data/contrib/zstd/lib/dictBuilder/zdict.h +305 -0
  81. data/contrib/zstd/lib/legacy/zstd_legacy.h +415 -0
  82. data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.c +272 -292
  83. data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.h +26 -32
  84. data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.c +162 -392
  85. data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.h +26 -32
  86. data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.c +162 -391
  87. data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.h +27 -33
  88. data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.c +195 -604
  89. data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.h +26 -32
  90. data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.c +300 -575
  91. data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.h +22 -31
  92. data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.c +165 -592
  93. data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.h +54 -67
  94. data/contrib/zstd/lib/legacy/zstd_v07.c +4541 -0
  95. data/contrib/zstd/lib/legacy/zstd_v07.h +187 -0
  96. data/contrib/zstd/lib/libzstd.pc.in +15 -0
  97. data/contrib/zstd/lib/zstd.h +2090 -0
  98. data/ext/depend +2 -0
  99. data/ext/extconf.rb +18 -5
  100. data/ext/extzstd.c +296 -214
  101. data/ext/extzstd.h +81 -36
  102. data/ext/extzstd_nogvls.h +0 -117
  103. data/ext/extzstd_stream.c +622 -0
  104. data/ext/libzstd_conf.h +8 -0
  105. data/ext/zstd_common.c +11 -0
  106. data/ext/zstd_compress.c +15 -0
  107. data/ext/zstd_decompress.c +6 -0
  108. data/ext/zstd_dictbuilder.c +10 -0
  109. data/ext/zstd_dictbuilder_fastcover.c +3 -0
  110. data/ext/zstd_legacy_v01.c +3 -1
  111. data/ext/zstd_legacy_v02.c +3 -1
  112. data/ext/zstd_legacy_v03.c +3 -1
  113. data/ext/zstd_legacy_v04.c +3 -1
  114. data/ext/zstd_legacy_v05.c +3 -1
  115. data/ext/zstd_legacy_v06.c +3 -1
  116. data/ext/zstd_legacy_v07.c +3 -0
  117. data/gemstub.rb +27 -21
  118. data/lib/extzstd.rb +82 -161
  119. data/lib/extzstd/version.rb +1 -1
  120. data/test/test_basic.rb +19 -6
  121. metadata +127 -59
  122. data/contrib/zstd/common/error_private.h +0 -125
  123. data/contrib/zstd/common/error_public.h +0 -77
  124. data/contrib/zstd/common/huf.h +0 -228
  125. data/contrib/zstd/common/zstd.h +0 -475
  126. data/contrib/zstd/common/zstd_common.c +0 -91
  127. data/contrib/zstd/common/zstd_internal.h +0 -238
  128. data/contrib/zstd/compress/huf_compress.c +0 -577
  129. data/contrib/zstd/compress/zbuff_compress.c +0 -327
  130. data/contrib/zstd/compress/zstd_compress.c +0 -3074
  131. data/contrib/zstd/compress/zstd_opt.h +0 -1046
  132. data/contrib/zstd/decompress/huf_decompress.c +0 -894
  133. data/contrib/zstd/decompress/zbuff_decompress.c +0 -294
  134. data/contrib/zstd/decompress/zstd_decompress.c +0 -1362
  135. data/contrib/zstd/dictBuilder/zdict.h +0 -113
  136. data/contrib/zstd/legacy/zstd_legacy.h +0 -140
  137. data/ext/extzstd_buffered.c +0 -265
  138. data/ext/zstd_amalgam.c +0 -18
@@ -1,40 +1,20 @@
1
1
  /*
2
- dictBuilder - dictionary builder for zstd
3
- Copyright (C) Yann Collet 2016
4
-
5
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
6
-
7
- Redistribution and use in source and binary forms, with or without
8
- modification, are permitted provided that the following conditions are
9
- met:
10
-
11
- * Redistributions of source code must retain the above copyright
12
- notice, this list of conditions and the following disclaimer.
13
- * Redistributions in binary form must reproduce the above
14
- copyright notice, this list of conditions and the following disclaimer
15
- in the documentation and/or other materials provided with the
16
- distribution.
17
-
18
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
-
30
- You can contact the author at :
31
- - Zstd homepage : https://www.zstd.net
32
- */
2
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
33
11
 
34
12
  /*-**************************************
35
13
  * Tuning parameters
36
14
  ****************************************/
15
+ #define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */
37
16
  #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
17
+ #define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
38
18
 
39
19
 
40
20
  /*-**************************************
@@ -57,18 +37,18 @@
57
37
  #include <stdio.h> /* fprintf, fopen, ftello64 */
58
38
  #include <time.h> /* clock */
59
39
 
60
- #include "mem.h" /* read */
61
- #include "error_private.h"
62
- #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
40
+ #include "../common/mem.h" /* read */
41
+ #include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */
63
42
  #define HUF_STATIC_LINKING_ONLY
64
- #include "huf.h"
65
- #include "zstd_internal.h" /* includes zstd.h */
66
- #include "xxhash.h"
43
+ #include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */
44
+ #include "../common/zstd_internal.h" /* includes zstd.h */
45
+ #include "../common/xxhash.h" /* XXH64 */
67
46
  #include "divsufsort.h"
68
47
  #ifndef ZDICT_STATIC_LINKING_ONLY
69
48
  # define ZDICT_STATIC_LINKING_ONLY
70
49
  #endif
71
50
  #include "zdict.h"
51
+ #include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
72
52
 
73
53
 
74
54
  /*-*************************************
@@ -78,43 +58,30 @@
78
58
  #define MB *(1 <<20)
79
59
  #define GB *(1U<<30)
80
60
 
81
- #define DICTLISTSIZE 10000
61
+ #define DICTLISTSIZE_DEFAULT 10000
82
62
 
83
63
  #define NOISELENGTH 32
84
- #define PRIME1 2654435761U
85
- #define PRIME2 2246822519U
86
64
 
87
- #define MINRATIO 4
88
- static const U32 g_compressionLevel_default = 5;
65
+ static const int g_compressionLevel_default = 3;
89
66
  static const U32 g_selectivity_default = 9;
90
- static const size_t g_provision_entropySize = 200;
91
- static const size_t g_min_fast_dictContent = 192;
92
67
 
93
68
 
94
69
  /*-*************************************
95
70
  * Console display
96
71
  ***************************************/
97
72
  #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
98
- #define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
99
- static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
100
-
101
- #define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
102
- if (ZDICT_clockSpan(g_time) > refreshRate) \
103
- { g_time = clock(); DISPLAY(__VA_ARGS__); \
104
- if (g_displayLevel>=4) fflush(stdout); } }
105
- static const clock_t refreshRate = CLOCKS_PER_SEC * 3 / 10;
106
- static clock_t g_time = 0;
73
+ #define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
107
74
 
108
75
  static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
109
76
 
110
- static void ZDICT_printHex(U32 dlevel, const void* ptr, size_t length)
77
+ static void ZDICT_printHex(const void* ptr, size_t length)
111
78
  {
112
79
  const BYTE* const b = (const BYTE*)ptr;
113
80
  size_t u;
114
81
  for (u=0; u<length; u++) {
115
82
  BYTE c = b[u];
116
83
  if (c<32 || c>126) c = '.'; /* non-printable char */
117
- DISPLAYLEVEL(dlevel, "%c", c);
84
+ DISPLAY("%c", c);
118
85
  }
119
86
  }
120
87
 
@@ -126,11 +93,41 @@ unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
126
93
 
127
94
  const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
128
95
 
96
+ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
97
+ {
98
+ if (dictSize < 8) return 0;
99
+ if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
100
+ return MEM_readLE32((const char*)dictBuffer + 4);
101
+ }
102
+
103
+ size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
104
+ {
105
+ size_t headerSize;
106
+ if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
107
+
108
+ { unsigned offcodeMaxValue = MaxOff;
109
+ ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
110
+ U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
111
+ short* offcodeNCount = (short*)malloc((MaxOff+1)*sizeof(short));
112
+ if (!bs || !wksp || !offcodeNCount) {
113
+ headerSize = ERROR(memory_allocation);
114
+ } else {
115
+ ZSTD_reset_compressedBlockState(bs);
116
+ headerSize = ZSTD_loadCEntropy(bs, wksp, offcodeNCount, &offcodeMaxValue, dictBuffer, dictSize);
117
+ }
118
+
119
+ free(bs);
120
+ free(wksp);
121
+ free(offcodeNCount);
122
+ }
123
+
124
+ return headerSize;
125
+ }
129
126
 
130
127
  /*-********************************************************
131
128
  * Dictionary training functions
132
129
  **********************************************************/
133
- static unsigned ZDICT_NbCommonBytes (register size_t val)
130
+ static unsigned ZDICT_NbCommonBytes (size_t val)
134
131
  {
135
132
  if (MEM_isLittleEndian()) {
136
133
  if (MEM_64bits()) {
@@ -228,13 +225,12 @@ static void ZDICT_initDictItem(dictItem* d)
228
225
  static dictItem ZDICT_analyzePos(
229
226
  BYTE* doneMarks,
230
227
  const int* suffix, U32 start,
231
- const void* buffer, U32 minRatio)
228
+ const void* buffer, U32 minRatio, U32 notificationLevel)
232
229
  {
233
230
  U32 lengthList[LLIMIT] = {0};
234
231
  U32 cumulLength[LLIMIT] = {0};
235
232
  U32 savings[LLIMIT] = {0};
236
233
  const BYTE* b = (const BYTE*)buffer;
237
- size_t length;
238
234
  size_t maxLength = LLIMIT;
239
235
  size_t pos = suffix[start];
240
236
  U32 end = start;
@@ -249,26 +245,30 @@ static dictItem ZDICT_analyzePos(
249
245
  ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
250
246
  ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
251
247
  /* skip and mark segment */
252
- U16 u16 = MEM_read16(b+pos+4);
253
- U32 u, e = 6;
254
- while (MEM_read16(b+pos+e) == u16) e+=2 ;
255
- if (b[pos+e] == b[pos+e-1]) e++;
256
- for (u=1; u<e; u++)
248
+ U16 const pattern16 = MEM_read16(b+pos+4);
249
+ U32 u, patternEnd = 6;
250
+ while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
251
+ if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
252
+ for (u=1; u<patternEnd; u++)
257
253
  doneMarks[pos+u] = 1;
258
254
  return solution;
259
255
  }
260
256
 
261
257
  /* look forward */
262
- do {
263
- end++;
264
- length = ZDICT_count(b + pos, b + suffix[end]);
265
- } while (length >=MINMATCHLENGTH);
258
+ { size_t length;
259
+ do {
260
+ end++;
261
+ length = ZDICT_count(b + pos, b + suffix[end]);
262
+ } while (length >= MINMATCHLENGTH);
263
+ }
266
264
 
267
265
  /* look backward */
268
- do {
269
- length = ZDICT_count(b + pos, b + *(suffix+start-1));
270
- if (length >=MINMATCHLENGTH) start--;
271
- } while(length >= MINMATCHLENGTH);
266
+ { size_t length;
267
+ do {
268
+ length = ZDICT_count(b + pos, b + *(suffix+start-1));
269
+ if (length >=MINMATCHLENGTH) start--;
270
+ } while(length >= MINMATCHLENGTH);
271
+ }
272
272
 
273
273
  /* exit if not found a minimum nb of repetitions */
274
274
  if (end-start < minRatio) {
@@ -279,15 +279,15 @@ static dictItem ZDICT_analyzePos(
279
279
  }
280
280
 
281
281
  { int i;
282
- U32 searchLength;
282
+ U32 mml;
283
283
  U32 refinedStart = start;
284
284
  U32 refinedEnd = end;
285
285
 
286
286
  DISPLAYLEVEL(4, "\n");
287
- DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
287
+ DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos);
288
288
  DISPLAYLEVEL(4, "\n");
289
289
 
290
- for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
290
+ for (mml = MINMATCHLENGTH ; ; mml++) {
291
291
  BYTE currentChar = 0;
292
292
  U32 currentCount = 0;
293
293
  U32 currentID = refinedStart;
@@ -295,13 +295,13 @@ static dictItem ZDICT_analyzePos(
295
295
  U32 selectedCount = 0;
296
296
  U32 selectedID = currentID;
297
297
  for (id =refinedStart; id < refinedEnd; id++) {
298
- if (b[ suffix[id] + searchLength] != currentChar) {
298
+ if (b[suffix[id] + mml] != currentChar) {
299
299
  if (currentCount > selectedCount) {
300
300
  selectedCount = currentCount;
301
301
  selectedID = currentID;
302
302
  }
303
303
  currentID = id;
304
- currentChar = b[ suffix[id] + searchLength];
304
+ currentChar = b[ suffix[id] + mml];
305
305
  currentCount = 0;
306
306
  }
307
307
  currentCount ++;
@@ -317,27 +317,31 @@ static dictItem ZDICT_analyzePos(
317
317
  refinedEnd = refinedStart + selectedCount;
318
318
  }
319
319
 
320
- /* evaluate gain based on new ref */
320
+ /* evaluate gain based on new dict */
321
321
  start = refinedStart;
322
322
  pos = suffix[refinedStart];
323
323
  end = start;
324
324
  memset(lengthList, 0, sizeof(lengthList));
325
325
 
326
326
  /* look forward */
327
- do {
328
- end++;
329
- length = ZDICT_count(b + pos, b + suffix[end]);
330
- if (length >= LLIMIT) length = LLIMIT-1;
331
- lengthList[length]++;
332
- } while (length >=MINMATCHLENGTH);
327
+ { size_t length;
328
+ do {
329
+ end++;
330
+ length = ZDICT_count(b + pos, b + suffix[end]);
331
+ if (length >= LLIMIT) length = LLIMIT-1;
332
+ lengthList[length]++;
333
+ } while (length >=MINMATCHLENGTH);
334
+ }
333
335
 
334
336
  /* look backward */
335
- do {
336
- length = ZDICT_count(b + pos, b + suffix[start-1]);
337
- if (length >= LLIMIT) length = LLIMIT-1;
338
- lengthList[length]++;
339
- if (length >=MINMATCHLENGTH) start--;
340
- } while(length >= MINMATCHLENGTH);
337
+ { size_t length = MINMATCHLENGTH;
338
+ while ((length >= MINMATCHLENGTH) & (start > 0)) {
339
+ length = ZDICT_count(b + pos, b + suffix[start - 1]);
340
+ if (length >= LLIMIT) length = LLIMIT - 1;
341
+ lengthList[length]++;
342
+ if (length >= MINMATCHLENGTH) start--;
343
+ }
344
+ }
341
345
 
342
346
  /* largest useful length */
343
347
  memset(cumulLength, 0, sizeof(cumulLength));
@@ -361,8 +365,8 @@ static dictItem ZDICT_analyzePos(
361
365
  for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
362
366
  savings[i] = savings[i-1] + (lengthList[i] * (i-3));
363
367
 
364
- DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
365
- (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
368
+ DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
369
+ (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / maxLength);
366
370
 
367
371
  solution.pos = (U32)pos;
368
372
  solution.length = (U32)maxLength;
@@ -371,12 +375,12 @@ static dictItem ZDICT_analyzePos(
371
375
  /* mark positions done */
372
376
  { U32 id;
373
377
  for (id=start; id<end; id++) {
374
- U32 p, pEnd;
378
+ U32 p, pEnd, length;
375
379
  U32 const testedPos = suffix[id];
376
380
  if (testedPos == pos)
377
381
  length = solution.length;
378
382
  else {
379
- length = ZDICT_count(b+pos, b+testedPos);
383
+ length = (U32)ZDICT_count(b+pos, b+testedPos);
380
384
  if (length > solution.length) length = solution.length;
381
385
  }
382
386
  pEnd = (U32)(testedPos + length);
@@ -388,28 +392,43 @@ static dictItem ZDICT_analyzePos(
388
392
  }
389
393
 
390
394
 
391
- /*! ZDICT_checkMerge
395
+ static int isIncluded(const void* in, const void* container, size_t length)
396
+ {
397
+ const char* const ip = (const char*) in;
398
+ const char* const into = (const char*) container;
399
+ size_t u;
400
+
401
+ for (u=0; u<length; u++) { /* works because end of buffer is a noisy guard band */
402
+ if (ip[u] != into[u]) break;
403
+ }
404
+
405
+ return u==length;
406
+ }
407
+
408
+ /*! ZDICT_tryMerge() :
392
409
  check if dictItem can be merged, do it if possible
393
410
  @return : id of destination elt, 0 if not merged
394
411
  */
395
- static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
412
+ static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
396
413
  {
397
414
  const U32 tableSize = table->pos;
398
- const U32 max = elt.pos + (elt.length-1);
415
+ const U32 eltEnd = elt.pos + elt.length;
416
+ const char* const buf = (const char*) buffer;
399
417
 
400
418
  /* tail overlap */
401
419
  U32 u; for (u=1; u<tableSize; u++) {
402
420
  if (u==eltNbToSkip) continue;
403
- if ((table[u].pos > elt.pos) && (table[u].pos < max)) { /* overlap */
421
+ if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */
404
422
  /* append */
405
- U32 addedLength = table[u].pos - elt.pos;
423
+ U32 const addedLength = table[u].pos - elt.pos;
406
424
  table[u].length += addedLength;
407
425
  table[u].pos = elt.pos;
408
426
  table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
409
- table[u].savings += elt.length / 8; /* rough approx */
427
+ table[u].savings += elt.length / 8; /* rough approx bonus */
410
428
  elt = table[u];
429
+ /* sort : improve rank */
411
430
  while ((u>1) && (table[u-1].savings < elt.savings))
412
- table[u] = table[u-1], u--;
431
+ table[u] = table[u-1], u--;
413
432
  table[u] = elt;
414
433
  return u;
415
434
  } }
@@ -417,20 +436,33 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
417
436
  /* front overlap */
418
437
  for (u=1; u<tableSize; u++) {
419
438
  if (u==eltNbToSkip) continue;
420
- if ((table[u].pos + table[u].length > elt.pos) && (table[u].pos < elt.pos)) { /* overlap */
439
+
440
+ if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
421
441
  /* append */
422
- int addedLength = (elt.pos + elt.length) - (table[u].pos + table[u].length);
423
- table[u].savings += elt.length / 8; /* rough approx */
424
- if (addedLength > 0) { /* otherwise, already included */
442
+ int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
443
+ table[u].savings += elt.length / 8; /* rough approx bonus */
444
+ if (addedLength > 0) { /* otherwise, elt fully included into existing */
425
445
  table[u].length += addedLength;
426
446
  table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */
427
447
  }
448
+ /* sort : improve rank */
428
449
  elt = table[u];
429
450
  while ((u>1) && (table[u-1].savings < elt.savings))
430
451
  table[u] = table[u-1], u--;
431
452
  table[u] = elt;
432
453
  return u;
433
- } }
454
+ }
455
+
456
+ if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
457
+ if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
458
+ size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
459
+ table[u].pos = elt.pos;
460
+ table[u].savings += (U32)(elt.savings * addedLength / elt.length);
461
+ table[u].length = MIN(elt.length, table[u].length + 1);
462
+ return u;
463
+ }
464
+ }
465
+ }
434
466
 
435
467
  return 0;
436
468
  }
@@ -438,8 +470,8 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
438
470
 
439
471
  static void ZDICT_removeDictItem(dictItem* table, U32 id)
440
472
  {
441
- /* convention : first element is nb of elts */
442
- U32 const max = table->pos;
473
+ /* convention : table[0].pos stores nb of elts */
474
+ U32 const max = table[0].pos;
443
475
  U32 u;
444
476
  if (!id) return; /* protection, should never happen */
445
477
  for (u=id; u<max-1; u++)
@@ -448,14 +480,14 @@ static void ZDICT_removeDictItem(dictItem* table, U32 id)
448
480
  }
449
481
 
450
482
 
451
- static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
483
+ static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
452
484
  {
453
485
  /* merge if possible */
454
- U32 mergeId = ZDICT_checkMerge(table, elt, 0);
486
+ U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
455
487
  if (mergeId) {
456
488
  U32 newMerge = 1;
457
489
  while (newMerge) {
458
- newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
490
+ newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
459
491
  if (newMerge) ZDICT_removeDictItem(table, mergeId);
460
492
  mergeId = newMerge;
461
493
  }
@@ -486,18 +518,24 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
486
518
  }
487
519
 
488
520
 
489
- static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
521
+ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
490
522
  const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
491
523
  const size_t* fileSizes, unsigned nbFiles,
492
- U32 shiftRatio, unsigned maxDictSize)
524
+ unsigned minRatio, U32 notificationLevel)
493
525
  {
494
526
  int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
495
527
  int* const suffix = suffix0+1;
496
528
  U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
497
529
  BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
498
530
  U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
499
- U32 minRatio = nbFiles >> shiftRatio;
500
531
  size_t result = 0;
532
+ clock_t displayClock = 0;
533
+ clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
534
+
535
+ # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
536
+ if (ZDICT_clockSpan(displayClock) > refreshRate) \
537
+ { displayClock = clock(); DISPLAY(__VA_ARGS__); \
538
+ if (notificationLevel>=4) fflush(stderr); } }
501
539
 
502
540
  /* init */
503
541
  DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
@@ -509,11 +547,11 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
509
547
  memset(doneMarks, 0, bufferSize+16);
510
548
 
511
549
  /* limit sample set size (divsufsort limitation)*/
512
- if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>20));
550
+ if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20));
513
551
  while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
514
552
 
515
553
  /* sort */
516
- DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
554
+ DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20));
517
555
  { int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
518
556
  if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
519
557
  }
@@ -523,7 +561,8 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
523
561
  { size_t pos;
524
562
  for (pos=0; pos < bufferSize; pos++)
525
563
  reverseSuffix[suffix[pos]] = (U32)pos;
526
- /* build file pos */
564
+ /* note filePos tracks borders between samples.
565
+ It's not used at this stage, but planned to become useful in a later update */
527
566
  filePos[0] = 0;
528
567
  for (pos=1; pos<nbFiles; pos++)
529
568
  filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);
@@ -535,23 +574,13 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
535
574
  { U32 cursor; for (cursor=0; cursor < bufferSize; ) {
536
575
  dictItem solution;
537
576
  if (doneMarks[cursor]) { cursor++; continue; }
538
- solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
577
+ solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
539
578
  if (solution.length==0) { cursor++; continue; }
540
- ZDICT_insertDictItem(dictList, dictListSize, solution);
579
+ ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
541
580
  cursor += solution.length;
542
581
  DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
543
582
  } }
544
583
 
545
- /* limit dictionary size */
546
- { U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
547
- U32 currentSize = 0;
548
- U32 n; for (n=1; n<max; n++) {
549
- currentSize += dictList[n].length;
550
- if (currentSize > maxDictSize) break;
551
- }
552
- dictList->pos = n;
553
- }
554
-
555
584
  _cleanup:
556
585
  free(suffix0);
557
586
  free(reverseSuffix);
@@ -563,10 +592,12 @@ _cleanup:
563
592
 
564
593
  static void ZDICT_fillNoise(void* buffer, size_t length)
565
594
  {
566
- unsigned acc = PRIME1;
567
- size_t p=0;;
595
+ unsigned const prime1 = 2654435761U;
596
+ unsigned const prime2 = 2246822519U;
597
+ unsigned acc = prime1;
598
+ size_t p=0;
568
599
  for (p=0; p<length; p++) {
569
- acc *= PRIME2;
600
+ acc *= prime2;
570
601
  ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
571
602
  }
572
603
  }
@@ -574,29 +605,31 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
574
605
 
575
606
  typedef struct
576
607
  {
577
- ZSTD_CCtx* ref;
578
- ZSTD_CCtx* zc;
608
+ ZSTD_CDict* dict; /* dictionary */
609
+ ZSTD_CCtx* zc; /* working context */
579
610
  void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
580
611
  } EStats_ress_t;
581
612
 
582
613
  #define MAXREPOFFSET 1024
583
614
 
584
- static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
585
- U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
586
- const void* src, size_t srcSize)
615
+ static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
616
+ unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
617
+ const void* src, size_t srcSize,
618
+ U32 notificationLevel)
587
619
  {
588
- size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
620
+ size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
589
621
  size_t cSize;
590
622
 
591
623
  if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
592
- { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref);
593
- if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
594
- }
624
+ { size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
625
+ if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
626
+
627
+ }
595
628
  cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
596
- if (ZSTD_isError(cSize)) { DISPLAYLEVEL(1, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
629
+ if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
597
630
 
598
631
  if (cSize) { /* if == 0; block is not compressible */
599
- const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
632
+ const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
600
633
 
601
634
  /* literals stats */
602
635
  { const BYTE* bytePtr;
@@ -605,46 +638,34 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
605
638
  }
606
639
 
607
640
  /* seqStats */
608
- { size_t const nbSeq = (size_t)(seqStorePtr->offset - seqStorePtr->offsetStart);
609
- ZSTD_seqToCodes(seqStorePtr, nbSeq);
641
+ { U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
642
+ ZSTD_seqToCodes(seqStorePtr);
610
643
 
611
- { const BYTE* codePtr = seqStorePtr->offCodeStart;
612
- size_t u;
644
+ { const BYTE* codePtr = seqStorePtr->ofCode;
645
+ U32 u;
613
646
  for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
614
647
  }
615
648
 
616
- { const BYTE* codePtr = seqStorePtr->mlCodeStart;
617
- size_t u;
649
+ { const BYTE* codePtr = seqStorePtr->mlCode;
650
+ U32 u;
618
651
  for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
619
652
  }
620
653
 
621
- { const BYTE* codePtr = seqStorePtr->llCodeStart;
622
- size_t u;
654
+ { const BYTE* codePtr = seqStorePtr->llCode;
655
+ U32 u;
623
656
  for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
624
- } }
625
-
626
- /* rep offsets */
627
- { const U32* const offsetPtr = seqStorePtr->offsetStart;
628
- U32 offset1 = offsetPtr[0] - 3;
629
- U32 offset2 = offsetPtr[1] - 3;
630
- if (offset1 >= MAXREPOFFSET) offset1 = 0;
631
- if (offset2 >= MAXREPOFFSET) offset2 = 0;
632
- repOffsets[offset1] += 3;
633
- repOffsets[offset2] += 1;
634
- }
635
- }
636
- }
657
+ }
637
658
 
638
- /*
639
- static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
640
- {
641
- unsigned u;
642
- size_t max=0;
643
- for (u=0; u<nbFiles; u++)
644
- if (max < fileSizes[u]) max = fileSizes[u];
645
- return max;
659
+ if (nbSeq >= 2) { /* rep offsets */
660
+ const seqDef* const seq = seqStorePtr->sequencesStart;
661
+ U32 offset1 = seq[0].offset - 3;
662
+ U32 offset2 = seq[1].offset - 3;
663
+ if (offset1 >= MAXREPOFFSET) offset1 = 0;
664
+ if (offset2 >= MAXREPOFFSET) offset2 = 0;
665
+ repOffsets[offset1] += 3;
666
+ repOffsets[offset2] += 1;
667
+ } } }
646
668
  }
647
- */
648
669
 
649
670
  static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
650
671
  {
@@ -670,72 +691,92 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
670
691
  }
671
692
  }
672
693
 
694
+ /* ZDICT_flatLit() :
695
+ * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
696
+ * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
697
+ */
698
+ static void ZDICT_flatLit(unsigned* countLit)
699
+ {
700
+ int u;
701
+ for (u=1; u<256; u++) countLit[u] = 2;
702
+ countLit[0] = 4;
703
+ countLit[253] = 1;
704
+ countLit[254] = 1;
705
+ }
673
706
 
674
- #define OFFCODE_MAX 18 /* only applicable to first block */
707
+ #define OFFCODE_MAX 30 /* only applicable to first block */
675
708
  static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
676
- unsigned compressionLevel,
677
- const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
678
- const void* dictBuffer, size_t dictBufferSize)
709
+ unsigned compressionLevel,
710
+ const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
711
+ const void* dictBuffer, size_t dictBufferSize,
712
+ unsigned notificationLevel)
679
713
  {
680
- U32 countLit[256];
714
+ unsigned countLit[256];
681
715
  HUF_CREATE_STATIC_CTABLE(hufTable, 255);
682
- U32 offcodeCount[OFFCODE_MAX+1];
716
+ unsigned offcodeCount[OFFCODE_MAX+1];
683
717
  short offcodeNCount[OFFCODE_MAX+1];
684
- U32 matchLengthCount[MaxML+1];
718
+ U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));
719
+ unsigned matchLengthCount[MaxML+1];
685
720
  short matchLengthNCount[MaxML+1];
686
- U32 litLengthCount[MaxLL+1];
721
+ unsigned litLengthCount[MaxLL+1];
687
722
  short litLengthNCount[MaxLL+1];
688
- U32 repOffset[MAXREPOFFSET] = { 0 };
723
+ U32 repOffset[MAXREPOFFSET];
689
724
  offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
690
- EStats_ress_t esr;
725
+ EStats_ress_t esr = { NULL, NULL, NULL };
691
726
  ZSTD_parameters params;
692
- U32 u, huffLog = 12, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
727
+ U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
693
728
  size_t pos = 0, errorCode;
694
729
  size_t eSize = 0;
695
730
  size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
696
- size_t const averageSampleSize = totalSrcSize / nbFiles;
731
+ size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
697
732
  BYTE* dstPtr = (BYTE*)dstBuffer;
698
733
 
699
734
  /* init */
700
- for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */
701
- for (u=0; u<=OFFCODE_MAX; u++) offcodeCount[u]=1;
702
- for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
703
- for (u=0; u<=MaxLL; u++) litLengthCount[u]=1;
735
+ DEBUGLOG(4, "ZDICT_analyzeEntropy");
736
+ if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */
737
+ for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */
738
+ for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
739
+ for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
740
+ for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
741
+ memset(repOffset, 0, sizeof(repOffset));
704
742
  repOffset[1] = repOffset[4] = repOffset[8] = 1;
705
743
  memset(bestRepOffset, 0, sizeof(bestRepOffset));
706
- esr.ref = ZSTD_createCCtx();
744
+ if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
745
+ params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
746
+
747
+ esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
707
748
  esr.zc = ZSTD_createCCtx();
708
749
  esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
709
- if (!esr.ref || !esr.zc || !esr.workPlace) {
710
- eSize = ERROR(memory_allocation);
711
- DISPLAYLEVEL(1, "Not enough memory");
712
- goto _cleanup;
750
+ if (!esr.dict || !esr.zc || !esr.workPlace) {
751
+ eSize = ERROR(memory_allocation);
752
+ DISPLAYLEVEL(1, "Not enough memory \n");
753
+ goto _cleanup;
713
754
  }
714
- if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
715
- params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
716
- { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
717
- if (ZSTD_isError(beginResult)) {
718
- eSize = ERROR(GENERIC);
719
- DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed ");
720
- goto _cleanup;
721
- } }
722
-
723
- /* collect stats on all files */
755
+
756
+ /* collect stats on all samples */
724
757
  for (u=0; u<nbFiles; u++) {
725
- ZDICT_countEStats(esr, params,
726
- countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
727
- (const char*)srcBuffer + pos, fileSizes[u]);
758
+ ZDICT_countEStats(esr, &params,
759
+ countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
760
+ (const char*)srcBuffer + pos, fileSizes[u],
761
+ notificationLevel);
728
762
  pos += fileSizes[u];
729
763
  }
730
764
 
731
- /* analyze */
732
- errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
733
- if (HUF_isError(errorCode)) {
734
- eSize = ERROR(GENERIC);
735
- DISPLAYLEVEL(1, "HUF_buildCTable error");
736
- goto _cleanup;
765
+ /* analyze, build stats, starting with literals */
766
+ { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
767
+ if (HUF_isError(maxNbBits)) {
768
+ eSize = maxNbBits;
769
+ DISPLAYLEVEL(1, " HUF_buildCTable error \n");
770
+ goto _cleanup;
771
+ }
772
+ if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
773
+ DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
774
+ ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
775
+ maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
776
+ assert(maxNbBits==9);
777
+ }
778
+ huffLog = (U32)maxNbBits;
737
779
  }
738
- huffLog = (U32)errorCode;
739
780
 
740
781
  /* looking for most common first offsets */
741
782
  { U32 offset;
@@ -744,11 +785,11 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
744
785
  }
745
786
  /* note : the result of this phase should be used to better appreciate the impact on statistics */
746
787
 
747
- total=0; for (u=0; u<=OFFCODE_MAX; u++) total+=offcodeCount[u];
748
- errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, OFFCODE_MAX);
788
+ total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
789
+ errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
749
790
  if (FSE_isError(errorCode)) {
750
- eSize = ERROR(GENERIC);
751
- DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount");
791
+ eSize = errorCode;
792
+ DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
752
793
  goto _cleanup;
753
794
  }
754
795
  Offlog = (U32)errorCode;
@@ -756,8 +797,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
756
797
  total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
757
798
  errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
758
799
  if (FSE_isError(errorCode)) {
759
- eSize = ERROR(GENERIC);
760
- DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount");
800
+ eSize = errorCode;
801
+ DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
761
802
  goto _cleanup;
762
803
  }
763
804
  mlLog = (U32)errorCode;
@@ -765,18 +806,17 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
765
806
  total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
766
807
  errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
767
808
  if (FSE_isError(errorCode)) {
768
- eSize = ERROR(GENERIC);
769
- DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount");
809
+ eSize = errorCode;
810
+ DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
770
811
  goto _cleanup;
771
812
  }
772
813
  llLog = (U32)errorCode;
773
814
 
774
-
775
815
  /* write result to buffer */
776
816
  { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
777
817
  if (HUF_isError(hhSize)) {
778
- eSize = ERROR(GENERIC);
779
- DISPLAYLEVEL(1, "HUF_writeCTable error");
818
+ eSize = hhSize;
819
+ DISPLAYLEVEL(1, "HUF_writeCTable error \n");
780
820
  goto _cleanup;
781
821
  }
782
822
  dstPtr += hhSize;
@@ -786,8 +826,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
786
826
 
787
827
  { size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
788
828
  if (FSE_isError(ohSize)) {
789
- eSize = ERROR(GENERIC);
790
- DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount");
829
+ eSize = ohSize;
830
+ DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
791
831
  goto _cleanup;
792
832
  }
793
833
  dstPtr += ohSize;
@@ -797,8 +837,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
797
837
 
798
838
  { size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
799
839
  if (FSE_isError(mhSize)) {
800
- eSize = ERROR(GENERIC);
801
- DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount");
840
+ eSize = mhSize;
841
+ DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
802
842
  goto _cleanup;
803
843
  }
804
844
  dstPtr += mhSize;
@@ -808,8 +848,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
808
848
 
809
849
  { size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
810
850
  if (FSE_isError(lhSize)) {
811
- eSize = ERROR(GENERIC);
812
- DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount");
851
+ eSize = lhSize;
852
+ DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
813
853
  goto _cleanup;
814
854
  }
815
855
  dstPtr += lhSize;
@@ -818,8 +858,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
818
858
  }
819
859
 
820
860
  if (maxDstSize<12) {
821
- eSize = ERROR(GENERIC);
822
- DISPLAYLEVEL(1, "not enough space to write RepOffsets");
861
+ eSize = ERROR(dstSize_tooSmall);
862
+ DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
823
863
  goto _cleanup;
824
864
  }
825
865
  # if 0
@@ -833,11 +873,10 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
833
873
  MEM_writeLE32(dstPtr+4, repStartValue[1]);
834
874
  MEM_writeLE32(dstPtr+8, repStartValue[2]);
835
875
  #endif
836
- dstPtr += 12;
837
876
  eSize += 12;
838
877
 
839
878
  _cleanup:
840
- ZSTD_freeCCtx(esr.ref);
879
+ ZSTD_freeCDict(esr.dict);
841
880
  ZSTD_freeCCtx(esr.zc);
842
881
  free(esr.workPlace);
843
882
 
@@ -845,129 +884,180 @@ _cleanup:
845
884
  }
846
885
 
847
886
 
848
- #define DIB_FASTSEGMENTSIZE 64
849
- /*! ZDICT_fastSampling() (based on an idea proposed by Giuseppe Ottaviano) :
850
- Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer`,
851
- up to `dictSize`.
852
- Filling starts from the end of `dictBuffer`, down to maximum possible.
853
- if `dictSize` is not a multiply of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used.
854
- @return : amount of data written into `dictBuffer`,
855
- or an error code
856
- */
857
- static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize,
858
- const void* samplesBuffer, size_t samplesSize)
859
- {
860
- char* dstPtr = (char*)dictBuffer + dictSize;
861
- const char* srcPtr = (const char*)samplesBuffer;
862
- size_t const nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
863
- size_t segNb, interSize;
864
-
865
- if (nbSegments <= 2) return ERROR(srcSize_wrong);
866
- if (samplesSize < dictSize) return ERROR(srcSize_wrong);
867
-
868
- /* first and last segments are part of dictionary, in case they contain interesting header/footer */
869
- dstPtr -= DIB_FASTSEGMENTSIZE;
870
- memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
871
- dstPtr -= DIB_FASTSEGMENTSIZE;
872
- memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
873
-
874
- /* regularly copy a segment */
875
- interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);
876
- srcPtr += DIB_FASTSEGMENTSIZE;
877
- for (segNb=2; segNb < nbSegments; segNb++) {
878
- srcPtr += interSize;
879
- dstPtr -= DIB_FASTSEGMENTSIZE;
880
- memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
881
- srcPtr += DIB_FASTSEGMENTSIZE;
882
- }
883
-
884
- return nbSegments * DIB_FASTSEGMENTSIZE;
885
- }
886
887
 
887
- size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
888
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
889
- ZDICT_params_t params)
888
+ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
889
+ const void* customDictContent, size_t dictContentSize,
890
+ const void* samplesBuffer, const size_t* samplesSizes,
891
+ unsigned nbSamples, ZDICT_params_t params)
890
892
  {
891
893
  size_t hSize;
892
- unsigned const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
894
+ #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
895
+ BYTE header[HBUFFSIZE];
896
+ int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
897
+ U32 const notificationLevel = params.notificationLevel;
898
+
899
+ /* check conditions */
900
+ DEBUGLOG(4, "ZDICT_finalizeDictionary");
901
+ if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
902
+ if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
903
+ if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
893
904
 
894
905
  /* dictionary header */
895
- MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
896
- { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
906
+ MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
907
+ { U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
897
908
  U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
898
909
  U32 const dictID = params.dictID ? params.dictID : compliantID;
899
- MEM_writeLE32((char*)dictBuffer+4, dictID);
910
+ MEM_writeLE32(header+4, dictID);
900
911
  }
901
912
  hSize = 8;
902
913
 
903
914
  /* entropy tables */
904
915
  DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
905
916
  DISPLAYLEVEL(2, "statistics ... \n");
906
- hSize += ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
917
+ { size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize,
918
+ compressionLevel,
919
+ samplesBuffer, samplesSizes, nbSamples,
920
+ customDictContent, dictContentSize,
921
+ notificationLevel);
922
+ if (ZDICT_isError(eSize)) return eSize;
923
+ hSize += eSize;
924
+ }
925
+
926
+ /* copy elements in final buffer ; note : src and dst buffer can overlap */
927
+ if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize;
928
+ { size_t const dictSize = hSize + dictContentSize;
929
+ char* dictEnd = (char*)dictBuffer + dictSize;
930
+ memmove(dictEnd - dictContentSize, customDictContent, dictContentSize);
931
+ memcpy(dictBuffer, header, hSize);
932
+ return dictSize;
933
+ }
934
+ }
935
+
936
+
937
+ static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
938
+ void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
939
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
940
+ ZDICT_params_t params)
941
+ {
942
+ int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
943
+ U32 const notificationLevel = params.notificationLevel;
944
+ size_t hSize = 8;
945
+
946
+ /* calculate entropy tables */
947
+ DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
948
+ DISPLAYLEVEL(2, "statistics ... \n");
949
+ { size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
907
950
  compressionLevel,
908
951
  samplesBuffer, samplesSizes, nbSamples,
909
- (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
952
+ (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,
953
+ notificationLevel);
954
+ if (ZDICT_isError(eSize)) return eSize;
955
+ hSize += eSize;
956
+ }
957
+
958
+ /* add dictionary header (after entropy tables) */
959
+ MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
960
+ { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
961
+ U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
962
+ U32 const dictID = params.dictID ? params.dictID : compliantID;
963
+ MEM_writeLE32((char*)dictBuffer+4, dictID);
964
+ }
910
965
 
911
966
  if (hSize + dictContentSize < dictBufferCapacity)
912
967
  memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
913
968
  return MIN(dictBufferCapacity, hSize+dictContentSize);
914
969
  }
915
970
 
916
-
917
- #define DIB_MINSAMPLESSIZE (DIB_FASTSEGMENTSIZE*3)
918
- /*! ZDICT_trainFromBuffer_unsafe() :
919
- * `samplesBuffer` must be followed by noisy guard band.
920
- * @return : size of dictionary.
971
+ /* Hidden declaration for dibio.c */
972
+ size_t ZDICT_trainFromBuffer_unsafe_legacy(
973
+ void* dictBuffer, size_t maxDictSize,
974
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
975
+ ZDICT_legacy_params_t params);
976
+ /*! ZDICT_trainFromBuffer_unsafe_legacy() :
977
+ * Warning : `samplesBuffer` must be followed by noisy guard band.
978
+ * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
921
979
  */
922
- size_t ZDICT_trainFromBuffer_unsafe(
980
+ size_t ZDICT_trainFromBuffer_unsafe_legacy(
923
981
  void* dictBuffer, size_t maxDictSize,
924
982
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
925
- ZDICT_params_t params)
983
+ ZDICT_legacy_params_t params)
926
984
  {
927
- U32 const dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16));
985
+ U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
928
986
  dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
929
- unsigned selectivity = params.selectivityLevel;
987
+ unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
988
+ unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
930
989
  size_t const targetDictSize = maxDictSize;
931
- size_t sBuffSize;
990
+ size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
932
991
  size_t dictSize = 0;
992
+ U32 const notificationLevel = params.zParams.notificationLevel;
933
993
 
934
994
  /* checks */
935
995
  if (!dictList) return ERROR(memory_allocation);
936
- if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
996
+ if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */
997
+ if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */
937
998
 
938
999
  /* init */
939
- { unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
940
- if (sBuffSize < DIB_MINSAMPLESSIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
941
1000
  ZDICT_initDictItem(dictList);
942
- g_displayLevel = params.notificationLevel;
943
- if (selectivity==0) selectivity = g_selectivity_default;
944
1001
 
945
1002
  /* build dictionary */
946
- if (selectivity>1) { /* selectivity == 1 => fast mode */
947
- ZDICT_trainBuffer(dictList, dictListSize,
948
- samplesBuffer, sBuffSize,
949
- samplesSizes, nbSamples,
950
- selectivity, (U32)targetDictSize);
951
-
952
- /* display best matches */
953
- if (g_displayLevel>= 3) {
954
- U32 const nb = 25;
955
- U32 const dictContentSize = ZDICT_dictSize(dictList);
956
- U32 u;
957
- DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
958
- DISPLAYLEVEL(3, "list %u best segments \n", nb);
959
- for (u=1; u<=nb; u++) {
960
- U32 p = dictList[u].pos;
961
- U32 l = dictList[u].length;
962
- U32 d = MIN(40, l);
963
- DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
964
- u, l, p, dictList[u].savings);
965
- ZDICT_printHex(3, (const char*)samplesBuffer+p, d);
966
- DISPLAYLEVEL(3, "| \n");
967
- } } }
1003
+ ZDICT_trainBuffer_legacy(dictList, dictListSize,
1004
+ samplesBuffer, samplesBuffSize,
1005
+ samplesSizes, nbSamples,
1006
+ minRep, notificationLevel);
1007
+
1008
+ /* display best matches */
1009
+ if (params.zParams.notificationLevel>= 3) {
1010
+ unsigned const nb = MIN(25, dictList[0].pos);
1011
+ unsigned const dictContentSize = ZDICT_dictSize(dictList);
1012
+ unsigned u;
1013
+ DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize);
1014
+ DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
1015
+ for (u=1; u<nb; u++) {
1016
+ unsigned const pos = dictList[u].pos;
1017
+ unsigned const length = dictList[u].length;
1018
+ U32 const printedLength = MIN(40, length);
1019
+ if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
1020
+ free(dictList);
1021
+ return ERROR(GENERIC); /* should never happen */
1022
+ }
1023
+ DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
1024
+ u, length, pos, (unsigned)dictList[u].savings);
1025
+ ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
1026
+ DISPLAYLEVEL(3, "| \n");
1027
+ } }
1028
+
968
1029
 
969
1030
  /* create dictionary */
970
- { U32 dictContentSize = ZDICT_dictSize(dictList);
1031
+ { unsigned dictContentSize = ZDICT_dictSize(dictList);
1032
+ if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */
1033
+ if (dictContentSize < targetDictSize/4) {
1034
+ DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize);
1035
+ if (samplesBuffSize < 10 * targetDictSize)
1036
+ DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20));
1037
+ if (minRep > MINRATIO) {
1038
+ DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
1039
+ DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
1040
+ }
1041
+ }
1042
+
1043
+ if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
1044
+ unsigned proposedSelectivity = selectivity-1;
1045
+ while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
1046
+ DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize);
1047
+ DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
1048
+ DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");
1049
+ }
1050
+
1051
+ /* limit dictionary size */
1052
+ { U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
1053
+ U32 currentSize = 0;
1054
+ U32 n; for (n=1; n<max; n++) {
1055
+ currentSize += dictList[n].length;
1056
+ if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }
1057
+ }
1058
+ dictList->pos = n;
1059
+ dictContentSize = currentSize;
1060
+ }
971
1061
 
972
1062
  /* build dict content */
973
1063
  { U32 u;
@@ -979,17 +1069,9 @@ size_t ZDICT_trainFromBuffer_unsafe(
979
1069
  memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
980
1070
  } }
981
1071
 
982
- /* fast mode dict content */
983
- if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */
984
- DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */
985
- DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10));
986
- dictContentSize = (U32)ZDICT_fastSampling(dictBuffer, targetDictSize,
987
- samplesBuffer, sBuffSize);
988
- }
989
-
990
1072
  dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
991
1073
  samplesBuffer, samplesSizes, nbSamples,
992
- params);
1074
+ params.zParams);
993
1075
  }
994
1076
 
995
1077
  /* clean up */
@@ -998,44 +1080,52 @@ size_t ZDICT_trainFromBuffer_unsafe(
998
1080
  }
999
1081
 
1000
1082
 
1001
- /* issue : samplesBuffer need to be followed by a noisy guard band.
1002
- * work around : duplicate the buffer, and add the noise */
1003
- size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
1004
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
1005
- ZDICT_params_t params)
1083
+ /* ZDICT_trainFromBuffer_legacy() :
1084
+ * issue : samplesBuffer needs to be followed by a noisy guard band.
1085
+ * work around : duplicate the buffer, and add the noise */
1086
+ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
1087
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
1088
+ ZDICT_legacy_params_t params)
1006
1089
  {
1090
+ size_t result;
1007
1091
  void* newBuff;
1008
- size_t sBuffSize;
1092
+ size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
1093
+ if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */
1009
1094
 
1010
- { unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
1011
- if (sBuffSize==0) return 0; /* empty content => no dictionary */
1012
1095
  newBuff = malloc(sBuffSize + NOISELENGTH);
1013
1096
  if (!newBuff) return ERROR(memory_allocation);
1014
1097
 
1015
1098
  memcpy(newBuff, samplesBuffer, sBuffSize);
1016
1099
  ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
1017
1100
 
1018
- { size_t const result = ZDICT_trainFromBuffer_unsafe(
1019
- dictBuffer, dictBufferCapacity,
1020
- newBuff, samplesSizes, nbSamples,
1021
- params);
1022
- free(newBuff);
1023
- return result; }
1101
+ result =
1102
+ ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
1103
+ samplesSizes, nbSamples, params);
1104
+ free(newBuff);
1105
+ return result;
1024
1106
  }
1025
1107
 
1026
1108
 
1027
1109
  size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
1028
1110
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1029
1111
  {
1030
- ZDICT_params_t params;
1112
+ ZDICT_fastCover_params_t params;
1113
+ DEBUGLOG(3, "ZDICT_trainFromBuffer");
1031
1114
  memset(&params, 0, sizeof(params));
1032
- return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
1033
- samplesBuffer, samplesSizes, nbSamples,
1034
- params);
1115
+ params.d = 8;
1116
+ params.steps = 4;
1117
+ /* Default to level 3 since no compression level information is available */
1118
+ params.zParams.compressionLevel = 3;
1119
+ #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
1120
+ params.zParams.notificationLevel = DEBUGLEVEL;
1121
+ #endif
1122
+ return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
1123
+ samplesBuffer, samplesSizes, nbSamples,
1124
+ &params);
1035
1125
  }
1036
1126
 
1037
1127
  size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
1038
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1128
+ const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1039
1129
  {
1040
1130
  ZDICT_params_t params;
1041
1131
  memset(&params, 0, sizeof(params));