zstd-ruby 1.4.4.0 → 1.5.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115)
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/README.md +78 -5
  4. data/Rakefile +8 -2
  5. data/ext/zstdruby/common.h +15 -0
  6. data/ext/zstdruby/extconf.rb +3 -2
  7. data/ext/zstdruby/libzstd/common/allocations.h +55 -0
  8. data/ext/zstdruby/libzstd/common/bits.h +200 -0
  9. data/ext/zstdruby/libzstd/common/bitstream.h +74 -97
  10. data/ext/zstdruby/libzstd/common/compiler.h +219 -20
  11. data/ext/zstdruby/libzstd/common/cpu.h +1 -3
  12. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  13. data/ext/zstdruby/libzstd/common/debug.h +22 -49
  14. data/ext/zstdruby/libzstd/common/entropy_common.c +184 -80
  15. data/ext/zstdruby/libzstd/common/error_private.c +11 -2
  16. data/ext/zstdruby/libzstd/common/error_private.h +87 -4
  17. data/ext/zstdruby/libzstd/common/fse.h +47 -116
  18. data/ext/zstdruby/libzstd/common/fse_decompress.c +127 -127
  19. data/ext/zstdruby/libzstd/common/huf.h +112 -197
  20. data/ext/zstdruby/libzstd/common/mem.h +124 -142
  21. data/ext/zstdruby/libzstd/common/pool.c +54 -27
  22. data/ext/zstdruby/libzstd/common/pool.h +11 -5
  23. data/ext/zstdruby/libzstd/common/portability_macros.h +156 -0
  24. data/ext/zstdruby/libzstd/common/threading.c +78 -22
  25. data/ext/zstdruby/libzstd/common/threading.h +9 -13
  26. data/ext/zstdruby/libzstd/common/xxhash.c +15 -873
  27. data/ext/zstdruby/libzstd/common/xxhash.h +5572 -191
  28. data/ext/zstdruby/libzstd/common/zstd_common.c +2 -37
  29. data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
  30. data/ext/zstdruby/libzstd/common/zstd_internal.h +186 -144
  31. data/ext/zstdruby/libzstd/common/zstd_trace.h +163 -0
  32. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  33. data/ext/zstdruby/libzstd/compress/fse_compress.c +99 -196
  34. data/ext/zstdruby/libzstd/compress/hist.c +41 -63
  35. data/ext/zstdruby/libzstd/compress/hist.h +13 -33
  36. data/ext/zstdruby/libzstd/compress/huf_compress.c +968 -331
  37. data/ext/zstdruby/libzstd/compress/zstd_compress.c +4120 -1191
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +688 -159
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +121 -40
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +16 -6
  41. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +62 -35
  42. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +10 -3
  43. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +577 -0
  44. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  45. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +322 -115
  46. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +394 -154
  47. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +4 -3
  48. data/ext/zstdruby/libzstd/compress/zstd_fast.c +729 -253
  49. data/ext/zstdruby/libzstd/compress/zstd_fast.h +4 -3
  50. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1289 -247
  51. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +61 -1
  52. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +339 -212
  53. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +15 -3
  54. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +106 -0
  55. data/ext/zstdruby/libzstd/compress/zstd_opt.c +508 -282
  56. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  57. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +217 -466
  58. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +35 -114
  59. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +1220 -572
  60. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +576 -0
  61. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +23 -19
  62. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +3 -3
  63. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +859 -273
  64. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +1244 -375
  65. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +21 -7
  66. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +74 -11
  67. data/ext/zstdruby/libzstd/dictBuilder/cover.c +75 -54
  68. data/ext/zstdruby/libzstd/dictBuilder/cover.h +20 -9
  69. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
  70. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +55 -36
  71. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +126 -110
  72. data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +248 -56
  73. data/ext/zstdruby/libzstd/zstd.h +1277 -306
  74. data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +29 -8
  75. data/ext/zstdruby/main.c +20 -0
  76. data/ext/zstdruby/skippable_frame.c +63 -0
  77. data/ext/zstdruby/streaming_compress.c +177 -0
  78. data/ext/zstdruby/streaming_compress.h +5 -0
  79. data/ext/zstdruby/streaming_decompress.c +123 -0
  80. data/ext/zstdruby/zstdruby.c +114 -32
  81. data/lib/zstd-ruby/version.rb +1 -1
  82. data/lib/zstd-ruby.rb +0 -1
  83. data/zstd-ruby.gemspec +1 -1
  84. metadata +24 -39
  85. data/.travis.yml +0 -14
  86. data/ext/zstdruby/libzstd/.gitignore +0 -3
  87. data/ext/zstdruby/libzstd/BUCK +0 -234
  88. data/ext/zstdruby/libzstd/Makefile +0 -289
  89. data/ext/zstdruby/libzstd/README.md +0 -159
  90. data/ext/zstdruby/libzstd/deprecated/zbuff.h +0 -214
  91. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +0 -26
  92. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +0 -147
  93. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +0 -75
  94. data/ext/zstdruby/libzstd/dll/example/Makefile +0 -47
  95. data/ext/zstdruby/libzstd/dll/example/README.md +0 -69
  96. data/ext/zstdruby/libzstd/dll/example/build_package.bat +0 -20
  97. data/ext/zstdruby/libzstd/dll/example/fullbench-dll.sln +0 -25
  98. data/ext/zstdruby/libzstd/dll/example/fullbench-dll.vcxproj +0 -181
  99. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +0 -415
  100. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +0 -2152
  101. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +0 -94
  102. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +0 -3514
  103. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +0 -93
  104. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +0 -3156
  105. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +0 -93
  106. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +0 -3641
  107. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +0 -142
  108. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +0 -4046
  109. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +0 -162
  110. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +0 -4150
  111. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +0 -172
  112. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +0 -4533
  113. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +0 -187
  114. data/ext/zstdruby/libzstd/libzstd.pc.in +0 -15
  115. data/ext/zstdruby/zstdruby.h +0 -6
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -23,9 +23,13 @@
23
23
  /* Unix Large Files support (>4GB) */
24
24
  #define _FILE_OFFSET_BITS 64
25
25
  #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
26
+ # ifndef _LARGEFILE_SOURCE
26
27
  # define _LARGEFILE_SOURCE
28
+ # endif
27
29
  #elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
30
+ # ifndef _LARGEFILE64_SOURCE
28
31
  # define _LARGEFILE64_SOURCE
32
+ # endif
29
33
  #endif
30
34
 
31
35
 
@@ -37,17 +41,19 @@
37
41
  #include <stdio.h> /* fprintf, fopen, ftello64 */
38
42
  #include <time.h> /* clock */
39
43
 
40
- #include "mem.h" /* read */
41
- #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
42
- #define HUF_STATIC_LINKING_ONLY
43
- #include "huf.h" /* HUF_buildCTable, HUF_writeCTable */
44
- #include "zstd_internal.h" /* includes zstd.h */
45
- #include "xxhash.h" /* XXH64 */
46
- #include "divsufsort.h"
47
44
  #ifndef ZDICT_STATIC_LINKING_ONLY
48
45
  # define ZDICT_STATIC_LINKING_ONLY
49
46
  #endif
50
- #include "zdict.h"
47
+
48
+ #include "../common/mem.h" /* read */
49
+ #include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */
50
+ #include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */
51
+ #include "../common/zstd_internal.h" /* includes zstd.h */
52
+ #include "../common/xxhash.h" /* XXH64 */
53
+ #include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
54
+ #include "../zdict.h"
55
+ #include "divsufsort.h"
56
+ #include "../common/bits.h" /* ZSTD_NbCommonBytes */
51
57
 
52
58
 
53
59
  /*-*************************************
@@ -61,14 +67,15 @@
61
67
 
62
68
  #define NOISELENGTH 32
63
69
 
64
- static const int g_compressionLevel_default = 3;
65
70
  static const U32 g_selectivity_default = 9;
66
71
 
67
72
 
68
73
  /*-*************************************
69
74
  * Console display
70
75
  ***************************************/
76
+ #undef DISPLAY
71
77
  #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
78
+ #undef DISPLAYLEVEL
72
79
  #define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
73
80
 
74
81
  static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
@@ -99,69 +106,30 @@ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
99
106
  return MEM_readLE32((const char*)dictBuffer + 4);
100
107
  }
101
108
 
102
-
103
- /*-********************************************************
104
- * Dictionary training functions
105
- **********************************************************/
106
- static unsigned ZDICT_NbCommonBytes (size_t val)
109
+ size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
107
110
  {
108
- if (MEM_isLittleEndian()) {
109
- if (MEM_64bits()) {
110
- # if defined(_MSC_VER) && defined(_WIN64)
111
- unsigned long r = 0;
112
- _BitScanForward64( &r, (U64)val );
113
- return (unsigned)(r>>3);
114
- # elif defined(__GNUC__) && (__GNUC__ >= 3)
115
- return (__builtin_ctzll((U64)val) >> 3);
116
- # else
117
- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
118
- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
119
- # endif
120
- } else { /* 32 bits */
121
- # if defined(_MSC_VER)
122
- unsigned long r=0;
123
- _BitScanForward( &r, (U32)val );
124
- return (unsigned)(r>>3);
125
- # elif defined(__GNUC__) && (__GNUC__ >= 3)
126
- return (__builtin_ctz((U32)val) >> 3);
127
- # else
128
- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
129
- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
130
- # endif
111
+ size_t headerSize;
112
+ if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
113
+
114
+ { ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
115
+ U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
116
+ if (!bs || !wksp) {
117
+ headerSize = ERROR(memory_allocation);
118
+ } else {
119
+ ZSTD_reset_compressedBlockState(bs);
120
+ headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize);
131
121
  }
132
- } else { /* Big Endian CPU */
133
- if (MEM_64bits()) {
134
- # if defined(_MSC_VER) && defined(_WIN64)
135
- unsigned long r = 0;
136
- _BitScanReverse64( &r, val );
137
- return (unsigned)(r>>3);
138
- # elif defined(__GNUC__) && (__GNUC__ >= 3)
139
- return (__builtin_clzll(val) >> 3);
140
- # else
141
- unsigned r;
142
- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
143
- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
144
- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
145
- r += (!val);
146
- return r;
147
- # endif
148
- } else { /* 32 bits */
149
- # if defined(_MSC_VER)
150
- unsigned long r = 0;
151
- _BitScanReverse( &r, (unsigned long)val );
152
- return (unsigned)(r>>3);
153
- # elif defined(__GNUC__) && (__GNUC__ >= 3)
154
- return (__builtin_clz((U32)val) >> 3);
155
- # else
156
- unsigned r;
157
- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
158
- r += (!val);
159
- return r;
160
- # endif
161
- } }
162
- }
163
122
 
123
+ free(bs);
124
+ free(wksp);
125
+ }
164
126
 
127
+ return headerSize;
128
+ }
129
+
130
+ /*-********************************************************
131
+ * Dictionary training functions
132
+ **********************************************************/
165
133
  /*! ZDICT_count() :
166
134
  Count the nb of common bytes between 2 pointers.
167
135
  Note : this function presumes end of buffer followed by noisy guard band.
@@ -176,7 +144,7 @@ static size_t ZDICT_count(const void* pIn, const void* pMatch)
176
144
  pMatch = (const char*)pMatch+sizeof(size_t);
177
145
  continue;
178
146
  }
179
- pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);
147
+ pIn = (const char*)pIn+ZSTD_NbCommonBytes(diff);
180
148
  return (size_t)((const char*)pIn - pStart);
181
149
  }
182
150
  }
@@ -208,7 +176,7 @@ static dictItem ZDICT_analyzePos(
208
176
  U32 savings[LLIMIT] = {0};
209
177
  const BYTE* b = (const BYTE*)buffer;
210
178
  size_t maxLength = LLIMIT;
211
- size_t pos = suffix[start];
179
+ size_t pos = (size_t)suffix[start];
212
180
  U32 end = start;
213
181
  dictItem solution;
214
182
 
@@ -342,7 +310,7 @@ static dictItem ZDICT_analyzePos(
342
310
  savings[i] = savings[i-1] + (lengthList[i] * (i-3));
343
311
 
344
312
  DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
345
- (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / maxLength);
313
+ (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / (double)maxLength);
346
314
 
347
315
  solution.pos = (U32)pos;
348
316
  solution.length = (U32)maxLength;
@@ -352,7 +320,7 @@ static dictItem ZDICT_analyzePos(
352
320
  { U32 id;
353
321
  for (id=start; id<end; id++) {
354
322
  U32 p, pEnd, length;
355
- U32 const testedPos = suffix[id];
323
+ U32 const testedPos = (U32)suffix[id];
356
324
  if (testedPos == pos)
357
325
  length = solution.length;
358
326
  else {
@@ -404,7 +372,7 @@ static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const
404
372
  elt = table[u];
405
373
  /* sort : improve rank */
406
374
  while ((u>1) && (table[u-1].savings < elt.savings))
407
- table[u] = table[u-1], u--;
375
+ table[u] = table[u-1], u--;
408
376
  table[u] = elt;
409
377
  return u;
410
378
  } }
@@ -415,7 +383,7 @@ static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const
415
383
 
416
384
  if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
417
385
  /* append */
418
- int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
386
+ int const addedLength = (int)eltEnd - (int)(table[u].pos + table[u].length);
419
387
  table[u].savings += elt.length / 8; /* rough approx bonus */
420
388
  if (addedLength > 0) { /* otherwise, elt fully included into existing */
421
389
  table[u].length += addedLength;
@@ -508,6 +476,7 @@ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
508
476
  clock_t displayClock = 0;
509
477
  clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
510
478
 
479
+ # undef DISPLAYUPDATE
511
480
  # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
512
481
  if (ZDICT_clockSpan(displayClock) > refreshRate) \
513
482
  { displayClock = clock(); DISPLAY(__VA_ARGS__); \
@@ -554,7 +523,7 @@ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
554
523
  if (solution.length==0) { cursor++; continue; }
555
524
  ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
556
525
  cursor += solution.length;
557
- DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
526
+ DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / (double)bufferSize * 100.0);
558
527
  } }
559
528
 
560
529
  _cleanup:
@@ -588,20 +557,20 @@ typedef struct
588
557
 
589
558
  #define MAXREPOFFSET 1024
590
559
 
591
- static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
560
+ static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
592
561
  unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
593
562
  const void* src, size_t srcSize,
594
563
  U32 notificationLevel)
595
564
  {
596
- size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
565
+ size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
597
566
  size_t cSize;
598
567
 
599
568
  if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
600
- { size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
569
+ { size_t const errorCode = ZSTD_compressBegin_usingCDict_deprecated(esr.zc, esr.dict);
601
570
  if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
602
571
 
603
572
  }
604
- cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
573
+ cSize = ZSTD_compressBlock_deprecated(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
605
574
  if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
606
575
 
607
576
  if (cSize) { /* if == 0; block is not compressible */
@@ -634,8 +603,8 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
634
603
 
635
604
  if (nbSeq >= 2) { /* rep offsets */
636
605
  const seqDef* const seq = seqStorePtr->sequencesStart;
637
- U32 offset1 = seq[0].offset - 3;
638
- U32 offset2 = seq[1].offset - 3;
606
+ U32 offset1 = seq[0].offBase - ZSTD_REP_NUM;
607
+ U32 offset2 = seq[1].offBase - ZSTD_REP_NUM;
639
608
  if (offset1 >= MAXREPOFFSET) offset1 = 0;
640
609
  if (offset2 >= MAXREPOFFSET) offset2 = 0;
641
610
  repOffsets[offset1] += 3;
@@ -682,7 +651,7 @@ static void ZDICT_flatLit(unsigned* countLit)
682
651
 
683
652
  #define OFFCODE_MAX 30 /* only applicable to first block */
684
653
  static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
685
- unsigned compressionLevel,
654
+ int compressionLevel,
686
655
  const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
687
656
  const void* dictBuffer, size_t dictBufferSize,
688
657
  unsigned notificationLevel)
@@ -706,6 +675,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
706
675
  size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
707
676
  size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
708
677
  BYTE* dstPtr = (BYTE*)dstBuffer;
678
+ U32 wksp[HUF_CTABLE_WORKSPACE_SIZE_U32];
709
679
 
710
680
  /* init */
711
681
  DEBUGLOG(4, "ZDICT_analyzeEntropy");
@@ -717,7 +687,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
717
687
  memset(repOffset, 0, sizeof(repOffset));
718
688
  repOffset[1] = repOffset[4] = repOffset[8] = 1;
719
689
  memset(bestRepOffset, 0, sizeof(bestRepOffset));
720
- if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
690
+ if (compressionLevel==0) compressionLevel = ZSTD_CLEVEL_DEFAULT;
721
691
  params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
722
692
 
723
693
  esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
@@ -731,15 +701,22 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
731
701
 
732
702
  /* collect stats on all samples */
733
703
  for (u=0; u<nbFiles; u++) {
734
- ZDICT_countEStats(esr, params,
704
+ ZDICT_countEStats(esr, &params,
735
705
  countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
736
706
  (const char*)srcBuffer + pos, fileSizes[u],
737
707
  notificationLevel);
738
708
  pos += fileSizes[u];
739
709
  }
740
710
 
711
+ if (notificationLevel >= 4) {
712
+ /* writeStats */
713
+ DISPLAYLEVEL(4, "Offset Code Frequencies : \n");
714
+ for (u=0; u<=offcodeMax; u++) {
715
+ DISPLAYLEVEL(4, "%2u :%7u \n", u, offcodeCount[u]);
716
+ } }
717
+
741
718
  /* analyze, build stats, starting with literals */
742
- { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
719
+ { size_t maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp));
743
720
  if (HUF_isError(maxNbBits)) {
744
721
  eSize = maxNbBits;
745
722
  DISPLAYLEVEL(1, " HUF_buildCTable error \n");
@@ -748,7 +725,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
748
725
  if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
749
726
  DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
750
727
  ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
751
- maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
728
+ maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp));
752
729
  assert(maxNbBits==9);
753
730
  }
754
731
  huffLog = (U32)maxNbBits;
@@ -762,7 +739,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
762
739
  /* note : the result of this phase should be used to better appreciate the impact on statistics */
763
740
 
764
741
  total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
765
- errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
742
+ errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax, /* useLowProbCount */ 1);
766
743
  if (FSE_isError(errorCode)) {
767
744
  eSize = errorCode;
768
745
  DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
@@ -771,7 +748,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
771
748
  Offlog = (U32)errorCode;
772
749
 
773
750
  total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
774
- errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
751
+ errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML, /* useLowProbCount */ 1);
775
752
  if (FSE_isError(errorCode)) {
776
753
  eSize = errorCode;
777
754
  DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
@@ -780,7 +757,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
780
757
  mlLog = (U32)errorCode;
781
758
 
782
759
  total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
783
- errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
760
+ errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL, /* useLowProbCount */ 1);
784
761
  if (FSE_isError(errorCode)) {
785
762
  eSize = errorCode;
786
763
  DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
@@ -789,7 +766,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
789
766
  llLog = (U32)errorCode;
790
767
 
791
768
  /* write result to buffer */
792
- { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
769
+ { size_t const hhSize = HUF_writeCTable_wksp(dstPtr, maxDstSize, hufTable, 255, huffLog, wksp, sizeof(wksp));
793
770
  if (HUF_isError(hhSize)) {
794
771
  eSize = hhSize;
795
772
  DISPLAYLEVEL(1, "HUF_writeCTable error \n");
@@ -844,7 +821,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
844
821
  MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);
845
822
  #else
846
823
  /* at this stage, we don't use the result of "most common first offset",
847
- as the impact of statistics is not properly evaluated */
824
+ * as the impact of statistics is not properly evaluated */
848
825
  MEM_writeLE32(dstPtr+0, repStartValue[0]);
849
826
  MEM_writeLE32(dstPtr+4, repStartValue[1]);
850
827
  MEM_writeLE32(dstPtr+8, repStartValue[2]);
@@ -860,6 +837,17 @@ _cleanup:
860
837
  }
861
838
 
862
839
 
840
+ /**
841
+ * @returns the maximum repcode value
842
+ */
843
+ static U32 ZDICT_maxRep(U32 const reps[ZSTD_REP_NUM])
844
+ {
845
+ U32 maxRep = reps[0];
846
+ int r;
847
+ for (r = 1; r < ZSTD_REP_NUM; ++r)
848
+ maxRep = MAX(maxRep, reps[r]);
849
+ return maxRep;
850
+ }
863
851
 
864
852
  size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
865
853
  const void* customDictContent, size_t dictContentSize,
@@ -869,13 +857,15 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
869
857
  size_t hSize;
870
858
  #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
871
859
  BYTE header[HBUFFSIZE];
872
- int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
860
+ int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
873
861
  U32 const notificationLevel = params.notificationLevel;
862
+ /* The final dictionary content must be at least as large as the largest repcode */
863
+ size_t const minContentSize = (size_t)ZDICT_maxRep(repStartValue);
864
+ size_t paddingSize;
874
865
 
875
866
  /* check conditions */
876
867
  DEBUGLOG(4, "ZDICT_finalizeDictionary");
877
868
  if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
878
- if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
879
869
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
880
870
 
881
871
  /* dictionary header */
@@ -899,12 +889,43 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
899
889
  hSize += eSize;
900
890
  }
901
891
 
902
- /* copy elements in final buffer ; note : src and dst buffer can overlap */
903
- if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize;
904
- { size_t const dictSize = hSize + dictContentSize;
905
- char* dictEnd = (char*)dictBuffer + dictSize;
906
- memmove(dictEnd - dictContentSize, customDictContent, dictContentSize);
907
- memcpy(dictBuffer, header, hSize);
892
+ /* Shrink the content size if it doesn't fit in the buffer */
893
+ if (hSize + dictContentSize > dictBufferCapacity) {
894
+ dictContentSize = dictBufferCapacity - hSize;
895
+ }
896
+
897
+ /* Pad the dictionary content with zeros if it is too small */
898
+ if (dictContentSize < minContentSize) {
899
+ RETURN_ERROR_IF(hSize + minContentSize > dictBufferCapacity, dstSize_tooSmall,
900
+ "dictBufferCapacity too small to fit max repcode");
901
+ paddingSize = minContentSize - dictContentSize;
902
+ } else {
903
+ paddingSize = 0;
904
+ }
905
+
906
+ {
907
+ size_t const dictSize = hSize + paddingSize + dictContentSize;
908
+
909
+ /* The dictionary consists of the header, optional padding, and the content.
910
+ * The padding comes before the content because the "best" position in the
911
+ * dictionary is the last byte.
912
+ */
913
+ BYTE* const outDictHeader = (BYTE*)dictBuffer;
914
+ BYTE* const outDictPadding = outDictHeader + hSize;
915
+ BYTE* const outDictContent = outDictPadding + paddingSize;
916
+
917
+ assert(dictSize <= dictBufferCapacity);
918
+ assert(outDictContent + dictContentSize == (BYTE*)dictBuffer + dictSize);
919
+
920
+ /* First copy the customDictContent into its final location.
921
+ * `customDictContent` and `dictBuffer` may overlap, so we must
922
+ * do this before any other writes into the output buffer.
923
+ * Then copy the header & padding into the output buffer.
924
+ */
925
+ memmove(outDictContent, customDictContent, dictContentSize);
926
+ memcpy(outDictHeader, header, hSize);
927
+ memset(outDictPadding, 0, paddingSize);
928
+
908
929
  return dictSize;
909
930
  }
910
931
  }
@@ -915,7 +936,7 @@ static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
915
936
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
916
937
  ZDICT_params_t params)
917
938
  {
918
- int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
939
+ int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
919
940
  U32 const notificationLevel = params.notificationLevel;
920
941
  size_t hSize = 8;
921
942
 
@@ -944,16 +965,11 @@ static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
944
965
  return MIN(dictBufferCapacity, hSize+dictContentSize);
945
966
  }
946
967
 
947
- /* Hidden declaration for dbio.c */
948
- size_t ZDICT_trainFromBuffer_unsafe_legacy(
949
- void* dictBuffer, size_t maxDictSize,
950
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
951
- ZDICT_legacy_params_t params);
952
968
  /*! ZDICT_trainFromBuffer_unsafe_legacy() :
953
- * Warning : `samplesBuffer` must be followed by noisy guard band.
969
+ * Warning : `samplesBuffer` must be followed by noisy guard band !!!
954
970
  * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
955
971
  */
956
- size_t ZDICT_trainFromBuffer_unsafe_legacy(
972
+ static size_t ZDICT_trainFromBuffer_unsafe_legacy(
957
973
  void* dictBuffer, size_t maxDictSize,
958
974
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
959
975
  ZDICT_legacy_params_t params)
@@ -1090,8 +1106,8 @@ size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
1090
1106
  memset(&params, 0, sizeof(params));
1091
1107
  params.d = 8;
1092
1108
  params.steps = 4;
1093
- /* Default to level 6 since no compression level information is available */
1094
- params.zParams.compressionLevel = 3;
1109
+ /* Use default level since no compression level information is available */
1110
+ params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
1095
1111
  #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
1096
1112
  params.zParams.notificationLevel = DEBUGLEVEL;
1097
1113
  #endif