zstd-ruby 1.4.4.0 → 1.5.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/README.md +78 -5
  4. data/Rakefile +8 -2
  5. data/ext/zstdruby/common.h +15 -0
  6. data/ext/zstdruby/extconf.rb +3 -2
  7. data/ext/zstdruby/libzstd/common/allocations.h +55 -0
  8. data/ext/zstdruby/libzstd/common/bits.h +200 -0
  9. data/ext/zstdruby/libzstd/common/bitstream.h +74 -97
  10. data/ext/zstdruby/libzstd/common/compiler.h +219 -20
  11. data/ext/zstdruby/libzstd/common/cpu.h +1 -3
  12. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  13. data/ext/zstdruby/libzstd/common/debug.h +22 -49
  14. data/ext/zstdruby/libzstd/common/entropy_common.c +184 -80
  15. data/ext/zstdruby/libzstd/common/error_private.c +11 -2
  16. data/ext/zstdruby/libzstd/common/error_private.h +87 -4
  17. data/ext/zstdruby/libzstd/common/fse.h +47 -116
  18. data/ext/zstdruby/libzstd/common/fse_decompress.c +127 -127
  19. data/ext/zstdruby/libzstd/common/huf.h +112 -197
  20. data/ext/zstdruby/libzstd/common/mem.h +124 -142
  21. data/ext/zstdruby/libzstd/common/pool.c +54 -27
  22. data/ext/zstdruby/libzstd/common/pool.h +11 -5
  23. data/ext/zstdruby/libzstd/common/portability_macros.h +156 -0
  24. data/ext/zstdruby/libzstd/common/threading.c +78 -22
  25. data/ext/zstdruby/libzstd/common/threading.h +9 -13
  26. data/ext/zstdruby/libzstd/common/xxhash.c +15 -873
  27. data/ext/zstdruby/libzstd/common/xxhash.h +5572 -191
  28. data/ext/zstdruby/libzstd/common/zstd_common.c +2 -37
  29. data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
  30. data/ext/zstdruby/libzstd/common/zstd_internal.h +186 -144
  31. data/ext/zstdruby/libzstd/common/zstd_trace.h +163 -0
  32. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  33. data/ext/zstdruby/libzstd/compress/fse_compress.c +99 -196
  34. data/ext/zstdruby/libzstd/compress/hist.c +41 -63
  35. data/ext/zstdruby/libzstd/compress/hist.h +13 -33
  36. data/ext/zstdruby/libzstd/compress/huf_compress.c +968 -331
  37. data/ext/zstdruby/libzstd/compress/zstd_compress.c +4120 -1191
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +688 -159
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +121 -40
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +16 -6
  41. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +62 -35
  42. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +10 -3
  43. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +577 -0
  44. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  45. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +322 -115
  46. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +394 -154
  47. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +4 -3
  48. data/ext/zstdruby/libzstd/compress/zstd_fast.c +729 -253
  49. data/ext/zstdruby/libzstd/compress/zstd_fast.h +4 -3
  50. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1289 -247
  51. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +61 -1
  52. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +339 -212
  53. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +15 -3
  54. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +106 -0
  55. data/ext/zstdruby/libzstd/compress/zstd_opt.c +508 -282
  56. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  57. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +217 -466
  58. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +35 -114
  59. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +1220 -572
  60. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +576 -0
  61. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +23 -19
  62. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +3 -3
  63. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +859 -273
  64. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +1244 -375
  65. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +21 -7
  66. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +74 -11
  67. data/ext/zstdruby/libzstd/dictBuilder/cover.c +75 -54
  68. data/ext/zstdruby/libzstd/dictBuilder/cover.h +20 -9
  69. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
  70. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +55 -36
  71. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +126 -110
  72. data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +248 -56
  73. data/ext/zstdruby/libzstd/zstd.h +1277 -306
  74. data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +29 -8
  75. data/ext/zstdruby/main.c +20 -0
  76. data/ext/zstdruby/skippable_frame.c +63 -0
  77. data/ext/zstdruby/streaming_compress.c +177 -0
  78. data/ext/zstdruby/streaming_compress.h +5 -0
  79. data/ext/zstdruby/streaming_decompress.c +123 -0
  80. data/ext/zstdruby/zstdruby.c +114 -32
  81. data/lib/zstd-ruby/version.rb +1 -1
  82. data/lib/zstd-ruby.rb +0 -1
  83. data/zstd-ruby.gemspec +1 -1
  84. metadata +24 -39
  85. data/.travis.yml +0 -14
  86. data/ext/zstdruby/libzstd/.gitignore +0 -3
  87. data/ext/zstdruby/libzstd/BUCK +0 -234
  88. data/ext/zstdruby/libzstd/Makefile +0 -289
  89. data/ext/zstdruby/libzstd/README.md +0 -159
  90. data/ext/zstdruby/libzstd/deprecated/zbuff.h +0 -214
  91. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +0 -26
  92. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +0 -147
  93. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +0 -75
  94. data/ext/zstdruby/libzstd/dll/example/Makefile +0 -47
  95. data/ext/zstdruby/libzstd/dll/example/README.md +0 -69
  96. data/ext/zstdruby/libzstd/dll/example/build_package.bat +0 -20
  97. data/ext/zstdruby/libzstd/dll/example/fullbench-dll.sln +0 -25
  98. data/ext/zstdruby/libzstd/dll/example/fullbench-dll.vcxproj +0 -181
  99. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +0 -415
  100. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +0 -2152
  101. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +0 -94
  102. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +0 -3514
  103. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +0 -93
  104. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +0 -3156
  105. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +0 -93
  106. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +0 -3641
  107. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +0 -142
  108. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +0 -4046
  109. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +0 -162
  110. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +0 -4150
  111. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +0 -172
  112. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +0 -4533
  113. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +0 -187
  114. data/ext/zstdruby/libzstd/libzstd.pc.in +0 -15
  115. data/ext/zstdruby/zstdruby.h +0 -6
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -23,9 +23,13 @@
23
23
  /* Unix Large Files support (>4GB) */
24
24
  #define _FILE_OFFSET_BITS 64
25
25
  #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
26
+ # ifndef _LARGEFILE_SOURCE
26
27
  # define _LARGEFILE_SOURCE
28
+ # endif
27
29
  #elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
30
+ # ifndef _LARGEFILE64_SOURCE
28
31
  # define _LARGEFILE64_SOURCE
32
+ # endif
29
33
  #endif
30
34
 
31
35
 
@@ -37,17 +41,19 @@
37
41
  #include <stdio.h> /* fprintf, fopen, ftello64 */
38
42
  #include <time.h> /* clock */
39
43
 
40
- #include "mem.h" /* read */
41
- #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
42
- #define HUF_STATIC_LINKING_ONLY
43
- #include "huf.h" /* HUF_buildCTable, HUF_writeCTable */
44
- #include "zstd_internal.h" /* includes zstd.h */
45
- #include "xxhash.h" /* XXH64 */
46
- #include "divsufsort.h"
47
44
  #ifndef ZDICT_STATIC_LINKING_ONLY
48
45
  # define ZDICT_STATIC_LINKING_ONLY
49
46
  #endif
50
- #include "zdict.h"
47
+
48
+ #include "../common/mem.h" /* read */
49
+ #include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */
50
+ #include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */
51
+ #include "../common/zstd_internal.h" /* includes zstd.h */
52
+ #include "../common/xxhash.h" /* XXH64 */
53
+ #include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
54
+ #include "../zdict.h"
55
+ #include "divsufsort.h"
56
+ #include "../common/bits.h" /* ZSTD_NbCommonBytes */
51
57
 
52
58
 
53
59
  /*-*************************************
@@ -61,14 +67,15 @@
61
67
 
62
68
  #define NOISELENGTH 32
63
69
 
64
- static const int g_compressionLevel_default = 3;
65
70
  static const U32 g_selectivity_default = 9;
66
71
 
67
72
 
68
73
  /*-*************************************
69
74
  * Console display
70
75
  ***************************************/
76
+ #undef DISPLAY
71
77
  #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
78
+ #undef DISPLAYLEVEL
72
79
  #define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
73
80
 
74
81
  static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
@@ -99,69 +106,30 @@ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
99
106
  return MEM_readLE32((const char*)dictBuffer + 4);
100
107
  }
101
108
 
102
-
103
- /*-********************************************************
104
- * Dictionary training functions
105
- **********************************************************/
106
- static unsigned ZDICT_NbCommonBytes (size_t val)
109
+ size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
107
110
  {
108
- if (MEM_isLittleEndian()) {
109
- if (MEM_64bits()) {
110
- # if defined(_MSC_VER) && defined(_WIN64)
111
- unsigned long r = 0;
112
- _BitScanForward64( &r, (U64)val );
113
- return (unsigned)(r>>3);
114
- # elif defined(__GNUC__) && (__GNUC__ >= 3)
115
- return (__builtin_ctzll((U64)val) >> 3);
116
- # else
117
- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
118
- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
119
- # endif
120
- } else { /* 32 bits */
121
- # if defined(_MSC_VER)
122
- unsigned long r=0;
123
- _BitScanForward( &r, (U32)val );
124
- return (unsigned)(r>>3);
125
- # elif defined(__GNUC__) && (__GNUC__ >= 3)
126
- return (__builtin_ctz((U32)val) >> 3);
127
- # else
128
- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
129
- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
130
- # endif
111
+ size_t headerSize;
112
+ if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
113
+
114
+ { ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
115
+ U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
116
+ if (!bs || !wksp) {
117
+ headerSize = ERROR(memory_allocation);
118
+ } else {
119
+ ZSTD_reset_compressedBlockState(bs);
120
+ headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize);
131
121
  }
132
- } else { /* Big Endian CPU */
133
- if (MEM_64bits()) {
134
- # if defined(_MSC_VER) && defined(_WIN64)
135
- unsigned long r = 0;
136
- _BitScanReverse64( &r, val );
137
- return (unsigned)(r>>3);
138
- # elif defined(__GNUC__) && (__GNUC__ >= 3)
139
- return (__builtin_clzll(val) >> 3);
140
- # else
141
- unsigned r;
142
- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
143
- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
144
- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
145
- r += (!val);
146
- return r;
147
- # endif
148
- } else { /* 32 bits */
149
- # if defined(_MSC_VER)
150
- unsigned long r = 0;
151
- _BitScanReverse( &r, (unsigned long)val );
152
- return (unsigned)(r>>3);
153
- # elif defined(__GNUC__) && (__GNUC__ >= 3)
154
- return (__builtin_clz((U32)val) >> 3);
155
- # else
156
- unsigned r;
157
- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
158
- r += (!val);
159
- return r;
160
- # endif
161
- } }
162
- }
163
122
 
123
+ free(bs);
124
+ free(wksp);
125
+ }
164
126
 
127
+ return headerSize;
128
+ }
129
+
130
+ /*-********************************************************
131
+ * Dictionary training functions
132
+ **********************************************************/
165
133
  /*! ZDICT_count() :
166
134
  Count the nb of common bytes between 2 pointers.
167
135
  Note : this function presumes end of buffer followed by noisy guard band.
@@ -176,7 +144,7 @@ static size_t ZDICT_count(const void* pIn, const void* pMatch)
176
144
  pMatch = (const char*)pMatch+sizeof(size_t);
177
145
  continue;
178
146
  }
179
- pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);
147
+ pIn = (const char*)pIn+ZSTD_NbCommonBytes(diff);
180
148
  return (size_t)((const char*)pIn - pStart);
181
149
  }
182
150
  }
@@ -208,7 +176,7 @@ static dictItem ZDICT_analyzePos(
208
176
  U32 savings[LLIMIT] = {0};
209
177
  const BYTE* b = (const BYTE*)buffer;
210
178
  size_t maxLength = LLIMIT;
211
- size_t pos = suffix[start];
179
+ size_t pos = (size_t)suffix[start];
212
180
  U32 end = start;
213
181
  dictItem solution;
214
182
 
@@ -342,7 +310,7 @@ static dictItem ZDICT_analyzePos(
342
310
  savings[i] = savings[i-1] + (lengthList[i] * (i-3));
343
311
 
344
312
  DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
345
- (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / maxLength);
313
+ (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / (double)maxLength);
346
314
 
347
315
  solution.pos = (U32)pos;
348
316
  solution.length = (U32)maxLength;
@@ -352,7 +320,7 @@ static dictItem ZDICT_analyzePos(
352
320
  { U32 id;
353
321
  for (id=start; id<end; id++) {
354
322
  U32 p, pEnd, length;
355
- U32 const testedPos = suffix[id];
323
+ U32 const testedPos = (U32)suffix[id];
356
324
  if (testedPos == pos)
357
325
  length = solution.length;
358
326
  else {
@@ -404,7 +372,7 @@ static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const
404
372
  elt = table[u];
405
373
  /* sort : improve rank */
406
374
  while ((u>1) && (table[u-1].savings < elt.savings))
407
- table[u] = table[u-1], u--;
375
+ table[u] = table[u-1], u--;
408
376
  table[u] = elt;
409
377
  return u;
410
378
  } }
@@ -415,7 +383,7 @@ static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const
415
383
 
416
384
  if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
417
385
  /* append */
418
- int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
386
+ int const addedLength = (int)eltEnd - (int)(table[u].pos + table[u].length);
419
387
  table[u].savings += elt.length / 8; /* rough approx bonus */
420
388
  if (addedLength > 0) { /* otherwise, elt fully included into existing */
421
389
  table[u].length += addedLength;
@@ -508,6 +476,7 @@ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
508
476
  clock_t displayClock = 0;
509
477
  clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
510
478
 
479
+ # undef DISPLAYUPDATE
511
480
  # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
512
481
  if (ZDICT_clockSpan(displayClock) > refreshRate) \
513
482
  { displayClock = clock(); DISPLAY(__VA_ARGS__); \
@@ -554,7 +523,7 @@ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
554
523
  if (solution.length==0) { cursor++; continue; }
555
524
  ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
556
525
  cursor += solution.length;
557
- DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
526
+ DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / (double)bufferSize * 100.0);
558
527
  } }
559
528
 
560
529
  _cleanup:
@@ -588,20 +557,20 @@ typedef struct
588
557
 
589
558
  #define MAXREPOFFSET 1024
590
559
 
591
- static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
560
+ static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
592
561
  unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
593
562
  const void* src, size_t srcSize,
594
563
  U32 notificationLevel)
595
564
  {
596
- size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
565
+ size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
597
566
  size_t cSize;
598
567
 
599
568
  if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
600
- { size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
569
+ { size_t const errorCode = ZSTD_compressBegin_usingCDict_deprecated(esr.zc, esr.dict);
601
570
  if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
602
571
 
603
572
  }
604
- cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
573
+ cSize = ZSTD_compressBlock_deprecated(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
605
574
  if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }
606
575
 
607
576
  if (cSize) { /* if == 0; block is not compressible */
@@ -634,8 +603,8 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
634
603
 
635
604
  if (nbSeq >= 2) { /* rep offsets */
636
605
  const seqDef* const seq = seqStorePtr->sequencesStart;
637
- U32 offset1 = seq[0].offset - 3;
638
- U32 offset2 = seq[1].offset - 3;
606
+ U32 offset1 = seq[0].offBase - ZSTD_REP_NUM;
607
+ U32 offset2 = seq[1].offBase - ZSTD_REP_NUM;
639
608
  if (offset1 >= MAXREPOFFSET) offset1 = 0;
640
609
  if (offset2 >= MAXREPOFFSET) offset2 = 0;
641
610
  repOffsets[offset1] += 3;
@@ -682,7 +651,7 @@ static void ZDICT_flatLit(unsigned* countLit)
682
651
 
683
652
  #define OFFCODE_MAX 30 /* only applicable to first block */
684
653
  static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
685
- unsigned compressionLevel,
654
+ int compressionLevel,
686
655
  const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
687
656
  const void* dictBuffer, size_t dictBufferSize,
688
657
  unsigned notificationLevel)
@@ -706,6 +675,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
706
675
  size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
707
676
  size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
708
677
  BYTE* dstPtr = (BYTE*)dstBuffer;
678
+ U32 wksp[HUF_CTABLE_WORKSPACE_SIZE_U32];
709
679
 
710
680
  /* init */
711
681
  DEBUGLOG(4, "ZDICT_analyzeEntropy");
@@ -717,7 +687,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
717
687
  memset(repOffset, 0, sizeof(repOffset));
718
688
  repOffset[1] = repOffset[4] = repOffset[8] = 1;
719
689
  memset(bestRepOffset, 0, sizeof(bestRepOffset));
720
- if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
690
+ if (compressionLevel==0) compressionLevel = ZSTD_CLEVEL_DEFAULT;
721
691
  params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
722
692
 
723
693
  esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
@@ -731,15 +701,22 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
731
701
 
732
702
  /* collect stats on all samples */
733
703
  for (u=0; u<nbFiles; u++) {
734
- ZDICT_countEStats(esr, params,
704
+ ZDICT_countEStats(esr, &params,
735
705
  countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
736
706
  (const char*)srcBuffer + pos, fileSizes[u],
737
707
  notificationLevel);
738
708
  pos += fileSizes[u];
739
709
  }
740
710
 
711
+ if (notificationLevel >= 4) {
712
+ /* writeStats */
713
+ DISPLAYLEVEL(4, "Offset Code Frequencies : \n");
714
+ for (u=0; u<=offcodeMax; u++) {
715
+ DISPLAYLEVEL(4, "%2u :%7u \n", u, offcodeCount[u]);
716
+ } }
717
+
741
718
  /* analyze, build stats, starting with literals */
742
- { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
719
+ { size_t maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp));
743
720
  if (HUF_isError(maxNbBits)) {
744
721
  eSize = maxNbBits;
745
722
  DISPLAYLEVEL(1, " HUF_buildCTable error \n");
@@ -748,7 +725,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
748
725
  if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
749
726
  DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
750
727
  ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
751
- maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
728
+ maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp));
752
729
  assert(maxNbBits==9);
753
730
  }
754
731
  huffLog = (U32)maxNbBits;
@@ -762,7 +739,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
762
739
  /* note : the result of this phase should be used to better appreciate the impact on statistics */
763
740
 
764
741
  total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
765
- errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
742
+ errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax, /* useLowProbCount */ 1);
766
743
  if (FSE_isError(errorCode)) {
767
744
  eSize = errorCode;
768
745
  DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
@@ -771,7 +748,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
771
748
  Offlog = (U32)errorCode;
772
749
 
773
750
  total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
774
- errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
751
+ errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML, /* useLowProbCount */ 1);
775
752
  if (FSE_isError(errorCode)) {
776
753
  eSize = errorCode;
777
754
  DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
@@ -780,7 +757,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
780
757
  mlLog = (U32)errorCode;
781
758
 
782
759
  total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
783
- errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
760
+ errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL, /* useLowProbCount */ 1);
784
761
  if (FSE_isError(errorCode)) {
785
762
  eSize = errorCode;
786
763
  DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
@@ -789,7 +766,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
789
766
  llLog = (U32)errorCode;
790
767
 
791
768
  /* write result to buffer */
792
- { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);
769
+ { size_t const hhSize = HUF_writeCTable_wksp(dstPtr, maxDstSize, hufTable, 255, huffLog, wksp, sizeof(wksp));
793
770
  if (HUF_isError(hhSize)) {
794
771
  eSize = hhSize;
795
772
  DISPLAYLEVEL(1, "HUF_writeCTable error \n");
@@ -844,7 +821,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
844
821
  MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);
845
822
  #else
846
823
  /* at this stage, we don't use the result of "most common first offset",
847
- as the impact of statistics is not properly evaluated */
824
+ * as the impact of statistics is not properly evaluated */
848
825
  MEM_writeLE32(dstPtr+0, repStartValue[0]);
849
826
  MEM_writeLE32(dstPtr+4, repStartValue[1]);
850
827
  MEM_writeLE32(dstPtr+8, repStartValue[2]);
@@ -860,6 +837,17 @@ _cleanup:
860
837
  }
861
838
 
862
839
 
840
+ /**
841
+ * @returns the maximum repcode value
842
+ */
843
+ static U32 ZDICT_maxRep(U32 const reps[ZSTD_REP_NUM])
844
+ {
845
+ U32 maxRep = reps[0];
846
+ int r;
847
+ for (r = 1; r < ZSTD_REP_NUM; ++r)
848
+ maxRep = MAX(maxRep, reps[r]);
849
+ return maxRep;
850
+ }
863
851
 
864
852
  size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
865
853
  const void* customDictContent, size_t dictContentSize,
@@ -869,13 +857,15 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
869
857
  size_t hSize;
870
858
  #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
871
859
  BYTE header[HBUFFSIZE];
872
- int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
860
+ int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
873
861
  U32 const notificationLevel = params.notificationLevel;
862
+ /* The final dictionary content must be at least as large as the largest repcode */
863
+ size_t const minContentSize = (size_t)ZDICT_maxRep(repStartValue);
864
+ size_t paddingSize;
874
865
 
875
866
  /* check conditions */
876
867
  DEBUGLOG(4, "ZDICT_finalizeDictionary");
877
868
  if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
878
- if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
879
869
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
880
870
 
881
871
  /* dictionary header */
@@ -899,12 +889,43 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
899
889
  hSize += eSize;
900
890
  }
901
891
 
902
- /* copy elements in final buffer ; note : src and dst buffer can overlap */
903
- if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize;
904
- { size_t const dictSize = hSize + dictContentSize;
905
- char* dictEnd = (char*)dictBuffer + dictSize;
906
- memmove(dictEnd - dictContentSize, customDictContent, dictContentSize);
907
- memcpy(dictBuffer, header, hSize);
892
+ /* Shrink the content size if it doesn't fit in the buffer */
893
+ if (hSize + dictContentSize > dictBufferCapacity) {
894
+ dictContentSize = dictBufferCapacity - hSize;
895
+ }
896
+
897
+ /* Pad the dictionary content with zeros if it is too small */
898
+ if (dictContentSize < minContentSize) {
899
+ RETURN_ERROR_IF(hSize + minContentSize > dictBufferCapacity, dstSize_tooSmall,
900
+ "dictBufferCapacity too small to fit max repcode");
901
+ paddingSize = minContentSize - dictContentSize;
902
+ } else {
903
+ paddingSize = 0;
904
+ }
905
+
906
+ {
907
+ size_t const dictSize = hSize + paddingSize + dictContentSize;
908
+
909
+ /* The dictionary consists of the header, optional padding, and the content.
910
+ * The padding comes before the content because the "best" position in the
911
+ * dictionary is the last byte.
912
+ */
913
+ BYTE* const outDictHeader = (BYTE*)dictBuffer;
914
+ BYTE* const outDictPadding = outDictHeader + hSize;
915
+ BYTE* const outDictContent = outDictPadding + paddingSize;
916
+
917
+ assert(dictSize <= dictBufferCapacity);
918
+ assert(outDictContent + dictContentSize == (BYTE*)dictBuffer + dictSize);
919
+
920
+ /* First copy the customDictContent into its final location.
921
+ * `customDictContent` and `dictBuffer` may overlap, so we must
922
+ * do this before any other writes into the output buffer.
923
+ * Then copy the header & padding into the output buffer.
924
+ */
925
+ memmove(outDictContent, customDictContent, dictContentSize);
926
+ memcpy(outDictHeader, header, hSize);
927
+ memset(outDictPadding, 0, paddingSize);
928
+
908
929
  return dictSize;
909
930
  }
910
931
  }
@@ -915,7 +936,7 @@ static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
915
936
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
916
937
  ZDICT_params_t params)
917
938
  {
918
- int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
939
+ int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
919
940
  U32 const notificationLevel = params.notificationLevel;
920
941
  size_t hSize = 8;
921
942
 
@@ -944,16 +965,11 @@ static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
944
965
  return MIN(dictBufferCapacity, hSize+dictContentSize);
945
966
  }
946
967
 
947
- /* Hidden declaration for dbio.c */
948
- size_t ZDICT_trainFromBuffer_unsafe_legacy(
949
- void* dictBuffer, size_t maxDictSize,
950
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
951
- ZDICT_legacy_params_t params);
952
968
  /*! ZDICT_trainFromBuffer_unsafe_legacy() :
953
- * Warning : `samplesBuffer` must be followed by noisy guard band.
969
+ * Warning : `samplesBuffer` must be followed by noisy guard band !!!
954
970
  * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
955
971
  */
956
- size_t ZDICT_trainFromBuffer_unsafe_legacy(
972
+ static size_t ZDICT_trainFromBuffer_unsafe_legacy(
957
973
  void* dictBuffer, size_t maxDictSize,
958
974
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
959
975
  ZDICT_legacy_params_t params)
@@ -1090,8 +1106,8 @@ size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
1090
1106
  memset(&params, 0, sizeof(params));
1091
1107
  params.d = 8;
1092
1108
  params.steps = 4;
1093
- /* Default to level 6 since no compression level information is available */
1094
- params.zParams.compressionLevel = 3;
1109
+ /* Use default level since no compression level information is available */
1110
+ params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
1095
1111
  #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
1096
1112
  params.zParams.notificationLevel = DEBUGLEVEL;
1097
1113
  #endif