zstd-ruby 1.4.4.0 → 1.5.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. checksums.yaml +4 -4
  2. data/.github/dependabot.yml +8 -0
  3. data/.github/workflows/ruby.yml +35 -0
  4. data/README.md +2 -2
  5. data/ext/zstdruby/extconf.rb +1 -0
  6. data/ext/zstdruby/libzstd/BUCK +5 -7
  7. data/ext/zstdruby/libzstd/Makefile +241 -173
  8. data/ext/zstdruby/libzstd/README.md +76 -18
  9. data/ext/zstdruby/libzstd/common/bitstream.h +75 -57
  10. data/ext/zstdruby/libzstd/common/compiler.h +196 -20
  11. data/ext/zstdruby/libzstd/common/cpu.h +1 -3
  12. data/ext/zstdruby/libzstd/common/debug.c +11 -31
  13. data/ext/zstdruby/libzstd/common/debug.h +22 -49
  14. data/ext/zstdruby/libzstd/common/entropy_common.c +208 -76
  15. data/ext/zstdruby/libzstd/common/error_private.c +3 -1
  16. data/ext/zstdruby/libzstd/common/error_private.h +87 -4
  17. data/ext/zstdruby/libzstd/common/fse.h +51 -42
  18. data/ext/zstdruby/libzstd/common/fse_decompress.c +149 -57
  19. data/ext/zstdruby/libzstd/common/huf.h +60 -54
  20. data/ext/zstdruby/libzstd/common/mem.h +87 -98
  21. data/ext/zstdruby/libzstd/common/pool.c +23 -17
  22. data/ext/zstdruby/libzstd/common/pool.h +3 -3
  23. data/ext/zstdruby/libzstd/common/portability_macros.h +131 -0
  24. data/ext/zstdruby/libzstd/common/threading.c +10 -8
  25. data/ext/zstdruby/libzstd/common/threading.h +4 -3
  26. data/ext/zstdruby/libzstd/common/xxhash.c +15 -873
  27. data/ext/zstdruby/libzstd/common/xxhash.h +5572 -191
  28. data/ext/zstdruby/libzstd/common/zstd_common.c +10 -10
  29. data/ext/zstdruby/libzstd/common/zstd_deps.h +111 -0
  30. data/ext/zstdruby/libzstd/common/zstd_internal.h +252 -108
  31. data/ext/zstdruby/libzstd/common/zstd_trace.h +163 -0
  32. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  33. data/ext/zstdruby/libzstd/compress/fse_compress.c +105 -85
  34. data/ext/zstdruby/libzstd/compress/hist.c +41 -63
  35. data/ext/zstdruby/libzstd/compress/hist.h +13 -33
  36. data/ext/zstdruby/libzstd/compress/huf_compress.c +831 -259
  37. data/ext/zstdruby/libzstd/compress/zstd_compress.c +3213 -1007
  38. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +493 -71
  39. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +21 -16
  40. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +4 -2
  41. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +51 -24
  42. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.h +10 -3
  43. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +573 -0
  44. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.h +32 -0
  45. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +208 -81
  46. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +315 -137
  47. data/ext/zstdruby/libzstd/compress/zstd_double_fast.h +2 -2
  48. data/ext/zstdruby/libzstd/compress/zstd_fast.c +319 -128
  49. data/ext/zstdruby/libzstd/compress/zstd_fast.h +2 -2
  50. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +1156 -171
  51. data/ext/zstdruby/libzstd/compress/zstd_lazy.h +59 -1
  52. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +331 -206
  53. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +15 -3
  54. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +106 -0
  55. data/ext/zstdruby/libzstd/compress/zstd_opt.c +403 -226
  56. data/ext/zstdruby/libzstd/compress/zstd_opt.h +1 -1
  57. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +188 -453
  58. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +32 -114
  59. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +1065 -410
  60. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +571 -0
  61. data/ext/zstdruby/libzstd/decompress/zstd_ddict.c +20 -16
  62. data/ext/zstdruby/libzstd/decompress/zstd_ddict.h +3 -3
  63. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +691 -230
  64. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +1072 -323
  65. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +16 -7
  66. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +71 -10
  67. data/ext/zstdruby/libzstd/deprecated/zbuff.h +3 -3
  68. data/ext/zstdruby/libzstd/deprecated/zbuff_common.c +2 -2
  69. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +24 -4
  70. data/ext/zstdruby/libzstd/deprecated/zbuff_decompress.c +1 -1
  71. data/ext/zstdruby/libzstd/dictBuilder/cover.c +57 -40
  72. data/ext/zstdruby/libzstd/dictBuilder/cover.h +20 -9
  73. data/ext/zstdruby/libzstd/dictBuilder/divsufsort.c +1 -1
  74. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +54 -35
  75. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +151 -57
  76. data/ext/zstdruby/libzstd/dll/example/Makefile +2 -1
  77. data/ext/zstdruby/libzstd/dll/example/README.md +16 -22
  78. data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +4 -4
  79. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +25 -19
  80. data/ext/zstdruby/libzstd/legacy/zstd_v01.h +1 -1
  81. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +18 -14
  82. data/ext/zstdruby/libzstd/legacy/zstd_v02.h +1 -1
  83. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +18 -14
  84. data/ext/zstdruby/libzstd/legacy/zstd_v03.h +1 -1
  85. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +22 -16
  86. data/ext/zstdruby/libzstd/legacy/zstd_v04.h +1 -1
  87. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +29 -25
  88. data/ext/zstdruby/libzstd/legacy/zstd_v05.h +2 -2
  89. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +29 -25
  90. data/ext/zstdruby/libzstd/legacy/zstd_v06.h +1 -1
  91. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +34 -26
  92. data/ext/zstdruby/libzstd/legacy/zstd_v07.h +1 -1
  93. data/ext/zstdruby/libzstd/libzstd.mk +185 -0
  94. data/ext/zstdruby/libzstd/libzstd.pc.in +4 -3
  95. data/ext/zstdruby/libzstd/modulemap/module.modulemap +4 -0
  96. data/ext/zstdruby/libzstd/{dictBuilder/zdict.h → zdict.h} +201 -31
  97. data/ext/zstdruby/libzstd/zstd.h +760 -234
  98. data/ext/zstdruby/libzstd/{common/zstd_errors.h → zstd_errors.h} +3 -1
  99. data/ext/zstdruby/zstdruby.c +2 -2
  100. data/lib/zstd-ruby/version.rb +1 -1
  101. metadata +20 -9
  102. data/.travis.yml +0 -14
@@ -1,3 +1,13 @@
1
+ /*
2
+ * Copyright (c) Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
1
11
  /*-*************************************
2
12
  * Dependencies
3
13
  ***************************************/
@@ -6,24 +16,33 @@
6
16
  #include <string.h> /* memset */
7
17
  #include <time.h> /* clock */
8
18
 
9
- #include "mem.h" /* read */
10
- #include "pool.h"
11
- #include "threading.h"
12
- #include "cover.h"
13
- #include "zstd_internal.h" /* includes zstd.h */
14
19
  #ifndef ZDICT_STATIC_LINKING_ONLY
15
- #define ZDICT_STATIC_LINKING_ONLY
20
+ # define ZDICT_STATIC_LINKING_ONLY
16
21
  #endif
17
- #include "zdict.h"
22
+
23
+ #include "../common/mem.h" /* read */
24
+ #include "../common/pool.h"
25
+ #include "../common/threading.h"
26
+ #include "../common/zstd_internal.h" /* includes zstd.h */
27
+ #include "../compress/zstd_compress_internal.h" /* ZSTD_hash*() */
28
+ #include "../zdict.h"
29
+ #include "cover.h"
18
30
 
19
31
 
20
32
  /*-*************************************
21
33
  * Constants
22
34
  ***************************************/
35
+ /**
36
+ * There are 32bit indexes used to ref samples, so limit samples size to 4GB
37
+ * on 64bit builds.
38
+ * For 32bit builds we choose 1 GB.
39
+ * Most 32bit platforms have 2GB user-mode addressable space and we allocate a large
40
+ * contiguous buffer, so 1GB is already a high limit.
41
+ */
23
42
  #define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
24
43
  #define FASTCOVER_MAX_F 31
25
44
  #define FASTCOVER_MAX_ACCEL 10
26
- #define DEFAULT_SPLITPOINT 0.75
45
+ #define FASTCOVER_DEFAULT_SPLITPOINT 0.75
27
46
  #define DEFAULT_F 20
28
47
  #define DEFAULT_ACCEL 1
29
48
 
@@ -31,50 +50,50 @@
31
50
  /*-*************************************
32
51
  * Console display
33
52
  ***************************************/
34
- static int g_displayLevel = 2;
53
+ #ifndef LOCALDISPLAYLEVEL
54
+ static int g_displayLevel = 0;
55
+ #endif
56
+ #undef DISPLAY
35
57
  #define DISPLAY(...) \
36
58
  { \
37
59
  fprintf(stderr, __VA_ARGS__); \
38
60
  fflush(stderr); \
39
61
  }
62
+ #undef LOCALDISPLAYLEVEL
40
63
  #define LOCALDISPLAYLEVEL(displayLevel, l, ...) \
41
64
  if (displayLevel >= l) { \
42
65
  DISPLAY(__VA_ARGS__); \
43
66
  } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
67
+ #undef DISPLAYLEVEL
44
68
  #define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
45
69
 
70
+ #ifndef LOCALDISPLAYUPDATE
71
+ static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100;
72
+ static clock_t g_time = 0;
73
+ #endif
74
+ #undef LOCALDISPLAYUPDATE
46
75
  #define LOCALDISPLAYUPDATE(displayLevel, l, ...) \
47
76
  if (displayLevel >= l) { \
48
- if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) { \
77
+ if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) { \
49
78
  g_time = clock(); \
50
79
  DISPLAY(__VA_ARGS__); \
51
80
  } \
52
81
  }
82
+ #undef DISPLAYUPDATE
53
83
  #define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
54
- static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
55
- static clock_t g_time = 0;
56
84
 
57
85
 
58
86
  /*-*************************************
59
87
  * Hash Functions
60
88
  ***************************************/
61
- static const U64 prime6bytes = 227718039650203ULL;
62
- static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; }
63
- static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
64
-
65
- static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
66
- static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
67
- static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
68
-
69
-
70
89
  /**
71
- * Hash the d-byte value pointed to by p and mod 2^f
90
+ * Hash the d-byte value pointed to by p and mod 2^f into the frequency vector
72
91
  */
73
- static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 h, unsigned d) {
92
+ static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 f, unsigned d) {
74
93
  if (d == 6) {
75
- return ZSTD_hash6Ptr(p, h) & ((1 << h) - 1);
94
+ return ZSTD_hash6Ptr(p, f);
76
95
  }
77
- return ZSTD_hash8Ptr(p, h) & ((1 << h) - 1);
96
+ return ZSTD_hash8Ptr(p, f);
78
97
  }
79
98
 
80
99
 
@@ -451,20 +470,20 @@ typedef struct FASTCOVER_tryParameters_data_s {
451
470
  * This function is thread safe if zstd is compiled with multithreaded support.
452
471
  * It takes its parameters as an *OWNING* opaque pointer to support threading.
453
472
  */
454
- static void FASTCOVER_tryParameters(void *opaque)
473
+ static void FASTCOVER_tryParameters(void* opaque)
455
474
  {
456
475
  /* Save parameters as local variables */
457
- FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t *)opaque;
476
+ FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t*)opaque;
458
477
  const FASTCOVER_ctx_t *const ctx = data->ctx;
459
478
  const ZDICT_cover_params_t parameters = data->parameters;
460
479
  size_t dictBufferCapacity = data->dictBufferCapacity;
461
480
  size_t totalCompressedSize = ERROR(GENERIC);
462
481
  /* Initialize array to keep track of frequency of dmer within activeSegment */
463
- U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
482
+ U16* segmentFreqs = (U16*)calloc(((U64)1 << ctx->f), sizeof(U16));
464
483
  /* Allocate space for hash table, dict, and freqs */
465
- BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
484
+ BYTE *const dict = (BYTE*)malloc(dictBufferCapacity);
466
485
  COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
467
- U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
486
+ U32* freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
468
487
  if (!segmentFreqs || !dict || !freqs) {
469
488
  DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
470
489
  goto _cleanup;
@@ -476,7 +495,7 @@ static void FASTCOVER_tryParameters(void *opaque)
476
495
  parameters, segmentFreqs);
477
496
 
478
497
  const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
479
- selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
498
+ selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail,
480
499
  ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
481
500
  totalCompressedSize);
482
501
 
@@ -537,7 +556,7 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
537
556
  ZDICT_cover_params_t coverParams;
538
557
  FASTCOVER_accel_t accelParams;
539
558
  /* Initialize global data */
540
- g_displayLevel = parameters.zParams.notificationLevel;
559
+ g_displayLevel = (int)parameters.zParams.notificationLevel;
541
560
  /* Assign splitPoint and f if not provided */
542
561
  parameters.splitPoint = 1.0;
543
562
  parameters.f = parameters.f == 0 ? DEFAULT_F : parameters.f;
@@ -607,7 +626,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
607
626
  /* constants */
608
627
  const unsigned nbThreads = parameters->nbThreads;
609
628
  const double splitPoint =
610
- parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
629
+ parameters->splitPoint <= 0.0 ? FASTCOVER_DEFAULT_SPLITPOINT : parameters->splitPoint;
611
630
  const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
612
631
  const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
613
632
  const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
@@ -620,7 +639,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
620
639
  const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
621
640
  const unsigned shrinkDict = 0;
622
641
  /* Local variables */
623
- const int displayLevel = parameters->zParams.notificationLevel;
642
+ const int displayLevel = (int)parameters->zParams.notificationLevel;
624
643
  unsigned iteration = 1;
625
644
  unsigned d;
626
645
  unsigned k;
@@ -704,7 +723,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
704
723
  data->parameters.splitPoint = splitPoint;
705
724
  data->parameters.steps = kSteps;
706
725
  data->parameters.shrinkDict = shrinkDict;
707
- data->parameters.zParams.notificationLevel = g_displayLevel;
726
+ data->parameters.zParams.notificationLevel = (unsigned)g_displayLevel;
708
727
  /* Check the parameters */
709
728
  if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
710
729
  data->ctx->f, accel)) {
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Yann Collet, Facebook, Inc.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -23,9 +23,13 @@
23
23
  /* Unix Large Files support (>4GB) */
24
24
  #define _FILE_OFFSET_BITS 64
25
25
  #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
26
+ # ifndef _LARGEFILE_SOURCE
26
27
  # define _LARGEFILE_SOURCE
28
+ # endif
27
29
  #elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
30
+ # ifndef _LARGEFILE64_SOURCE
28
31
  # define _LARGEFILE64_SOURCE
32
+ # endif
29
33
  #endif
30
34
 
31
35
 
@@ -37,17 +41,19 @@
37
41
  #include <stdio.h> /* fprintf, fopen, ftello64 */
38
42
  #include <time.h> /* clock */
39
43
 
40
- #include "mem.h" /* read */
41
- #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
42
- #define HUF_STATIC_LINKING_ONLY
43
- #include "huf.h" /* HUF_buildCTable, HUF_writeCTable */
44
- #include "zstd_internal.h" /* includes zstd.h */
45
- #include "xxhash.h" /* XXH64 */
46
- #include "divsufsort.h"
47
44
  #ifndef ZDICT_STATIC_LINKING_ONLY
48
45
  # define ZDICT_STATIC_LINKING_ONLY
49
46
  #endif
50
- #include "zdict.h"
47
+ #define HUF_STATIC_LINKING_ONLY
48
+
49
+ #include "../common/mem.h" /* read */
50
+ #include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */
51
+ #include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */
52
+ #include "../common/zstd_internal.h" /* includes zstd.h */
53
+ #include "../common/xxhash.h" /* XXH64 */
54
+ #include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */
55
+ #include "../zdict.h"
56
+ #include "divsufsort.h"
51
57
 
52
58
 
53
59
  /*-*************************************
@@ -61,14 +67,15 @@
61
67
 
62
68
  #define NOISELENGTH 32
63
69
 
64
- static const int g_compressionLevel_default = 3;
65
70
  static const U32 g_selectivity_default = 9;
66
71
 
67
72
 
68
73
  /*-*************************************
69
74
  * Console display
70
75
  ***************************************/
76
+ #undef DISPLAY
71
77
  #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
78
+ #undef DISPLAYLEVEL
72
79
  #define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
73
80
 
74
81
  static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
@@ -99,6 +106,26 @@ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
99
106
  return MEM_readLE32((const char*)dictBuffer + 4);
100
107
  }
101
108
 
109
+ size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
110
+ {
111
+ size_t headerSize;
112
+ if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
113
+
114
+ { ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
115
+ U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
116
+ if (!bs || !wksp) {
117
+ headerSize = ERROR(memory_allocation);
118
+ } else {
119
+ ZSTD_reset_compressedBlockState(bs);
120
+ headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize);
121
+ }
122
+
123
+ free(bs);
124
+ free(wksp);
125
+ }
126
+
127
+ return headerSize;
128
+ }
102
129
 
103
130
  /*-********************************************************
104
131
  * Dictionary training functions
@@ -108,22 +135,32 @@ static unsigned ZDICT_NbCommonBytes (size_t val)
108
135
  if (MEM_isLittleEndian()) {
109
136
  if (MEM_64bits()) {
110
137
  # if defined(_MSC_VER) && defined(_WIN64)
111
- unsigned long r = 0;
112
- _BitScanForward64( &r, (U64)val );
113
- return (unsigned)(r>>3);
138
+ if (val != 0) {
139
+ unsigned long r;
140
+ _BitScanForward64(&r, (U64)val);
141
+ return (unsigned)(r >> 3);
142
+ } else {
143
+ /* Should not reach this code path */
144
+ __assume(0);
145
+ }
114
146
  # elif defined(__GNUC__) && (__GNUC__ >= 3)
115
- return (__builtin_ctzll((U64)val) >> 3);
147
+ return (unsigned)(__builtin_ctzll((U64)val) >> 3);
116
148
  # else
117
149
  static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
118
150
  return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
119
151
  # endif
120
152
  } else { /* 32 bits */
121
153
  # if defined(_MSC_VER)
122
- unsigned long r=0;
123
- _BitScanForward( &r, (U32)val );
124
- return (unsigned)(r>>3);
154
+ if (val != 0) {
155
+ unsigned long r;
156
+ _BitScanForward(&r, (U32)val);
157
+ return (unsigned)(r >> 3);
158
+ } else {
159
+ /* Should not reach this code path */
160
+ __assume(0);
161
+ }
125
162
  # elif defined(__GNUC__) && (__GNUC__ >= 3)
126
- return (__builtin_ctz((U32)val) >> 3);
163
+ return (unsigned)(__builtin_ctz((U32)val) >> 3);
127
164
  # else
128
165
  static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
129
166
  return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
@@ -132,11 +169,16 @@ static unsigned ZDICT_NbCommonBytes (size_t val)
132
169
  } else { /* Big Endian CPU */
133
170
  if (MEM_64bits()) {
134
171
  # if defined(_MSC_VER) && defined(_WIN64)
135
- unsigned long r = 0;
136
- _BitScanReverse64( &r, val );
137
- return (unsigned)(r>>3);
172
+ if (val != 0) {
173
+ unsigned long r;
174
+ _BitScanReverse64(&r, val);
175
+ return (unsigned)(r >> 3);
176
+ } else {
177
+ /* Should not reach this code path */
178
+ __assume(0);
179
+ }
138
180
  # elif defined(__GNUC__) && (__GNUC__ >= 3)
139
- return (__builtin_clzll(val) >> 3);
181
+ return (unsigned)(__builtin_clzll(val) >> 3);
140
182
  # else
141
183
  unsigned r;
142
184
  const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
@@ -147,11 +189,16 @@ static unsigned ZDICT_NbCommonBytes (size_t val)
147
189
  # endif
148
190
  } else { /* 32 bits */
149
191
  # if defined(_MSC_VER)
150
- unsigned long r = 0;
151
- _BitScanReverse( &r, (unsigned long)val );
152
- return (unsigned)(r>>3);
192
+ if (val != 0) {
193
+ unsigned long r;
194
+ _BitScanReverse(&r, (unsigned long)val);
195
+ return (unsigned)(r >> 3);
196
+ } else {
197
+ /* Should not reach this code path */
198
+ __assume(0);
199
+ }
153
200
  # elif defined(__GNUC__) && (__GNUC__ >= 3)
154
- return (__builtin_clz((U32)val) >> 3);
201
+ return (unsigned)(__builtin_clz((U32)val) >> 3);
155
202
  # else
156
203
  unsigned r;
157
204
  if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
@@ -208,7 +255,7 @@ static dictItem ZDICT_analyzePos(
208
255
  U32 savings[LLIMIT] = {0};
209
256
  const BYTE* b = (const BYTE*)buffer;
210
257
  size_t maxLength = LLIMIT;
211
- size_t pos = suffix[start];
258
+ size_t pos = (size_t)suffix[start];
212
259
  U32 end = start;
213
260
  dictItem solution;
214
261
 
@@ -342,7 +389,7 @@ static dictItem ZDICT_analyzePos(
342
389
  savings[i] = savings[i-1] + (lengthList[i] * (i-3));
343
390
 
344
391
  DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
345
- (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / maxLength);
392
+ (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / (double)maxLength);
346
393
 
347
394
  solution.pos = (U32)pos;
348
395
  solution.length = (U32)maxLength;
@@ -352,7 +399,7 @@ static dictItem ZDICT_analyzePos(
352
399
  { U32 id;
353
400
  for (id=start; id<end; id++) {
354
401
  U32 p, pEnd, length;
355
- U32 const testedPos = suffix[id];
402
+ U32 const testedPos = (U32)suffix[id];
356
403
  if (testedPos == pos)
357
404
  length = solution.length;
358
405
  else {
@@ -415,7 +462,7 @@ static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const
415
462
 
416
463
  if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */
417
464
  /* append */
418
- int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
465
+ int const addedLength = (int)eltEnd - (int)(table[u].pos + table[u].length);
419
466
  table[u].savings += elt.length / 8; /* rough approx bonus */
420
467
  if (addedLength > 0) { /* otherwise, elt fully included into existing */
421
468
  table[u].length += addedLength;
@@ -508,6 +555,7 @@ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
508
555
  clock_t displayClock = 0;
509
556
  clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
510
557
 
558
+ # undef DISPLAYUPDATE
511
559
  # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
512
560
  if (ZDICT_clockSpan(displayClock) > refreshRate) \
513
561
  { displayClock = clock(); DISPLAY(__VA_ARGS__); \
@@ -588,12 +636,12 @@ typedef struct
588
636
 
589
637
  #define MAXREPOFFSET 1024
590
638
 
591
- static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
639
+ static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
592
640
  unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
593
641
  const void* src, size_t srcSize,
594
642
  U32 notificationLevel)
595
643
  {
596
- size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
644
+ size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
597
645
  size_t cSize;
598
646
 
599
647
  if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
@@ -682,7 +730,7 @@ static void ZDICT_flatLit(unsigned* countLit)
682
730
 
683
731
  #define OFFCODE_MAX 30 /* only applicable to first block */
684
732
  static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
685
- unsigned compressionLevel,
733
+ int compressionLevel,
686
734
  const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
687
735
  const void* dictBuffer, size_t dictBufferSize,
688
736
  unsigned notificationLevel)
@@ -717,7 +765,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
717
765
  memset(repOffset, 0, sizeof(repOffset));
718
766
  repOffset[1] = repOffset[4] = repOffset[8] = 1;
719
767
  memset(bestRepOffset, 0, sizeof(bestRepOffset));
720
- if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
768
+ if (compressionLevel==0) compressionLevel = ZSTD_CLEVEL_DEFAULT;
721
769
  params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
722
770
 
723
771
  esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
@@ -731,13 +779,20 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
731
779
 
732
780
  /* collect stats on all samples */
733
781
  for (u=0; u<nbFiles; u++) {
734
- ZDICT_countEStats(esr, params,
782
+ ZDICT_countEStats(esr, &params,
735
783
  countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
736
784
  (const char*)srcBuffer + pos, fileSizes[u],
737
785
  notificationLevel);
738
786
  pos += fileSizes[u];
739
787
  }
740
788
 
789
+ if (notificationLevel >= 4) {
790
+ /* writeStats */
791
+ DISPLAYLEVEL(4, "Offset Code Frequencies : \n");
792
+ for (u=0; u<=offcodeMax; u++) {
793
+ DISPLAYLEVEL(4, "%2u :%7u \n", u, offcodeCount[u]);
794
+ } }
795
+
741
796
  /* analyze, build stats, starting with literals */
742
797
  { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
743
798
  if (HUF_isError(maxNbBits)) {
@@ -762,7 +817,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
762
817
  /* note : the result of this phase should be used to better appreciate the impact on statistics */
763
818
 
764
819
  total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];
765
- errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
820
+ errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax, /* useLowProbCount */ 1);
766
821
  if (FSE_isError(errorCode)) {
767
822
  eSize = errorCode;
768
823
  DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
@@ -771,7 +826,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
771
826
  Offlog = (U32)errorCode;
772
827
 
773
828
  total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
774
- errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
829
+ errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML, /* useLowProbCount */ 1);
775
830
  if (FSE_isError(errorCode)) {
776
831
  eSize = errorCode;
777
832
  DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
@@ -780,7 +835,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
780
835
  mlLog = (U32)errorCode;
781
836
 
782
837
  total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];
783
- errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
838
+ errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL, /* useLowProbCount */ 1);
784
839
  if (FSE_isError(errorCode)) {
785
840
  eSize = errorCode;
786
841
  DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
@@ -844,7 +899,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
844
899
  MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);
845
900
  #else
846
901
  /* at this stage, we don't use the result of "most common first offset",
847
- as the impact of statistics is not properly evaluated */
902
+ * as the impact of statistics is not properly evaluated */
848
903
  MEM_writeLE32(dstPtr+0, repStartValue[0]);
849
904
  MEM_writeLE32(dstPtr+4, repStartValue[1]);
850
905
  MEM_writeLE32(dstPtr+8, repStartValue[2]);
@@ -860,6 +915,17 @@ _cleanup:
860
915
  }
861
916
 
862
917
 
918
+ /**
919
+ * @returns the maximum repcode value
920
+ */
921
+ static U32 ZDICT_maxRep(U32 const reps[ZSTD_REP_NUM])
922
+ {
923
+ U32 maxRep = reps[0];
924
+ int r;
925
+ for (r = 1; r < ZSTD_REP_NUM; ++r)
926
+ maxRep = MAX(maxRep, reps[r]);
927
+ return maxRep;
928
+ }
863
929
 
864
930
  size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
865
931
  const void* customDictContent, size_t dictContentSize,
@@ -869,13 +935,15 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
869
935
  size_t hSize;
870
936
  #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
871
937
  BYTE header[HBUFFSIZE];
872
- int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
938
+ int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
873
939
  U32 const notificationLevel = params.notificationLevel;
940
+ /* The final dictionary content must be at least as large as the largest repcode */
941
+ size_t const minContentSize = (size_t)ZDICT_maxRep(repStartValue);
942
+ size_t paddingSize;
874
943
 
875
944
  /* check conditions */
876
945
  DEBUGLOG(4, "ZDICT_finalizeDictionary");
877
946
  if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
878
- if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
879
947
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
880
948
 
881
949
  /* dictionary header */
@@ -899,12 +967,43 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
899
967
  hSize += eSize;
900
968
  }
901
969
 
902
- /* copy elements in final buffer ; note : src and dst buffer can overlap */
903
- if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize;
904
- { size_t const dictSize = hSize + dictContentSize;
905
- char* dictEnd = (char*)dictBuffer + dictSize;
906
- memmove(dictEnd - dictContentSize, customDictContent, dictContentSize);
907
- memcpy(dictBuffer, header, hSize);
970
+ /* Shrink the content size if it doesn't fit in the buffer */
971
+ if (hSize + dictContentSize > dictBufferCapacity) {
972
+ dictContentSize = dictBufferCapacity - hSize;
973
+ }
974
+
975
+ /* Pad the dictionary content with zeros if it is too small */
976
+ if (dictContentSize < minContentSize) {
977
+ RETURN_ERROR_IF(hSize + minContentSize > dictBufferCapacity, dstSize_tooSmall,
978
+ "dictBufferCapacity too small to fit max repcode");
979
+ paddingSize = minContentSize - dictContentSize;
980
+ } else {
981
+ paddingSize = 0;
982
+ }
983
+
984
+ {
985
+ size_t const dictSize = hSize + paddingSize + dictContentSize;
986
+
987
+ /* The dictionary consists of the header, optional padding, and the content.
988
+ * The padding comes before the content because the "best" position in the
989
+ * dictionary is the last byte.
990
+ */
991
+ BYTE* const outDictHeader = (BYTE*)dictBuffer;
992
+ BYTE* const outDictPadding = outDictHeader + hSize;
993
+ BYTE* const outDictContent = outDictPadding + paddingSize;
994
+
995
+ assert(dictSize <= dictBufferCapacity);
996
+ assert(outDictContent + dictContentSize == (BYTE*)dictBuffer + dictSize);
997
+
998
+ /* First copy the customDictContent into its final location.
999
+ * `customDictContent` and `dictBuffer` may overlap, so we must
1000
+ * do this before any other writes into the output buffer.
1001
+ * Then copy the header & padding into the output buffer.
1002
+ */
1003
+ memmove(outDictContent, customDictContent, dictContentSize);
1004
+ memcpy(outDictHeader, header, hSize);
1005
+ memset(outDictPadding, 0, paddingSize);
1006
+
908
1007
  return dictSize;
909
1008
  }
910
1009
  }
@@ -915,7 +1014,7 @@ static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
915
1014
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
916
1015
  ZDICT_params_t params)
917
1016
  {
918
- int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel;
1017
+ int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
919
1018
  U32 const notificationLevel = params.notificationLevel;
920
1019
  size_t hSize = 8;
921
1020
 
@@ -944,16 +1043,11 @@ static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
944
1043
  return MIN(dictBufferCapacity, hSize+dictContentSize);
945
1044
  }
946
1045
 
947
- /* Hidden declaration for dbio.c */
948
- size_t ZDICT_trainFromBuffer_unsafe_legacy(
949
- void* dictBuffer, size_t maxDictSize,
950
- const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
951
- ZDICT_legacy_params_t params);
952
1046
  /*! ZDICT_trainFromBuffer_unsafe_legacy() :
953
- * Warning : `samplesBuffer` must be followed by noisy guard band.
1047
+ * Warning : `samplesBuffer` must be followed by noisy guard band !!!
954
1048
  * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
955
1049
  */
956
- size_t ZDICT_trainFromBuffer_unsafe_legacy(
1050
+ static size_t ZDICT_trainFromBuffer_unsafe_legacy(
957
1051
  void* dictBuffer, size_t maxDictSize,
958
1052
  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
959
1053
  ZDICT_legacy_params_t params)
@@ -1090,8 +1184,8 @@ size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
1090
1184
  memset(&params, 0, sizeof(params));
1091
1185
  params.d = 8;
1092
1186
  params.steps = 4;
1093
- /* Default to level 6 since no compression level information is available */
1094
- params.zParams.compressionLevel = 3;
1187
+ /* Use default level since no compression level information is available */
1188
+ params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
1095
1189
  #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
1096
1190
  params.zParams.notificationLevel = DEBUGLEVEL;
1097
1191
  #endif
@@ -1,10 +1,11 @@
1
1
  # ################################################################
2
- # Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
2
+ # Copyright (c) Yann Collet, Facebook, Inc.
3
3
  # All rights reserved.
4
4
  #
5
5
  # This source code is licensed under both the BSD-style license (found in the
6
6
  # LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
7
  # in the COPYING file in the root directory of this source tree).
8
+ # You may select, at your option, one of the above-listed licenses.
8
9
  # ################################################################
9
10
 
10
11
  VOID := /dev/null