extzstd 0.2 → 0.3

Files changed (88)
  1. checksums.yaml +4 -4
  2. data/HISTORY.ja.md +13 -0
  3. data/README.md +17 -14
  4. data/contrib/zstd/{NEWS → CHANGELOG} +115 -2
  5. data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
  6. data/contrib/zstd/Makefile +99 -53
  7. data/contrib/zstd/README.md +59 -39
  8. data/contrib/zstd/TESTING.md +1 -1
  9. data/contrib/zstd/appveyor.yml +17 -6
  10. data/contrib/zstd/lib/BUCK +29 -2
  11. data/contrib/zstd/lib/Makefile +118 -21
  12. data/contrib/zstd/lib/README.md +84 -44
  13. data/contrib/zstd/lib/common/bitstream.h +17 -33
  14. data/contrib/zstd/lib/common/compiler.h +62 -8
  15. data/contrib/zstd/lib/common/cpu.h +215 -0
  16. data/contrib/zstd/lib/common/debug.c +44 -0
  17. data/contrib/zstd/lib/common/debug.h +134 -0
  18. data/contrib/zstd/lib/common/entropy_common.c +16 -1
  19. data/contrib/zstd/lib/common/error_private.c +7 -0
  20. data/contrib/zstd/lib/common/fse.h +48 -44
  21. data/contrib/zstd/lib/common/fse_decompress.c +3 -3
  22. data/contrib/zstd/lib/common/huf.h +169 -113
  23. data/contrib/zstd/lib/common/mem.h +20 -2
  24. data/contrib/zstd/lib/common/pool.c +135 -49
  25. data/contrib/zstd/lib/common/pool.h +40 -21
  26. data/contrib/zstd/lib/common/threading.c +2 -2
  27. data/contrib/zstd/lib/common/threading.h +12 -12
  28. data/contrib/zstd/lib/common/xxhash.c +3 -2
  29. data/contrib/zstd/lib/common/zstd_common.c +3 -6
  30. data/contrib/zstd/lib/common/zstd_errors.h +17 -7
  31. data/contrib/zstd/lib/common/zstd_internal.h +76 -48
  32. data/contrib/zstd/lib/compress/fse_compress.c +89 -209
  33. data/contrib/zstd/lib/compress/hist.c +203 -0
  34. data/contrib/zstd/lib/compress/hist.h +95 -0
  35. data/contrib/zstd/lib/compress/huf_compress.c +188 -80
  36. data/contrib/zstd/lib/compress/zstd_compress.c +2500 -1203
  37. data/contrib/zstd/lib/compress/zstd_compress_internal.h +463 -62
  38. data/contrib/zstd/lib/compress/zstd_double_fast.c +321 -131
  39. data/contrib/zstd/lib/compress/zstd_double_fast.h +13 -4
  40. data/contrib/zstd/lib/compress/zstd_fast.c +335 -108
  41. data/contrib/zstd/lib/compress/zstd_fast.h +12 -6
  42. data/contrib/zstd/lib/compress/zstd_lazy.c +654 -313
  43. data/contrib/zstd/lib/compress/zstd_lazy.h +44 -16
  44. data/contrib/zstd/lib/compress/zstd_ldm.c +310 -420
  45. data/contrib/zstd/lib/compress/zstd_ldm.h +63 -26
  46. data/contrib/zstd/lib/compress/zstd_opt.c +773 -325
  47. data/contrib/zstd/lib/compress/zstd_opt.h +31 -5
  48. data/contrib/zstd/lib/compress/zstdmt_compress.c +1468 -518
  49. data/contrib/zstd/lib/compress/zstdmt_compress.h +96 -45
  50. data/contrib/zstd/lib/decompress/huf_decompress.c +518 -282
  51. data/contrib/zstd/lib/decompress/zstd_ddict.c +240 -0
  52. data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
  53. data/contrib/zstd/lib/decompress/zstd_decompress.c +613 -1513
  54. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1311 -0
  55. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +59 -0
  56. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +175 -0
  57. data/contrib/zstd/lib/dictBuilder/cover.c +194 -113
  58. data/contrib/zstd/lib/dictBuilder/cover.h +112 -0
  59. data/contrib/zstd/lib/dictBuilder/divsufsort.c +3 -3
  60. data/contrib/zstd/lib/dictBuilder/fastcover.c +740 -0
  61. data/contrib/zstd/lib/dictBuilder/zdict.c +142 -106
  62. data/contrib/zstd/lib/dictBuilder/zdict.h +115 -49
  63. data/contrib/zstd/lib/legacy/zstd_legacy.h +44 -12
  64. data/contrib/zstd/lib/legacy/zstd_v01.c +41 -10
  65. data/contrib/zstd/lib/legacy/zstd_v01.h +12 -7
  66. data/contrib/zstd/lib/legacy/zstd_v02.c +37 -12
  67. data/contrib/zstd/lib/legacy/zstd_v02.h +12 -7
  68. data/contrib/zstd/lib/legacy/zstd_v03.c +38 -12
  69. data/contrib/zstd/lib/legacy/zstd_v03.h +12 -7
  70. data/contrib/zstd/lib/legacy/zstd_v04.c +55 -174
  71. data/contrib/zstd/lib/legacy/zstd_v04.h +12 -7
  72. data/contrib/zstd/lib/legacy/zstd_v05.c +59 -31
  73. data/contrib/zstd/lib/legacy/zstd_v05.h +12 -7
  74. data/contrib/zstd/lib/legacy/zstd_v06.c +48 -20
  75. data/contrib/zstd/lib/legacy/zstd_v06.h +10 -5
  76. data/contrib/zstd/lib/legacy/zstd_v07.c +62 -29
  77. data/contrib/zstd/lib/legacy/zstd_v07.h +10 -5
  78. data/contrib/zstd/lib/zstd.h +1346 -832
  79. data/ext/extzstd.c +27 -19
  80. data/ext/extzstd_stream.c +20 -4
  81. data/ext/zstd_compress.c +1 -0
  82. data/ext/zstd_decompress.c +4 -0
  83. data/ext/zstd_dictbuilder.c +4 -0
  84. data/ext/zstd_dictbuilder_fastcover.c +5 -0
  85. data/lib/extzstd.rb +52 -220
  86. data/lib/extzstd/version.rb +1 -1
  87. metadata +21 -7
  88. data/contrib/zstd/circle.yml +0 -63

data/contrib/zstd/lib/decompress/zstd_decompress_block.h (new file)
@@ -0,0 +1,59 @@
+ /*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+ #ifndef ZSTD_DEC_BLOCK_H
+ #define ZSTD_DEC_BLOCK_H
+
+ /*-*******************************************************
+ * Dependencies
+ *********************************************************/
+ #include <stddef.h> /* size_t */
+ #include "zstd.h" /* DCtx, and some public functions */
+ #include "zstd_internal.h" /* blockProperties_t, and some public functions */
+ #include "zstd_decompress_internal.h" /* ZSTD_seqSymbol */
+
+
+ /* === Prototypes === */
+
+ /* note: prototypes already published within `zstd.h` :
+ * ZSTD_decompressBlock()
+ */
+
+ /* note: prototypes already published within `zstd_internal.h` :
+ * ZSTD_getcBlockSize()
+ * ZSTD_decodeSeqHeaders()
+ */
+
+
+ /* ZSTD_decompressBlock_internal() :
+ * decompress block, starting at `src`,
+ * into destination buffer `dst`.
+ * @return : decompressed block size,
+ * or an error code (which can be tested using ZSTD_isError())
+ */
+ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize, const int frame);
+
+ /* ZSTD_buildFSETable() :
+ * generate FSE decoding table for one symbol (ll, ml or off)
+ * this function must be called with valid parameters only
+ * (dt is large enough, normalizedCounter distribution total is a power of 2, max is within range, etc.)
+ * in which case it cannot fail.
+ * Internal use only.
+ */
+ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+ const short* normalizedCounter, unsigned maxSymbolValue,
+ const U32* baseValue, const U32* nbAdditionalBits,
+ unsigned tableLog);
+
+
+ #endif /* ZSTD_DEC_BLOCK_H */
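
The prototypes above are internal; the public entry points they back live in the experimental section of `zstd.h` behind `ZSTD_STATIC_LINKING_ONLY`. A minimal round-trip sketch of those public block functions (an illustration under that assumption, not part of this diff; blocks are limited to `ZSTD_BLOCKSIZE_MAX`):

```c
#define ZSTD_STATIC_LINKING_ONLY   /* block-level API sits in the experimental section */
#include <zstd.h>
#include <stdio.h>

int main(void)
{
    const char msg[] = "hello hello hello hello hello hello";
    char compressed[1024], decoded[1024];
    ZSTD_CCtx* cctx = ZSTD_createCCtx();
    ZSTD_DCtx* dctx = ZSTD_createDCtx();
    size_t csize, dsize;

    ZSTD_compressBegin(cctx, 3);                 /* block API requires an initialized context */
    csize = ZSTD_compressBlock(cctx, compressed, sizeof(compressed), msg, sizeof(msg));
    if (!ZSTD_isError(csize) && csize > 0) {     /* 0 means the block was not compressible */
        ZSTD_decompressBegin(dctx);
        dsize = ZSTD_decompressBlock(dctx, decoded, sizeof(decoded), compressed, csize);
        if (!ZSTD_isError(dsize))
            printf("round-tripped %zu bytes\n", dsize);
    }
    ZSTD_freeCCtx(cctx);
    ZSTD_freeDCtx(dctx);
    return 0;
}
```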

data/contrib/zstd/lib/decompress/zstd_decompress_internal.h (new file)
@@ -0,0 +1,175 @@
+ /*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+ /* zstd_decompress_internal:
+ * objects and definitions shared within lib/decompress modules */
+
+ #ifndef ZSTD_DECOMPRESS_INTERNAL_H
+ #define ZSTD_DECOMPRESS_INTERNAL_H
+
+
+ /*-*******************************************************
+ * Dependencies
+ *********************************************************/
+ #include "mem.h" /* BYTE, U16, U32 */
+ #include "zstd_internal.h" /* ZSTD_seqSymbol */
+
+
+
+ /*-*******************************************************
+ * Constants
+ *********************************************************/
+ static const U32 LL_base[MaxLL+1] = {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 18, 20, 22, 24, 28, 32, 40,
+ 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+ 0x2000, 0x4000, 0x8000, 0x10000 };
+
+ static const U32 OF_base[MaxOff+1] = {
+ 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D,
+ 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD,
+ 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
+ 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD };
+
+ static const U32 OF_bits[MaxOff+1] = {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31 };
+
+ static const U32 ML_base[MaxML+1] = {
+ 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26,
+ 27, 28, 29, 30, 31, 32, 33, 34,
+ 35, 37, 39, 41, 43, 47, 51, 59,
+ 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
+ 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
+
+
+ /*-*******************************************************
+ * Decompression types
+ *********************************************************/
+ typedef struct {
+ U32 fastMode;
+ U32 tableLog;
+ } ZSTD_seqSymbol_header;
+
+ typedef struct {
+ U16 nextState;
+ BYTE nbAdditionalBits;
+ BYTE nbBits;
+ U32 baseValue;
+ } ZSTD_seqSymbol;
+
+ #define SEQSYMBOL_TABLE_SIZE(log) (1 + (1 << (log)))
+
+ typedef struct {
+ ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */
+ ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */
+ ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
+ HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */
+ U32 rep[ZSTD_REP_NUM];
+ } ZSTD_entropyDTables_t;
+
+ typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+ ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock,
+ ZSTDds_decompressLastBlock, ZSTDds_checkChecksum,
+ ZSTDds_decodeSkippableHeader, ZSTDds_skipFrame } ZSTD_dStage;
+
+ typedef enum { zdss_init=0, zdss_loadHeader,
+ zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;
+
+ typedef enum {
+ ZSTD_use_indefinitely = -1, /* Use the dictionary indefinitely */
+ ZSTD_dont_use = 0, /* Do not use the dictionary (if one exists free it) */
+ ZSTD_use_once = 1 /* Use the dictionary once and set to ZSTD_dont_use */
+ } ZSTD_dictUses_e;
+
+ struct ZSTD_DCtx_s
+ {
+ const ZSTD_seqSymbol* LLTptr;
+ const ZSTD_seqSymbol* MLTptr;
+ const ZSTD_seqSymbol* OFTptr;
+ const HUF_DTable* HUFptr;
+ ZSTD_entropyDTables_t entropy;
+ U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; /* space needed when building huffman tables */
+ const void* previousDstEnd; /* detect continuity */
+ const void* prefixStart; /* start of current segment */
+ const void* virtualStart; /* virtual start of previous segment if it was just before current one */
+ const void* dictEnd; /* end of previous segment */
+ size_t expected;
+ ZSTD_frameHeader fParams;
+ U64 decodedSize;
+ blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */
+ ZSTD_dStage stage;
+ U32 litEntropy;
+ U32 fseEntropy;
+ XXH64_state_t xxhState;
+ size_t headerSize;
+ ZSTD_format_e format;
+ const BYTE* litPtr;
+ ZSTD_customMem customMem;
+ size_t litSize;
+ size_t rleSize;
+ size_t staticSize;
+ int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+
+ /* dictionary */
+ ZSTD_DDict* ddictLocal;
+ const ZSTD_DDict* ddict; /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
+ U32 dictID;
+ int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
+ ZSTD_dictUses_e dictUses;
+
+ /* streaming */
+ ZSTD_dStreamStage streamStage;
+ char* inBuff;
+ size_t inBuffSize;
+ size_t inPos;
+ size_t maxWindowSize;
+ char* outBuff;
+ size_t outBuffSize;
+ size_t outStart;
+ size_t outEnd;
+ size_t lhSize;
+ void* legacyContext;
+ U32 previousLegacyVersion;
+ U32 legacyVersion;
+ U32 hostageByte;
+ int noForwardProgress;
+
+ /* workspace */
+ BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH];
+ BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
+ }; /* typedef'd to ZSTD_DCtx within "zstd.h" */
+
+
+ /*-*******************************************************
+ * Shared internal functions
+ *********************************************************/
+
+ /*! ZSTD_loadDEntropy() :
+ * dict : must point at beginning of a valid zstd dictionary.
+ * @return : size of entropy tables read */
+ size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
+ const void* const dict, size_t const dictSize);
+
+ /*! ZSTD_checkContinuity() :
+ * check if next `dst` follows previous position, where decompression ended.
+ * If yes, do nothing (continue on current segment).
+ * If not, classify previous segment as "external dictionary", and start a new segment.
+ * This function cannot fail. */
+ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst);
+
+
+ #endif /* ZSTD_DECOMPRESS_INTERNAL_H */
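
For orientation: a decoded sequence field is reconstructed as the symbol's `baseValue` plus `nbAdditionalBits` raw bits read from the bitstream, which is exactly what the `OF_base[]` / `OF_bits[]` pair above encodes. A tiny standalone illustration (values copied from those tables; not part of the diff):

```c
#include <stdio.h>

int main(void)
{
    /* Offset code 10, values taken from the tables above:
     * OF_base[10] = 0x3FD and OF_bits[10] = 10.  The decoder reads 10 raw
     * bits from the bitstream and adds them to the base value. */
    const unsigned base      = 0x3FD;  /* OF_base[10] */
    const unsigned nbBits    = 10;     /* OF_bits[10] */
    const unsigned extraBits = 5;      /* example value of those 10 raw bits */
    printf("offset value = %u (base 0x%X + %u extra bits)\n",
           base + extraBits, base, nbBits);
    return 0;
}
```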

data/contrib/zstd/lib/dictBuilder/cover.c
@@ -29,6 +29,7 @@
  #include "mem.h" /* read */
  #include "pool.h"
  #include "threading.h"
+ #include "cover.h"
  #include "zstd_internal.h" /* includes zstd.h */
  #ifndef ZDICT_STATIC_LINKING_ONLY
  #define ZDICT_STATIC_LINKING_ONLY
@@ -38,7 +39,8 @@
  /*-*************************************
  * Constants
  ***************************************/
- #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+ #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
+ #define DEFAULT_SPLITPOINT 1.0

  /*-*************************************
  * Console display
@@ -184,7 +186,7 @@ static void COVER_map_remove(COVER_map_t *map, U32 key) {
  }

  /**
- * Destroyes a map that is inited with COVER_map_init().
+ * Destroys a map that is inited with COVER_map_init().
  */
  static void COVER_map_destroy(COVER_map_t *map) {
  if (map->data) {
@@ -203,6 +205,8 @@ typedef struct {
  size_t *offsets;
  const size_t *samplesSizes;
  size_t nbSamples;
+ size_t nbTrainSamples;
+ size_t nbTestSamples;
  U32 *suffix;
  size_t suffixSize;
  U32 *freqs;
@@ -220,9 +224,9 @@ static COVER_ctx_t *g_ctx = NULL;
  /**
  * Returns the sum of the sample sizes.
  */
- static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
+ size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
  size_t sum = 0;
- size_t i;
+ unsigned i;
  for (i = 0; i < nbSamples; ++i) {
  sum += samplesSizes[i];
  }
@@ -377,14 +381,6 @@ static void COVER_group(COVER_ctx_t *ctx, const void *group,
  ctx->suffix[dmerId] = freq;
  }

- /**
- * A segment is a range in the source as well as the score of the segment.
- */
- typedef struct {
- U32 begin;
- U32 end;
- U32 score;
- } COVER_segment_t;

  /**
  * Selects the best segment in an epoch.
@@ -395,7 +391,7 @@ typedef struct {
  *
  * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
  *
- * Once the dmer d is in the dictionay we set F(d) = 0.
+ * Once the dmer d is in the dictionary we set F(d) = 0.
  */
  static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
  COVER_map_t *activeDmers, U32 begin,
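
The score defined in the comment above is the sum of the frequencies of the dmers a candidate segment covers. A naive restatement of that formula (illustration only; the real COVER_selectSegment also uses the activeDmers map so that a dmer repeated inside the segment contributes its frequency only once):

```c
/* Naive Score(S) = F(S_1) + ... + F(S_{k-d+1}): sum the frequency of the dmer
 * found at every position of the candidate segment.  freqs[] plays the role
 * of F and is zeroed once a dmer enters the dictionary, so content that is
 * already covered stops attracting new segments. */
static unsigned segment_score(const unsigned* freqs,       /* frequency per dmer id */
                              const unsigned* dmerAt,      /* dmer id at each position */
                              unsigned begin, unsigned end /* segment positions [begin, end) */)
{
    unsigned score = 0, pos;
    for (pos = begin; pos < end; ++pos)
        score += freqs[dmerAt[pos]];
    return score;
}
```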
@@ -439,7 +435,7 @@ static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
  U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
  activeSegment.begin += 1;
  *delDmerOcc -= 1;
- /* If this is the last occurence of the dmer, subtract its score */
+ /* If this is the last occurrence of the dmer, subtract its score */
  if (*delDmerOcc == 0) {
  COVER_map_remove(activeDmers, delDmer);
  activeSegment.score -= freqs[delDmer];
@@ -494,6 +490,10 @@ static int COVER_checkParameters(ZDICT_cover_params_t parameters,
  if (parameters.d > parameters.k) {
  return 0;
  }
+ /* 0 < splitPoint <= 1 */
+ if (parameters.splitPoint <= 0 || parameters.splitPoint > 1){
+ return 0;
+ }
  return 1;
  }

@@ -531,25 +531,44 @@ static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
  */
  static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
  const size_t *samplesSizes, unsigned nbSamples,
- unsigned d) {
+ unsigned d, double splitPoint) {
  const BYTE *const samples = (const BYTE *)samplesBuffer;
  const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+ /* Split samples into testing and training sets */
+ const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
+ const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
+ const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
+ const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
  /* Checks */
  if (totalSamplesSize < MAX(d, sizeof(U64)) ||
  totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
- DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n",
- (COVER_MAX_SAMPLES_SIZE >> 20));
+ DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+ (unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
+ return 0;
+ }
+ /* Check if there are at least 5 training samples */
+ if (nbTrainSamples < 5) {
+ DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
+ return 0;
+ }
+ /* Check if there's testing sample */
+ if (nbTestSamples < 1) {
+ DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
  return 0;
  }
  /* Zero the context */
  memset(ctx, 0, sizeof(*ctx));
- DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbSamples,
- (U32)totalSamplesSize);
+ DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
+ (unsigned)trainingSamplesSize);
+ DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
+ (unsigned)testSamplesSize);
  ctx->samples = samples;
  ctx->samplesSizes = samplesSizes;
  ctx->nbSamples = nbSamples;
+ ctx->nbTrainSamples = nbTrainSamples;
+ ctx->nbTestSamples = nbTestSamples;
  /* Partial suffix array */
- ctx->suffixSize = totalSamplesSize - MAX(d, sizeof(U64)) + 1;
+ ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
  ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  /* Maps index to the dmerID */
  ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
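
The splitPoint handling above reserves the leading fraction of the samples for training and holds the rest out for scoring; splitPoint == 1.0 reproduces the old behaviour of using every sample for both. The arithmetic, restated on its own (a sketch only, mirroring the ternaries in the hunk):

```c
/* Sketch of the train/test split used by COVER_ctx_init above.
 * Example: 1000 samples with splitPoint = 0.8 -> 800 training, 200 testing. */
static void cover_split(unsigned nbSamples, double splitPoint,
                        unsigned* nbTrain, unsigned* nbTest)
{
    if (splitPoint < 1.0) {
        *nbTrain = (unsigned)((double)nbSamples * splitPoint);
        *nbTest  = nbSamples - *nbTrain;
    } else {
        *nbTrain = nbSamples;  /* legacy behaviour: same set trains and tests */
        *nbTest  = nbSamples;
    }
}
```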
@@ -563,7 +582,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
  ctx->freqs = NULL;
  ctx->d = d;

- /* Fill offsets from the samlesSizes */
+ /* Fill offsets from the samplesSizes */
  {
  U32 i;
  ctx->offsets[0] = 0;
@@ -581,10 +600,17 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
  for (i = 0; i < ctx->suffixSize; ++i) {
  ctx->suffix[i] = i;
  }
- /* qsort doesn't take an opaque pointer, so pass as a global */
+ /* qsort doesn't take an opaque pointer, so pass as a global.
+ * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is.
+ */
  g_ctx = ctx;
+ #if defined(__OpenBSD__)
+ mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),
+ (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
+ #else
  qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
  (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
+ #endif
  }
  DISPLAYLEVEL(2, "Computing frequencies\n");
  /* For each dmer group (group of positions with the same first d bytes):
@@ -601,6 +627,39 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
  return 1;
  }

+ void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
+ {
+ const double ratio = (double)nbDmers / maxDictSize;
+ if (ratio >= 10) {
+ return;
+ }
+ LOCALDISPLAYLEVEL(displayLevel, 1,
+ "WARNING: The maximum dictionary size %u is too large "
+ "compared to the source size %u! "
+ "size(source)/size(dictionary) = %f, but it should be >= "
+ "10! This may lead to a subpar dictionary! We recommend "
+ "training on sources at least 10x, and up to 100x the "
+ "size of the dictionary!\n", (U32)maxDictSize,
+ (U32)nbDmers, ratio);
+ }
+
+ COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
+ U32 nbDmers, U32 k, U32 passes)
+ {
+ const U32 minEpochSize = k * 10;
+ COVER_epoch_info_t epochs;
+ epochs.num = MAX(1, maxDictSize / k / passes);
+ epochs.size = nbDmers / epochs.num;
+ if (epochs.size >= minEpochSize) {
+ assert(epochs.size * epochs.num <= nbDmers);
+ return epochs;
+ }
+ epochs.size = MIN(minEpochSize, nbDmers);
+ epochs.num = nbDmers / epochs.size;
+ assert(epochs.size * epochs.num <= nbDmers);
+ return epochs;
+ }
+
  /**
  * Given the prepared context build the dictionary.
  */
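
COVER_computeEpochs above targets roughly maxDictSize / k / passes epochs while forcing each epoch to span at least k * 10 dmers. A small standalone demo of the same arithmetic (the numbers are chosen purely for illustration):

```c
#include <stdio.h>

/* Re-statement of the epoch sizing above, for a worked example only. */
static void demo_epochs(unsigned maxDictSize, unsigned nbDmers, unsigned k, unsigned passes)
{
    const unsigned minEpochSize = k * 10;
    unsigned num = maxDictSize / k / passes;
    unsigned size;
    if (num < 1) num = 1;
    size = nbDmers / num;
    if (size < minEpochSize) {                 /* tiny corpus: clamp the epoch size */
        size = minEpochSize < nbDmers ? minEpochSize : nbDmers;
        num  = nbDmers / size;
    }
    printf("%u epochs of %u dmers\n", num, size);
}

int main(void)
{
    demo_epochs(112640, 1000000, 1024, 4);  /* 110 KB dict -> 27 epochs of 37037 dmers */
    demo_epochs(112640,   50000, 1024, 4);  /* tiny corpus -> 4 epochs of 10240 dmers  */
    return 0;
}
```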
@@ -610,28 +669,34 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
  ZDICT_cover_params_t parameters) {
  BYTE *const dict = (BYTE *)dictBuffer;
  size_t tail = dictBufferCapacity;
- /* Divide the data up into epochs of equal size.
- * We will select at least one segment from each epoch.
- */
- const U32 epochs = (U32)(dictBufferCapacity / parameters.k);
- const U32 epochSize = (U32)(ctx->suffixSize / epochs);
+ /* Divide the data into epochs. We will select one segment from each epoch. */
+ const COVER_epoch_info_t epochs = COVER_computeEpochs(
+ (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
+ const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
+ size_t zeroScoreRun = 0;
  size_t epoch;
- DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
- epochSize);
+ DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
+ (U32)epochs.num, (U32)epochs.size);
  /* Loop through the epochs until there are no more segments or the dictionary
  * is full.
  */
- for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
- const U32 epochBegin = (U32)(epoch * epochSize);
- const U32 epochEnd = epochBegin + epochSize;
+ for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
+ const U32 epochBegin = (U32)(epoch * epochs.size);
+ const U32 epochEnd = epochBegin + epochs.size;
  size_t segmentSize;
  /* Select a segment */
  COVER_segment_t segment = COVER_selectSegment(
  ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
- /* If the segment covers no dmers, then we are out of content */
+ /* If the segment covers no dmers, then we are out of content.
+ * There may be new content in other epochs, for continue for some time.
+ */
  if (segment.score == 0) {
- break;
+ if (++zeroScoreRun >= maxZeroScoreRun) {
+ break;
+ }
+ continue;
  }
+ zeroScoreRun = 0;
  /* Trim the segment if necessary and if it is too small then we are done */
  segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
  if (segmentSize < parameters.d) {
@@ -644,19 +709,23 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
  memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
  DISPLAYUPDATE(
  2, "\r%u%% ",
- (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+ (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
  }
  DISPLAYLEVEL(2, "\r%79s\r", "");
  return tail;
  }

  ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
- void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
- const size_t *samplesSizes, unsigned nbSamples,
- ZDICT_cover_params_t parameters) {
- BYTE *const dict = (BYTE *)dictBuffer;
+ void *dictBuffer, size_t dictBufferCapacity,
+ const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+ ZDICT_cover_params_t parameters)
+ {
+ BYTE* const dict = (BYTE*)dictBuffer;
  COVER_ctx_t ctx;
  COVER_map_t activeDmers;
+ parameters.splitPoint = 1.0;
+ /* Initialize global data */
+ g_displayLevel = parameters.zParams.notificationLevel;
  /* Checks */
  if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
  DISPLAYLEVEL(1, "Cover parameters incorrect\n");
@@ -671,13 +740,12 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
  ZDICT_DICTSIZE_MIN);
  return ERROR(dstSize_tooSmall);
  }
- /* Initialize global data */
- g_displayLevel = parameters.zParams.notificationLevel;
  /* Initialize context and activeDmers */
  if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
- parameters.d)) {
+ parameters.d, parameters.splitPoint)) {
  return ERROR(GENERIC);
  }
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
  DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
  COVER_ctx_destroy(&ctx);
@@ -694,7 +762,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
  samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
  if (!ZSTD_isError(dictionarySize)) {
  DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
- (U32)dictionarySize);
+ (unsigned)dictionarySize);
  }
  COVER_ctx_destroy(&ctx);
  COVER_map_destroy(&activeDmers);
@@ -702,28 +770,65 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
  }
  }

- /**
- * COVER_best_t is used for two purposes:
- * 1. Synchronizing threads.
- * 2. Saving the best parameters and dictionary.
- *
- * All of the methods except COVER_best_init() are thread safe if zstd is
- * compiled with multithreaded support.
- */
- typedef struct COVER_best_s {
- ZSTD_pthread_mutex_t mutex;
- ZSTD_pthread_cond_t cond;
- size_t liveJobs;
- void *dict;
- size_t dictSize;
- ZDICT_cover_params_t parameters;
- size_t compressedSize;
- } COVER_best_t;
+
+
+ size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
+ const size_t *samplesSizes, const BYTE *samples,
+ size_t *offsets,
+ size_t nbTrainSamples, size_t nbSamples,
+ BYTE *const dict, size_t dictBufferCapacity) {
+ size_t totalCompressedSize = ERROR(GENERIC);
+ /* Pointers */
+ ZSTD_CCtx *cctx;
+ ZSTD_CDict *cdict;
+ void *dst;
+ /* Local variables */
+ size_t dstCapacity;
+ size_t i;
+ /* Allocate dst with enough space to compress the maximum sized sample */
+ {
+ size_t maxSampleSize = 0;
+ i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
+ for (; i < nbSamples; ++i) {
+ maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
+ }
+ dstCapacity = ZSTD_compressBound(maxSampleSize);
+ dst = malloc(dstCapacity);
+ }
+ /* Create the cctx and cdict */
+ cctx = ZSTD_createCCtx();
+ cdict = ZSTD_createCDict(dict, dictBufferCapacity,
+ parameters.zParams.compressionLevel);
+ if (!dst || !cctx || !cdict) {
+ goto _compressCleanup;
+ }
+ /* Compress each sample and sum their sizes (or error) */
+ totalCompressedSize = dictBufferCapacity;
+ i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
+ for (; i < nbSamples; ++i) {
+ const size_t size = ZSTD_compress_usingCDict(
+ cctx, dst, dstCapacity, samples + offsets[i],
+ samplesSizes[i], cdict);
+ if (ZSTD_isError(size)) {
+ totalCompressedSize = ERROR(GENERIC);
+ goto _compressCleanup;
+ }
+ totalCompressedSize += size;
+ }
+ _compressCleanup:
+ ZSTD_freeCCtx(cctx);
+ ZSTD_freeCDict(cdict);
+ if (dst) {
+ free(dst);
+ }
+ return totalCompressedSize;
+ }
+

  /**
  * Initialize the `COVER_best_t`.
  */
- static void COVER_best_init(COVER_best_t *best) {
+ void COVER_best_init(COVER_best_t *best) {
  if (best==NULL) return; /* compatible with init on NULL */
  (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
  (void)ZSTD_pthread_cond_init(&best->cond, NULL);
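
COVER_checkTotalCompressedSize, factored out above, scores a candidate dictionary by compressing the held-out samples with it and summing the results. The same idea can be expressed with only public API calls; a hedged sketch (buffer sizing and error handling are simplified, and the 128 KB sample cap is an assumption of the sketch, not of the library):

```c
#include <zstd.h>
#include <stdlib.h>

/* Sketch: total compressed size of the held-out samples under a candidate
 * dictionary, using only public API (smaller total => better dictionary). */
static size_t score_dictionary(const void* dict, size_t dictSize,
                               const unsigned char* samples, const size_t* sampleSizes,
                               const size_t* offsets, size_t nbTest, int level)
{
    size_t total = (size_t)-1;                     /* (size_t)-1 signals failure here */
    size_t const cap = ZSTD_compressBound(128 * 1024);
    void* const dst = malloc(cap);
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    ZSTD_CDict* const cdict = ZSTD_createCDict(dict, dictSize, level);
    if (dst && cctx && cdict) {
        size_t i;
        total = 0;
        for (i = 0; i < nbTest; ++i) {
            size_t const r = ZSTD_compress_usingCDict(cctx, dst, cap,
                                                      samples + offsets[i],
                                                      sampleSizes[i], cdict);
            if (ZSTD_isError(r)) { total = (size_t)-1; break; }
            total += r;
        }
    }
    free(dst);
    ZSTD_freeCDict(cdict);
    ZSTD_freeCCtx(cctx);
    return total;
}
```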
@@ -737,7 +842,7 @@ static void COVER_best_init(COVER_best_t *best) {
  /**
  * Wait until liveJobs == 0.
  */
- static void COVER_best_wait(COVER_best_t *best) {
+ void COVER_best_wait(COVER_best_t *best) {
  if (!best) {
  return;
  }
@@ -751,7 +856,7 @@ static void COVER_best_wait(COVER_best_t *best) {
  /**
  * Call COVER_best_wait() and then destroy the COVER_best_t.
  */
- static void COVER_best_destroy(COVER_best_t *best) {
+ void COVER_best_destroy(COVER_best_t *best) {
  if (!best) {
  return;
  }
@@ -767,7 +872,7 @@ static void COVER_best_destroy(COVER_best_t *best) {
  * Called when a thread is about to be launched.
  * Increments liveJobs.
  */
- static void COVER_best_start(COVER_best_t *best) {
+ void COVER_best_start(COVER_best_t *best) {
  if (!best) {
  return;
  }
@@ -781,7 +886,7 @@ static void COVER_best_start(COVER_best_t *best) {
  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
  * If this dictionary is the best so far save it and its parameters.
  */
- static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
+ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
  ZDICT_cover_params_t parameters, void *dict,
  size_t dictSize) {
  if (!best) {
@@ -803,6 +908,8 @@ static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
  if (!best->dict) {
  best->compressedSize = ERROR(GENERIC);
  best->dictSize = 0;
+ ZSTD_pthread_cond_signal(&best->cond);
+ ZSTD_pthread_mutex_unlock(&best->mutex);
  return;
  }
  }
@@ -812,10 +919,10 @@ static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
  best->parameters = parameters;
  best->compressedSize = compressedSize;
  }
- ZSTD_pthread_mutex_unlock(&best->mutex);
  if (liveJobs == 0) {
  ZSTD_pthread_cond_broadcast(&best->cond);
  }
+ ZSTD_pthread_mutex_unlock(&best->mutex);
  }
  }

@@ -830,7 +937,7 @@ typedef struct COVER_tryParameters_data_s {
  } COVER_tryParameters_data_t;

  /**
- * Tries a set of parameters and upates the COVER_best_t with the results.
+ * Tries a set of parameters and updates the COVER_best_t with the results.
  * This function is thread safe if zstd is compiled with multithreaded support.
  * It takes its parameters as an *OWNING* opaque pointer to support threading.
  */
@@ -861,7 +968,7 @@ static void COVER_tryParameters(void *opaque) {
  dictBufferCapacity, parameters);
  dictBufferCapacity = ZDICT_finalizeDictionary(
  dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
- ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples,
+ ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples,
  parameters.zParams);
  if (ZDICT_isError(dictBufferCapacity)) {
  DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
@@ -869,49 +976,10 @@ static void COVER_tryParameters(void *opaque) {
  }
  }
  /* Check total compressed size */
- {
- /* Pointers */
- ZSTD_CCtx *cctx;
- ZSTD_CDict *cdict;
- void *dst;
- /* Local variables */
- size_t dstCapacity;
- size_t i;
- /* Allocate dst with enough space to compress the maximum sized sample */
- {
- size_t maxSampleSize = 0;
- for (i = 0; i < ctx->nbSamples; ++i) {
- maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
- }
- dstCapacity = ZSTD_compressBound(maxSampleSize);
- dst = malloc(dstCapacity);
- }
- /* Create the cctx and cdict */
- cctx = ZSTD_createCCtx();
- cdict = ZSTD_createCDict(dict, dictBufferCapacity,
- parameters.zParams.compressionLevel);
- if (!dst || !cctx || !cdict) {
- goto _compressCleanup;
- }
- /* Compress each sample and sum their sizes (or error) */
- totalCompressedSize = dictBufferCapacity;
- for (i = 0; i < ctx->nbSamples; ++i) {
- const size_t size = ZSTD_compress_usingCDict(
- cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
- ctx->samplesSizes[i], cdict);
- if (ZSTD_isError(size)) {
- totalCompressedSize = ERROR(GENERIC);
- goto _compressCleanup;
- }
- totalCompressedSize += size;
- }
- _compressCleanup:
- ZSTD_freeCCtx(cctx);
- ZSTD_freeCDict(cdict);
- if (dst) {
- free(dst);
- }
- }
+ totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
+ ctx->samples, ctx->offsets,
+ ctx->nbTrainSamples, ctx->nbSamples,
+ dict, dictBufferCapacity);

  _cleanup:
  COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
@@ -932,6 +1000,8 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
  ZDICT_cover_params_t *parameters) {
  /* constants */
  const unsigned nbThreads = parameters->nbThreads;
+ const double splitPoint =
+ parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
  const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
  const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
  const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
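
The new splitPoint knob flows down from ZDICT_optimizeTrainFromBuffer_cover; a value of 0 (or less) falls back to the DEFAULT_SPLITPOINT of 1.0 defined above, i.e. no hold-out set. A hedged usage sketch, assuming the cover API and the ZDICT_cover_params_t fields exposed behind ZDICT_STATIC_LINKING_ONLY in the bundled zdict.h:

```c
#define ZDICT_STATIC_LINKING_ONLY   /* cover API lives behind this macro in zdict.h */
#include <zdict.h>
#include <string.h>

/* Sketch: 80% of the samples train each candidate dictionary, 20% are held
 * out to score it.  Returns the dictionary size, or an error code. */
size_t train_with_holdout(void* dictBuf, size_t dictCap,
                          const void* samples, const size_t* sampleSizes,
                          unsigned nbSamples)
{
    ZDICT_cover_params_t params;
    memset(&params, 0, sizeof(params));
    params.d = 8;            /* dmer size; 0 would let the optimizer try 6 and 8 */
    params.k = 0;            /* 0: let the optimizer search k */
    params.steps = 40;       /* number of k values to try */
    params.splitPoint = 0.8; /* new in this release; <= 0 falls back to the default */
    return ZDICT_optimizeTrainFromBuffer_cover(dictBuf, dictCap,
                                               samples, sampleSizes, nbSamples,
                                               &params);
}
```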
@@ -947,7 +1017,13 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
  unsigned k;
  COVER_best_t best;
  POOL_ctx *pool = NULL;
+ int warned = 0;
+
  /* Checks */
+ if (splitPoint <= 0 || splitPoint > 1) {
+ LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
+ return ERROR(GENERIC);
+ }
  if (kMinK < kMaxD || kMaxK < kMinK) {
  LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
  return ERROR(GENERIC);
@@ -978,12 +1054,16 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
  /* Initialize the context for this value of d */
  COVER_ctx_t ctx;
  LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
- if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) {
+ if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint)) {
  LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
  COVER_best_destroy(&best);
  POOL_free(pool);
  return ERROR(GENERIC);
  }
+ if (!warned) {
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
+ warned = 1;
+ }
  /* Loop through k reusing the same context */
  for (k = kMinK; k <= kMaxK; k += kStepSize) {
  /* Prepare the arguments */
@@ -1003,6 +1083,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
  data->parameters = *parameters;
  data->parameters.k = k;
  data->parameters.d = d;
+ data->parameters.splitPoint = splitPoint;
  data->parameters.steps = kSteps;
  data->parameters.zParams.notificationLevel = g_displayLevel;
  /* Check the parameters */
@@ -1020,7 +1101,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
  }
  /* Print status */
  LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",
- (U32)((iteration * 100) / kIterations));
+ (unsigned)((iteration * 100) / kIterations));
  ++iteration;
  }
  COVER_best_wait(&best);