extzstd 0.1 → 0.3.2

Files changed (134)
  1. checksums.yaml +5 -5
  2. data/HISTORY.ja.md +39 -0
  3. data/README.md +38 -56
  4. data/contrib/zstd/CHANGELOG +613 -0
  5. data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
  6. data/contrib/zstd/CONTRIBUTING.md +406 -0
  7. data/contrib/zstd/COPYING +339 -0
  8. data/contrib/zstd/Makefile +420 -0
  9. data/contrib/zstd/README.md +179 -41
  10. data/contrib/zstd/TESTING.md +44 -0
  11. data/contrib/zstd/appveyor.yml +292 -0
  12. data/contrib/zstd/lib/BUCK +234 -0
  13. data/contrib/zstd/lib/Makefile +451 -0
  14. data/contrib/zstd/lib/README.md +207 -0
  15. data/contrib/zstd/{common → lib/common}/bitstream.h +187 -138
  16. data/contrib/zstd/lib/common/compiler.h +288 -0
  17. data/contrib/zstd/lib/common/cpu.h +213 -0
  18. data/contrib/zstd/lib/common/debug.c +24 -0
  19. data/contrib/zstd/lib/common/debug.h +107 -0
  20. data/contrib/zstd/lib/common/entropy_common.c +362 -0
  21. data/contrib/zstd/{common → lib/common}/error_private.c +25 -12
  22. data/contrib/zstd/{common → lib/common}/error_private.h +14 -10
  23. data/contrib/zstd/{common → lib/common}/fse.h +173 -92
  24. data/contrib/zstd/{common → lib/common}/fse_decompress.c +149 -85
  25. data/contrib/zstd/lib/common/huf.h +361 -0
  26. data/contrib/zstd/{common → lib/common}/mem.h +115 -59
  27. data/contrib/zstd/lib/common/pool.c +350 -0
  28. data/contrib/zstd/lib/common/pool.h +84 -0
  29. data/contrib/zstd/lib/common/threading.c +122 -0
  30. data/contrib/zstd/lib/common/threading.h +155 -0
  31. data/contrib/zstd/{common → lib/common}/xxhash.c +55 -96
  32. data/contrib/zstd/{common → lib/common}/xxhash.h +23 -47
  33. data/contrib/zstd/lib/common/zstd_common.c +83 -0
  34. data/contrib/zstd/lib/common/zstd_deps.h +111 -0
  35. data/contrib/zstd/lib/common/zstd_errors.h +95 -0
  36. data/contrib/zstd/lib/common/zstd_internal.h +478 -0
  37. data/contrib/zstd/{compress → lib/compress}/fse_compress.c +214 -319
  38. data/contrib/zstd/lib/compress/hist.c +181 -0
  39. data/contrib/zstd/lib/compress/hist.h +75 -0
  40. data/contrib/zstd/lib/compress/huf_compress.c +913 -0
  41. data/contrib/zstd/lib/compress/zstd_compress.c +5208 -0
  42. data/contrib/zstd/lib/compress/zstd_compress_internal.h +1203 -0
  43. data/contrib/zstd/lib/compress/zstd_compress_literals.c +158 -0
  44. data/contrib/zstd/lib/compress/zstd_compress_literals.h +29 -0
  45. data/contrib/zstd/lib/compress/zstd_compress_sequences.c +433 -0
  46. data/contrib/zstd/lib/compress/zstd_compress_sequences.h +54 -0
  47. data/contrib/zstd/lib/compress/zstd_compress_superblock.c +849 -0
  48. data/contrib/zstd/lib/compress/zstd_compress_superblock.h +32 -0
  49. data/contrib/zstd/lib/compress/zstd_cwksp.h +561 -0
  50. data/contrib/zstd/lib/compress/zstd_double_fast.c +521 -0
  51. data/contrib/zstd/lib/compress/zstd_double_fast.h +38 -0
  52. data/contrib/zstd/lib/compress/zstd_fast.c +496 -0
  53. data/contrib/zstd/lib/compress/zstd_fast.h +37 -0
  54. data/contrib/zstd/lib/compress/zstd_lazy.c +1412 -0
  55. data/contrib/zstd/lib/compress/zstd_lazy.h +87 -0
  56. data/contrib/zstd/lib/compress/zstd_ldm.c +660 -0
  57. data/contrib/zstd/lib/compress/zstd_ldm.h +116 -0
  58. data/contrib/zstd/lib/compress/zstd_opt.c +1345 -0
  59. data/contrib/zstd/lib/compress/zstd_opt.h +56 -0
  60. data/contrib/zstd/lib/compress/zstdmt_compress.c +1811 -0
  61. data/contrib/zstd/lib/compress/zstdmt_compress.h +110 -0
  62. data/contrib/zstd/lib/decompress/huf_decompress.c +1350 -0
  63. data/contrib/zstd/lib/decompress/zstd_ddict.c +244 -0
  64. data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
  65. data/contrib/zstd/lib/decompress/zstd_decompress.c +1930 -0
  66. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1540 -0
  67. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +62 -0
  68. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +190 -0
  69. data/contrib/zstd/{common → lib/deprecated}/zbuff.h +68 -45
  70. data/contrib/zstd/lib/deprecated/zbuff_common.c +26 -0
  71. data/contrib/zstd/lib/deprecated/zbuff_compress.c +147 -0
  72. data/contrib/zstd/lib/deprecated/zbuff_decompress.c +75 -0
  73. data/contrib/zstd/lib/dictBuilder/cover.c +1245 -0
  74. data/contrib/zstd/lib/dictBuilder/cover.h +157 -0
  75. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.c +3 -3
  76. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.h +0 -0
  77. data/contrib/zstd/lib/dictBuilder/fastcover.c +758 -0
  78. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/zdict.c +318 -194
  79. data/contrib/zstd/lib/dictBuilder/zdict.h +305 -0
  80. data/contrib/zstd/{legacy → lib/legacy}/zstd_legacy.h +171 -15
  81. data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.c +191 -124
  82. data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.h +19 -5
  83. data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.c +125 -125
  84. data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.h +19 -5
  85. data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.c +125 -124
  86. data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.h +20 -6
  87. data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.c +151 -299
  88. data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.h +19 -5
  89. data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.c +237 -243
  90. data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.h +19 -6
  91. data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.c +130 -143
  92. data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.h +18 -5
  93. data/contrib/zstd/{legacy → lib/legacy}/zstd_v07.c +158 -157
  94. data/contrib/zstd/{legacy → lib/legacy}/zstd_v07.h +19 -5
  95. data/contrib/zstd/lib/libzstd.pc.in +15 -0
  96. data/contrib/zstd/lib/zstd.h +2391 -0
  97. data/ext/depend +2 -0
  98. data/ext/extconf.rb +15 -6
  99. data/ext/extzstd.c +76 -145
  100. data/ext/extzstd.h +80 -31
  101. data/ext/extzstd_stream.c +417 -142
  102. data/ext/libzstd_conf.h +8 -0
  103. data/ext/zstd_common.c +10 -7
  104. data/ext/zstd_compress.c +14 -5
  105. data/ext/zstd_decompress.c +5 -4
  106. data/ext/zstd_dictbuilder.c +9 -4
  107. data/ext/zstd_dictbuilder_fastcover.c +3 -0
  108. data/ext/zstd_legacy_v01.c +3 -1
  109. data/ext/zstd_legacy_v02.c +3 -1
  110. data/ext/zstd_legacy_v03.c +3 -1
  111. data/ext/zstd_legacy_v04.c +3 -1
  112. data/ext/zstd_legacy_v05.c +3 -1
  113. data/ext/zstd_legacy_v06.c +3 -1
  114. data/ext/zstd_legacy_v07.c +3 -1
  115. data/gemstub.rb +10 -24
  116. data/lib/extzstd.rb +64 -179
  117. data/lib/extzstd/version.rb +6 -1
  118. data/test/test_basic.rb +9 -6
  119. metadata +113 -57
  120. data/HISTORY.ja +0 -5
  121. data/contrib/zstd/common/entropy_common.c +0 -225
  122. data/contrib/zstd/common/huf.h +0 -228
  123. data/contrib/zstd/common/zstd_common.c +0 -83
  124. data/contrib/zstd/common/zstd_errors.h +0 -60
  125. data/contrib/zstd/common/zstd_internal.h +0 -267
  126. data/contrib/zstd/compress/huf_compress.c +0 -533
  127. data/contrib/zstd/compress/zbuff_compress.c +0 -319
  128. data/contrib/zstd/compress/zstd_compress.c +0 -3264
  129. data/contrib/zstd/compress/zstd_opt.h +0 -900
  130. data/contrib/zstd/decompress/huf_decompress.c +0 -883
  131. data/contrib/zstd/decompress/zbuff_decompress.c +0 -252
  132. data/contrib/zstd/decompress/zstd_decompress.c +0 -1842
  133. data/contrib/zstd/dictBuilder/zdict.h +0 -111
  134. data/contrib/zstd/zstd.h +0 -640

data/contrib/zstd/lib/dictBuilder/cover.h
@@ -0,0 +1,157 @@
+ /*
+ * Copyright (c) 2017-2020, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ #include <stdio.h> /* fprintf */
+ #include <stdlib.h> /* malloc, free, qsort */
+ #include <string.h> /* memset */
+ #include <time.h> /* clock */
+ #include "../common/mem.h" /* read */
+ #include "../common/pool.h"
+ #include "../common/threading.h"
+ #include "../common/zstd_internal.h" /* includes zstd.h */
+ #ifndef ZDICT_STATIC_LINKING_ONLY
+ #define ZDICT_STATIC_LINKING_ONLY
+ #endif
+ #include "zdict.h"
+
+ /**
+ * COVER_best_t is used for two purposes:
+ * 1. Synchronizing threads.
+ * 2. Saving the best parameters and dictionary.
+ *
+ * All of the methods except COVER_best_init() are thread safe if zstd is
+ * compiled with multithreaded support.
+ */
+ typedef struct COVER_best_s {
+ ZSTD_pthread_mutex_t mutex;
+ ZSTD_pthread_cond_t cond;
+ size_t liveJobs;
+ void *dict;
+ size_t dictSize;
+ ZDICT_cover_params_t parameters;
+ size_t compressedSize;
+ } COVER_best_t;
+
+ /**
+ * A segment is a range in the source as well as the score of the segment.
+ */
+ typedef struct {
+ U32 begin;
+ U32 end;
+ U32 score;
+ } COVER_segment_t;
+
+ /**
+ * Number of epochs and size of each epoch.
+ */
+ typedef struct {
+ U32 num;
+ U32 size;
+ } COVER_epoch_info_t;
+
+ /**
+ * Struct used for the dictionary selection function.
+ */
+ typedef struct COVER_dictSelection {
+ BYTE* dictContent;
+ size_t dictSize;
+ size_t totalCompressedSize;
+ } COVER_dictSelection_t;
+
+ /**
+ * Computes the number of epochs and the size of each epoch.
+ * We will make sure that each epoch gets at least 10 * k bytes.
+ *
+ * The COVER algorithms divide the data up into epochs of equal size and
+ * select one segment from each epoch.
+ *
+ * @param maxDictSize The maximum allowed dictionary size.
+ * @param nbDmers The number of dmers we are training on.
+ * @param k The parameter k (segment size).
+ * @param passes The target number of passes over the dmer corpus.
+ * More passes means a better dictionary.
+ */
+ COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
+ U32 k, U32 passes);
+
+ /**
+ * Warns the user when their corpus is too small.
+ */
+ void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
+
+ /**
+ * Checks total compressed size of a dictionary
+ */
+ size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
+ const size_t *samplesSizes, const BYTE *samples,
+ size_t *offsets,
+ size_t nbTrainSamples, size_t nbSamples,
+ BYTE *const dict, size_t dictBufferCapacity);
+
+ /**
+ * Returns the sum of the sample sizes.
+ */
+ size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples);
+
+ /**
+ * Initialize the `COVER_best_t`.
+ */
+ void COVER_best_init(COVER_best_t *best);
+
+ /**
+ * Wait until liveJobs == 0.
+ */
+ void COVER_best_wait(COVER_best_t *best);
+
+ /**
+ * Call COVER_best_wait() and then destroy the COVER_best_t.
+ */
+ void COVER_best_destroy(COVER_best_t *best);
+
+ /**
+ * Called when a thread is about to be launched.
+ * Increments liveJobs.
+ */
+ void COVER_best_start(COVER_best_t *best);
+
+ /**
+ * Called when a thread finishes executing, on both error and success.
+ * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
+ * If this dictionary is the best so far save it and its parameters.
+ */
+ void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
+ COVER_dictSelection_t selection);
+ /**
+ * Error function for COVER_selectDict function. Checks if the return
+ * value is an error.
+ */
+ unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
+
+ /**
+ * Error function for COVER_selectDict function. Returns a struct where
+ * return.totalCompressedSize is a ZSTD error.
+ */
+ COVER_dictSelection_t COVER_dictSelectionError(size_t error);
+
+ /**
+ * Always call after COVER_selectDict() to free up the memory used by the
+ * newly created dictionary.
+ */
+ void COVER_dictSelectionFree(COVER_dictSelection_t selection);
+
+ /**
+ * Called to finalize the dictionary and select one based on whether or not
+ * the shrink-dict flag was enabled. If enabled the dictionary used is the
+ * smallest dictionary within a specified regression of the compressed size
+ * from the largest dictionary.
+ */
+ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
+ size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
+ size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
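
The header above defines the thread-coordination protocol shared by the COVER and fastcover trainers: a single COVER_best_t guards the best candidate, each worker is bracketed by COVER_best_start() / COVER_best_finish(), and the driver blocks in COVER_best_wait() before reading the winner. The sketch below mirrors the call order used by ZDICT_optimizeTrainFromBuffer_fastCover() later in this diff; it assumes compilation inside lib/dictBuilder (so "cover.h" and its dependencies resolve), and the function names and placeholder worker body are illustrative only, not part of the library.

    /* Sketch of the COVER_best_t lifecycle (hypothetical driver, internal API). */
    #include "cover.h"   /* pulls in zstd_internal.h, so ERROR() is available */

    static void tryOneCandidate(COVER_best_t *best, ZDICT_cover_params_t params)
    {
        /* A real worker builds and scores a candidate dictionary here; this
         * placeholder just reports "no result" via an error selection. */
        COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
        COVER_best_finish(best, params, selection);  /* decrements liveJobs, keeps the best */
        COVER_dictSelectionFree(selection);          /* safe on an error selection */
    }

    void runCandidates(ZDICT_cover_params_t params)
    {
        COVER_best_t best;
        COVER_best_init(&best);          /* the one call that is not thread safe */
        COVER_best_start(&best);         /* increment liveJobs before launching a job */
        tryOneCandidate(&best, params);  /* normally queued on a POOL_ctx worker */
        COVER_best_wait(&best);          /* block until liveJobs == 0 */
        /* best.dict, best.parameters and best.compressedSize now hold the winner */
        COVER_best_destroy(&best);       /* waits again, then releases resources */
    }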

data/contrib/zstd/lib/dictBuilder/divsufsort.c
@@ -1637,7 +1637,7 @@ construct_SA(const unsigned char *T, int *SA,
   if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
   k = SA + BUCKET_B(c2 = c0, c1);
   }
- assert(k < j);
+ assert(k < j); assert(k != NULL);
  *k-- = s;
  } else {
  assert(((s == 0) && (T[s] == c1)) || (s < 0));
@@ -1701,7 +1701,7 @@ construct_BWT(const unsigned char *T, int *SA,
   if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
   k = SA + BUCKET_B(c2 = c0, c1);
   }
- assert(k < j);
+ assert(k < j); assert(k != NULL);
  *k-- = s;
  } else if(s != 0) {
  *j = ~s;
@@ -1785,7 +1785,7 @@ construct_BWT_indexes(const unsigned char *T, int *SA,
   if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
   k = SA + BUCKET_B(c2 = c0, c1);
   }
- assert(k < j);
+ assert(k < j); assert(k != NULL);
  *k-- = s;
  } else if(s != 0) {
  *j = ~s;

data/contrib/zstd/lib/dictBuilder/fastcover.c
@@ -0,0 +1,758 @@
+ /*
+ * Copyright (c) 2018-2020, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*-*************************************
+ * Dependencies
+ ***************************************/
+ #include <stdio.h> /* fprintf */
+ #include <stdlib.h> /* malloc, free, qsort */
+ #include <string.h> /* memset */
+ #include <time.h> /* clock */
+
+ #include "../common/mem.h" /* read */
+ #include "../common/pool.h"
+ #include "../common/threading.h"
+ #include "cover.h"
+ #include "../common/zstd_internal.h" /* includes zstd.h */
+ #include "../compress/zstd_compress_internal.h" /* ZSTD_hash*() */
+ #ifndef ZDICT_STATIC_LINKING_ONLY
+ #define ZDICT_STATIC_LINKING_ONLY
+ #endif
+ #include "zdict.h"
+
+
+ /*-*************************************
+ * Constants
+ ***************************************/
+ #define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
+ #define FASTCOVER_MAX_F 31
+ #define FASTCOVER_MAX_ACCEL 10
+ #define FASTCOVER_DEFAULT_SPLITPOINT 0.75
+ #define DEFAULT_F 20
+ #define DEFAULT_ACCEL 1
+
+
+ /*-*************************************
+ * Console display
+ ***************************************/
+ #ifndef LOCALDISPLAYLEVEL
+ static int g_displayLevel = 2;
+ #endif
+ #undef DISPLAY
+ #define DISPLAY(...) \
+ { \
+ fprintf(stderr, __VA_ARGS__); \
+ fflush(stderr); \
+ }
+ #undef LOCALDISPLAYLEVEL
+ #define LOCALDISPLAYLEVEL(displayLevel, l, ...) \
+ if (displayLevel >= l) { \
+ DISPLAY(__VA_ARGS__); \
+ } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
+ #undef DISPLAYLEVEL
+ #define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
+
+ #ifndef LOCALDISPLAYUPDATE
+ static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100;
+ static clock_t g_time = 0;
+ #endif
+ #undef LOCALDISPLAYUPDATE
+ #define LOCALDISPLAYUPDATE(displayLevel, l, ...) \
+ if (displayLevel >= l) { \
+ if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) { \
+ g_time = clock(); \
+ DISPLAY(__VA_ARGS__); \
+ } \
+ }
+ #undef DISPLAYUPDATE
+ #define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
+
+
+ /*-*************************************
+ * Hash Functions
+ ***************************************/
+ /**
+ * Hash the d-byte value pointed to by p and mod 2^f into the frequency vector
+ */
+ static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 f, unsigned d) {
+ if (d == 6) {
+ return ZSTD_hash6Ptr(p, f);
+ }
+ return ZSTD_hash8Ptr(p, f);
+ }
+
+
+ /*-*************************************
+ * Acceleration
+ ***************************************/
+ typedef struct {
+ unsigned finalize; /* Percentage of training samples used for ZDICT_finalizeDictionary */
+ unsigned skip; /* Number of dmers skipped between each dmer counted in computeFrequency */
+ } FASTCOVER_accel_t;
+
+
+ static const FASTCOVER_accel_t FASTCOVER_defaultAccelParameters[FASTCOVER_MAX_ACCEL+1] = {
+ { 100, 0 }, /* accel = 0, should not happen because accel = 0 defaults to accel = 1 */
+ { 100, 0 }, /* accel = 1 */
+ { 50, 1 }, /* accel = 2 */
+ { 34, 2 }, /* accel = 3 */
+ { 25, 3 }, /* accel = 4 */
+ { 20, 4 }, /* accel = 5 */
+ { 17, 5 }, /* accel = 6 */
+ { 14, 6 }, /* accel = 7 */
+ { 13, 7 }, /* accel = 8 */
+ { 11, 8 }, /* accel = 9 */
+ { 10, 9 }, /* accel = 10 */
+ };
+
+
+ /*-*************************************
+ * Context
+ ***************************************/
+ typedef struct {
+ const BYTE *samples;
+ size_t *offsets;
+ const size_t *samplesSizes;
+ size_t nbSamples;
+ size_t nbTrainSamples;
+ size_t nbTestSamples;
+ size_t nbDmers;
+ U32 *freqs;
+ unsigned d;
+ unsigned f;
+ FASTCOVER_accel_t accelParams;
+ } FASTCOVER_ctx_t;
+
+
+ /*-*************************************
+ * Helper functions
+ ***************************************/
+ /**
+ * Selects the best segment in an epoch.
+ * Segments are scored according to the function:
+ *
+ * Let F(d) be the frequency of all dmers with hash value d.
+ * Let S_i be the hash value of the dmer at position i of segment S, which has length k.
+ *
+ * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
+ *
+ * Once the dmer with hash value d is in the dictionary we set F(d) = 0.
+ */
+ static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
+ U32 *freqs, U32 begin, U32 end,
+ ZDICT_cover_params_t parameters,
+ U16* segmentFreqs) {
+ /* Constants */
+ const U32 k = parameters.k;
+ const U32 d = parameters.d;
+ const U32 f = ctx->f;
+ const U32 dmersInK = k - d + 1;
+
+ /* Try each segment (activeSegment) and save the best (bestSegment) */
+ COVER_segment_t bestSegment = {0, 0, 0};
+ COVER_segment_t activeSegment;
+
+ /* Reset the activeDmers in the segment */
+ /* The activeSegment starts at the beginning of the epoch. */
+ activeSegment.begin = begin;
+ activeSegment.end = begin;
+ activeSegment.score = 0;
+
+ /* Slide the activeSegment through the whole epoch.
+ * Save the best segment in bestSegment.
+ */
+ while (activeSegment.end < end) {
+ /* Get hash value of current dmer */
+ const size_t idx = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
+
+ /* Add frequency of this index to score if this is the first occurrence of index in active segment */
+ if (segmentFreqs[idx] == 0) {
+ activeSegment.score += freqs[idx];
+ }
+ /* Increment end of segment and segmentFreqs */
+ activeSegment.end += 1;
+ segmentFreqs[idx] += 1;
+ /* If the window is now too large, drop the first position */
+ if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
+ /* Get hash value of the dmer to be eliminated from active segment */
+ const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
+ segmentFreqs[delIndex] -= 1;
+ /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */
+ if (segmentFreqs[delIndex] == 0) {
+ activeSegment.score -= freqs[delIndex];
+ }
+ /* Increment start of segment */
+ activeSegment.begin += 1;
+ }
+
+ /* If this segment is the best so far save it */
+ if (activeSegment.score > bestSegment.score) {
+ bestSegment = activeSegment;
+ }
+ }
+
+ /* Zero out rest of segmentFreqs array */
+ while (activeSegment.begin < end) {
+ const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
+ segmentFreqs[delIndex] -= 1;
+ activeSegment.begin += 1;
+ }
+
+ {
+ /* Zero the frequency of hash value of each dmer covered by the chosen segment. */
+ U32 pos;
+ for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+ const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, f, d);
+ freqs[i] = 0;
+ }
+ }
+
+ return bestSegment;
+ }
+
+
+ static int FASTCOVER_checkParameters(ZDICT_cover_params_t parameters,
+ size_t maxDictSize, unsigned f,
+ unsigned accel) {
+ /* k, d, and f are required parameters */
+ if (parameters.d == 0 || parameters.k == 0) {
+ return 0;
+ }
+ /* d has to be 6 or 8 */
+ if (parameters.d != 6 && parameters.d != 8) {
+ return 0;
+ }
+ /* k <= maxDictSize */
+ if (parameters.k > maxDictSize) {
+ return 0;
+ }
+ /* d <= k */
+ if (parameters.d > parameters.k) {
+ return 0;
+ }
+ /* 0 < f <= FASTCOVER_MAX_F */
+ if (f > FASTCOVER_MAX_F || f == 0) {
+ return 0;
+ }
+ /* 0 < splitPoint <= 1 */
+ if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) {
+ return 0;
+ }
+ /* 0 < accel <= 10 */
+ if (accel > 10 || accel == 0) {
+ return 0;
+ }
+ return 1;
+ }
+
+
+ /**
+ * Clean up a context initialized with `FASTCOVER_ctx_init()`.
+ */
+ static void
+ FASTCOVER_ctx_destroy(FASTCOVER_ctx_t* ctx)
+ {
+ if (!ctx) return;
+
+ free(ctx->freqs);
+ ctx->freqs = NULL;
+
+ free(ctx->offsets);
+ ctx->offsets = NULL;
+ }
+
+
+ /**
+ * Calculate the frequency of the hash value of each dmer in ctx->samples
+ */
+ static void
+ FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx)
+ {
+ const unsigned f = ctx->f;
+ const unsigned d = ctx->d;
+ const unsigned skip = ctx->accelParams.skip;
+ const unsigned readLength = MAX(d, 8);
+ size_t i;
+ assert(ctx->nbTrainSamples >= 5);
+ assert(ctx->nbTrainSamples <= ctx->nbSamples);
+ for (i = 0; i < ctx->nbTrainSamples; i++) {
+ size_t start = ctx->offsets[i]; /* start of current dmer */
+ size_t const currSampleEnd = ctx->offsets[i+1];
+ while (start + readLength <= currSampleEnd) {
+ const size_t dmerIndex = FASTCOVER_hashPtrToIndex(ctx->samples + start, f, d);
+ freqs[dmerIndex]++;
+ start = start + skip + 1;
+ }
+ }
+ }
+
+
+ /**
+ * Prepare a context for dictionary building.
+ * The context is only dependent on the parameter `d` and can be used multiple
+ * times.
+ * Returns 0 on success or error code on error.
+ * The context must be destroyed with `FASTCOVER_ctx_destroy()`.
+ */
+ static size_t
+ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
+ const void* samplesBuffer,
+ const size_t* samplesSizes, unsigned nbSamples,
+ unsigned d, double splitPoint, unsigned f,
+ FASTCOVER_accel_t accelParams)
+ {
+ const BYTE* const samples = (const BYTE*)samplesBuffer;
+ const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+ /* Split samples into testing and training sets */
+ const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
+ const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
+ const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
+ const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
+
+ /* Checks */
+ if (totalSamplesSize < MAX(d, sizeof(U64)) ||
+ totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
+ DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+ (unsigned)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
+ return ERROR(srcSize_wrong);
+ }
+
+ /* Check if there are at least 5 training samples */
+ if (nbTrainSamples < 5) {
+ DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
+ return ERROR(srcSize_wrong);
+ }
+
+ /* Check if there's a testing sample */
+ if (nbTestSamples < 1) {
+ DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
+ return ERROR(srcSize_wrong);
+ }
+
+ /* Zero the context */
+ memset(ctx, 0, sizeof(*ctx));
+ DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
+ (unsigned)trainingSamplesSize);
+ DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
+ (unsigned)testSamplesSize);
+
+ ctx->samples = samples;
+ ctx->samplesSizes = samplesSizes;
+ ctx->nbSamples = nbSamples;
+ ctx->nbTrainSamples = nbTrainSamples;
+ ctx->nbTestSamples = nbTestSamples;
+ ctx->nbDmers = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
+ ctx->d = d;
+ ctx->f = f;
+ ctx->accelParams = accelParams;
+
+ /* The offsets of each file */
+ ctx->offsets = (size_t*)calloc((nbSamples + 1), sizeof(size_t));
+ if (ctx->offsets == NULL) {
+ DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n");
+ FASTCOVER_ctx_destroy(ctx);
+ return ERROR(memory_allocation);
+ }
+
+ /* Fill offsets from the samplesSizes */
+ { U32 i;
+ ctx->offsets[0] = 0;
+ assert(nbSamples >= 5);
+ for (i = 1; i <= nbSamples; ++i) {
+ ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
+ }
+ }
+
+ /* Initialize frequency array of size 2^f */
+ ctx->freqs = (U32*)calloc(((U64)1 << f), sizeof(U32));
+ if (ctx->freqs == NULL) {
+ DISPLAYLEVEL(1, "Failed to allocate frequency table \n");
+ FASTCOVER_ctx_destroy(ctx);
+ return ERROR(memory_allocation);
+ }
+
+ DISPLAYLEVEL(2, "Computing frequencies\n");
+ FASTCOVER_computeFrequency(ctx->freqs, ctx);
+
+ return 0;
+ }
+
+
+ /**
+ * Given the prepared context, build the dictionary.
+ */
+ static size_t
+ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
+ U32* freqs,
+ void* dictBuffer, size_t dictBufferCapacity,
+ ZDICT_cover_params_t parameters,
+ U16* segmentFreqs)
+ {
+ BYTE *const dict = (BYTE *)dictBuffer;
+ size_t tail = dictBufferCapacity;
+ /* Divide the data into epochs. We will select one segment from each epoch. */
+ const COVER_epoch_info_t epochs = COVER_computeEpochs(
+ (U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1);
+ const size_t maxZeroScoreRun = 10;
+ size_t zeroScoreRun = 0;
+ size_t epoch;
+ DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
+ (U32)epochs.num, (U32)epochs.size);
+ /* Loop through the epochs until there are no more segments or the dictionary
+ * is full.
+ */
+ for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
+ const U32 epochBegin = (U32)(epoch * epochs.size);
+ const U32 epochEnd = epochBegin + epochs.size;
+ size_t segmentSize;
+ /* Select a segment */
+ COVER_segment_t segment = FASTCOVER_selectSegment(
+ ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
+
+ /* If the segment covers no dmers, then we are out of content.
+ * There may be new content in other epochs, so continue for some time.
+ */
+ if (segment.score == 0) {
+ if (++zeroScoreRun >= maxZeroScoreRun) {
+ break;
+ }
+ continue;
+ }
+ zeroScoreRun = 0;
+
+ /* Trim the segment if necessary and if it is too small then we are done */
+ segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
+ if (segmentSize < parameters.d) {
+ break;
+ }
+
+ /* We fill the dictionary from the back to allow the best segments to be
+ * referenced with the smallest offsets.
+ */
+ tail -= segmentSize;
+ memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+ DISPLAYUPDATE(
+ 2, "\r%u%% ",
+ (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+ }
+ DISPLAYLEVEL(2, "\r%79s\r", "");
+ return tail;
+ }
+
+ /**
+ * Parameters for FASTCOVER_tryParameters().
+ */
+ typedef struct FASTCOVER_tryParameters_data_s {
+ const FASTCOVER_ctx_t* ctx;
+ COVER_best_t* best;
+ size_t dictBufferCapacity;
+ ZDICT_cover_params_t parameters;
+ } FASTCOVER_tryParameters_data_t;
+
+
+ /**
+ * Tries a set of parameters and updates the COVER_best_t with the results.
+ * This function is thread safe if zstd is compiled with multithreaded support.
+ * It takes its parameters as an *OWNING* opaque pointer to support threading.
+ */
+ static void FASTCOVER_tryParameters(void *opaque)
+ {
+ /* Save parameters as local variables */
+ FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t *)opaque;
+ const FASTCOVER_ctx_t *const ctx = data->ctx;
+ const ZDICT_cover_params_t parameters = data->parameters;
+ size_t dictBufferCapacity = data->dictBufferCapacity;
+ size_t totalCompressedSize = ERROR(GENERIC);
+ /* Initialize array to keep track of frequency of dmer within activeSegment */
+ U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
+ /* Allocate space for hash table, dict, and freqs */
+ BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
+ COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
+ U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
+ if (!segmentFreqs || !dict || !freqs) {
+ DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
+ goto _cleanup;
+ }
+ /* Copy the frequencies because we need to modify them */
+ memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
+ /* Build the dictionary */
+ { const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
+ parameters, segmentFreqs);
+
+ const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
+ selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail,
+ ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
+ totalCompressedSize);
+
+ if (COVER_dictSelectionIsError(selection)) {
+ DISPLAYLEVEL(1, "Failed to select dictionary\n");
+ goto _cleanup;
+ }
+ }
+ _cleanup:
+ free(dict);
+ COVER_best_finish(data->best, parameters, selection);
+ free(data);
+ free(segmentFreqs);
+ COVER_dictSelectionFree(selection);
+ free(freqs);
+ }
+
+
+ static void
+ FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
+ ZDICT_cover_params_t* coverParams)
+ {
+ coverParams->k = fastCoverParams.k;
+ coverParams->d = fastCoverParams.d;
+ coverParams->steps = fastCoverParams.steps;
+ coverParams->nbThreads = fastCoverParams.nbThreads;
+ coverParams->splitPoint = fastCoverParams.splitPoint;
+ coverParams->zParams = fastCoverParams.zParams;
+ coverParams->shrinkDict = fastCoverParams.shrinkDict;
+ }
+
+
+ static void
+ FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
+ ZDICT_fastCover_params_t* fastCoverParams,
+ unsigned f, unsigned accel)
+ {
+ fastCoverParams->k = coverParams.k;
+ fastCoverParams->d = coverParams.d;
+ fastCoverParams->steps = coverParams.steps;
+ fastCoverParams->nbThreads = coverParams.nbThreads;
+ fastCoverParams->splitPoint = coverParams.splitPoint;
+ fastCoverParams->f = f;
+ fastCoverParams->accel = accel;
+ fastCoverParams->zParams = coverParams.zParams;
+ fastCoverParams->shrinkDict = coverParams.shrinkDict;
+ }
+
+
+ ZDICTLIB_API size_t
+ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
+ const void* samplesBuffer,
+ const size_t* samplesSizes, unsigned nbSamples,
+ ZDICT_fastCover_params_t parameters)
+ {
+ BYTE* const dict = (BYTE*)dictBuffer;
+ FASTCOVER_ctx_t ctx;
+ ZDICT_cover_params_t coverParams;
+ FASTCOVER_accel_t accelParams;
+ /* Initialize global data */
+ g_displayLevel = parameters.zParams.notificationLevel;
+ /* Assign splitPoint and f if not provided */
+ parameters.splitPoint = 1.0;
+ parameters.f = parameters.f == 0 ? DEFAULT_F : parameters.f;
+ parameters.accel = parameters.accel == 0 ? DEFAULT_ACCEL : parameters.accel;
+ /* Convert to cover parameter */
+ memset(&coverParams, 0 , sizeof(coverParams));
+ FASTCOVER_convertToCoverParams(parameters, &coverParams);
+ /* Checks */
+ if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f,
+ parameters.accel)) {
+ DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
+ return ERROR(parameter_outOfBound);
+ }
+ if (nbSamples == 0) {
+ DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
+ return ERROR(srcSize_wrong);
+ }
+ if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+ DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+ ZDICT_DICTSIZE_MIN);
+ return ERROR(dstSize_tooSmall);
+ }
+ /* Assign corresponding FASTCOVER_accel_t to accelParams */
+ accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];
+ /* Initialize context */
+ {
+ size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
+ coverParams.d, parameters.splitPoint, parameters.f,
+ accelParams);
+ if (ZSTD_isError(initVal)) {
+ DISPLAYLEVEL(1, "Failed to initialize context\n");
+ return initVal;
+ }
+ }
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, g_displayLevel);
+ /* Build the dictionary */
+ DISPLAYLEVEL(2, "Building dictionary\n");
+ {
+ /* Initialize array to keep track of frequency of dmer within activeSegment */
+ U16* segmentFreqs = (U16 *)calloc(((U64)1 << parameters.f), sizeof(U16));
+ const size_t tail = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer,
+ dictBufferCapacity, coverParams, segmentFreqs);
+ const unsigned nbFinalizeSamples = (unsigned)(ctx.nbTrainSamples * ctx.accelParams.finalize / 100);
+ const size_t dictionarySize = ZDICT_finalizeDictionary(
+ dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+ samplesBuffer, samplesSizes, nbFinalizeSamples, coverParams.zParams);
+ if (!ZSTD_isError(dictionarySize)) {
+ DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
+ (unsigned)dictionarySize);
+ }
+ FASTCOVER_ctx_destroy(&ctx);
+ free(segmentFreqs);
+ return dictionarySize;
+ }
+ }
+
+
+ ZDICTLIB_API size_t
+ ZDICT_optimizeTrainFromBuffer_fastCover(
+ void* dictBuffer, size_t dictBufferCapacity,
+ const void* samplesBuffer,
+ const size_t* samplesSizes, unsigned nbSamples,
+ ZDICT_fastCover_params_t* parameters)
+ {
+ ZDICT_cover_params_t coverParams;
+ FASTCOVER_accel_t accelParams;
+ /* constants */
+ const unsigned nbThreads = parameters->nbThreads;
+ const double splitPoint =
+ parameters->splitPoint <= 0.0 ? FASTCOVER_DEFAULT_SPLITPOINT : parameters->splitPoint;
+ const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
+ const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
+ const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
+ const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
+ const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
+ const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
+ const unsigned kIterations =
+ (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
+ const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
+ const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
+ const unsigned shrinkDict = 0;
+ /* Local variables */
+ const int displayLevel = parameters->zParams.notificationLevel;
+ unsigned iteration = 1;
+ unsigned d;
+ unsigned k;
+ COVER_best_t best;
+ POOL_ctx *pool = NULL;
+ int warned = 0;
+ /* Checks */
+ if (splitPoint <= 0 || splitPoint > 1) {
+ LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
+ return ERROR(parameter_outOfBound);
+ }
+ if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
+ LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
+ return ERROR(parameter_outOfBound);
+ }
+ if (kMinK < kMaxD || kMaxK < kMinK) {
+ LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
+ return ERROR(parameter_outOfBound);
+ }
+ if (nbSamples == 0) {
+ LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
+ return ERROR(srcSize_wrong);
+ }
+ if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+ LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
+ ZDICT_DICTSIZE_MIN);
+ return ERROR(dstSize_tooSmall);
+ }
+ if (nbThreads > 1) {
+ pool = POOL_create(nbThreads, 1);
+ if (!pool) {
+ return ERROR(memory_allocation);
+ }
+ }
+ /* Initialization */
+ COVER_best_init(&best);
+ memset(&coverParams, 0 , sizeof(coverParams));
+ FASTCOVER_convertToCoverParams(*parameters, &coverParams);
+ accelParams = FASTCOVER_defaultAccelParameters[accel];
+ /* Turn down global display level to clean up display at level 2 and below */
+ g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
+ /* Loop through d first because each new value needs a new context */
+ LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
+ kIterations);
+ for (d = kMinD; d <= kMaxD; d += 2) {
+ /* Initialize the context for this value of d */
+ FASTCOVER_ctx_t ctx;
+ LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
+ {
+ size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams);
+ if (ZSTD_isError(initVal)) {
+ LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
+ COVER_best_destroy(&best);
+ POOL_free(pool);
+ return initVal;
+ }
+ }
+ if (!warned) {
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel);
+ warned = 1;
+ }
+ /* Loop through k reusing the same context */
+ for (k = kMinK; k <= kMaxK; k += kStepSize) {
+ /* Prepare the arguments */
+ FASTCOVER_tryParameters_data_t *data = (FASTCOVER_tryParameters_data_t *)malloc(
+ sizeof(FASTCOVER_tryParameters_data_t));
+ LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
+ if (!data) {
+ LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
+ COVER_best_destroy(&best);
+ FASTCOVER_ctx_destroy(&ctx);
+ POOL_free(pool);
+ return ERROR(memory_allocation);
+ }
+ data->ctx = &ctx;
+ data->best = &best;
+ data->dictBufferCapacity = dictBufferCapacity;
+ data->parameters = coverParams;
+ data->parameters.k = k;
+ data->parameters.d = d;
+ data->parameters.splitPoint = splitPoint;
+ data->parameters.steps = kSteps;
+ data->parameters.shrinkDict = shrinkDict;
+ data->parameters.zParams.notificationLevel = g_displayLevel;
+ /* Check the parameters */
+ if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
+ data->ctx->f, accel)) {
+ DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
+ free(data);
+ continue;
+ }
+ /* Call the function and pass ownership of data to it */
+ COVER_best_start(&best);
+ if (pool) {
+ POOL_add(pool, &FASTCOVER_tryParameters, data);
+ } else {
+ FASTCOVER_tryParameters(data);
+ }
+ /* Print status */
+ LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",
+ (unsigned)((iteration * 100) / kIterations));
+ ++iteration;
+ }
+ COVER_best_wait(&best);
+ FASTCOVER_ctx_destroy(&ctx);
+ }
+ LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
+ /* Fill the output buffer and parameters with output of the best parameters */
+ {
+ const size_t dictSize = best.dictSize;
+ if (ZSTD_isError(best.compressedSize)) {
+ const size_t compressedSize = best.compressedSize;
+ COVER_best_destroy(&best);
+ POOL_free(pool);
+ return compressedSize;
+ }
+ FASTCOVER_convertToFastCoverParams(best.parameters, parameters, f, accel);
+ memcpy(dictBuffer, best.dict, dictSize);
+ COVER_best_destroy(&best);
+ POOL_free(pool);
+ return dictSize;
+ }
+
+ }
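
This new file exposes two public entry points: ZDICT_trainFromBuffer_fastCover() trains once with caller-supplied k and d, while ZDICT_optimizeTrainFromBuffer_fastCover() sweeps k and d (on a POOL_ctx thread pool when nbThreads > 1) and keeps the best candidate through COVER_best_t. Below is a minimal sketch of calling the non-optimizing entry point, assuming the program links against the bundled libzstd with the experimental dictionary-builder API exposed (hence ZDICT_STATIC_LINKING_ONLY). The toy corpus and parameter values are placeholders; a corpus this small will usually be rejected as too small, so the point is the call shape, not the output.

    /* Hypothetical caller of the fastcover trainer added in this update. */
    #define ZDICT_STATIC_LINKING_ONLY    /* ZDICT_fastCover_params_t is an experimental API */
    #include <stdio.h>
    #include <string.h>
    #include "zdict.h"

    int main(void)
    {
        /* Samples are concatenated back to back; sizes[] records each length. */
        const char *texts[5] = {
            "GET /api/v1/users HTTP/1.1\r\nHost: example.com\r\n\r\n",
            "GET /api/v1/items HTTP/1.1\r\nHost: example.com\r\n\r\n",
            "GET /api/v1/carts HTTP/1.1\r\nHost: example.com\r\n\r\n",
            "GET /api/v1/stats HTTP/1.1\r\nHost: example.com\r\n\r\n",
            "GET /api/v1/index HTTP/1.1\r\nHost: example.com\r\n\r\n",
        };
        char samples[4096];
        size_t sizes[5];
        size_t pos = 0;
        unsigned i;
        for (i = 0; i < 5; i++) {          /* fastcover requires at least 5 training samples */
            sizes[i] = strlen(texts[i]);
            memcpy(samples + pos, texts[i], sizes[i]);
            pos += sizes[i];
        }

        {   char dict[1024];               /* capacity must be >= ZDICT_DICTSIZE_MIN */
            ZDICT_fastCover_params_t params;
            size_t dictSize;
            memset(&params, 0, sizeof(params));
            params.k = 64;                 /* segment size: required, <= dict capacity */
            params.d = 8;                  /* dmer size: must be 6 or 8 */
            /* f and accel left at 0, so DEFAULT_F (20) and DEFAULT_ACCEL (1) apply */

            dictSize = ZDICT_trainFromBuffer_fastCover(dict, sizeof(dict),
                                                       samples, sizes, 5, params);
            if (ZDICT_isError(dictSize)) {
                fprintf(stderr, "training failed: %s\n", ZDICT_getErrorName(dictSize));
                return 1;
            }
            printf("trained a %u-byte dictionary\n", (unsigned)dictSize);
        }
        return 0;
    }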