extzstd 0.0.3.CONCEPT → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. checksums.yaml +5 -5
  2. data/HISTORY.ja.md +39 -0
  3. data/LICENSE +6 -6
  4. data/README.md +26 -45
  5. data/contrib/zstd/CHANGELOG +555 -0
  6. data/contrib/zstd/CODE_OF_CONDUCT.md +5 -0
  7. data/contrib/zstd/CONTRIBUTING.md +392 -0
  8. data/contrib/zstd/COPYING +339 -0
  9. data/contrib/zstd/LICENSE +13 -9
  10. data/contrib/zstd/Makefile +414 -0
  11. data/contrib/zstd/README.md +170 -45
  12. data/contrib/zstd/TESTING.md +44 -0
  13. data/contrib/zstd/appveyor.yml +289 -0
  14. data/contrib/zstd/lib/BUCK +234 -0
  15. data/contrib/zstd/lib/Makefile +354 -0
  16. data/contrib/zstd/lib/README.md +179 -0
  17. data/contrib/zstd/{common → lib/common}/bitstream.h +170 -130
  18. data/contrib/zstd/lib/common/compiler.h +175 -0
  19. data/contrib/zstd/lib/common/cpu.h +215 -0
  20. data/contrib/zstd/lib/common/debug.c +24 -0
  21. data/contrib/zstd/lib/common/debug.h +114 -0
  22. data/contrib/zstd/{common → lib/common}/entropy_common.c +79 -94
  23. data/contrib/zstd/lib/common/error_private.c +55 -0
  24. data/contrib/zstd/lib/common/error_private.h +80 -0
  25. data/contrib/zstd/{common → lib/common}/fse.h +153 -93
  26. data/contrib/zstd/{common → lib/common}/fse_decompress.c +37 -82
  27. data/contrib/zstd/lib/common/huf.h +340 -0
  28. data/contrib/zstd/{common → lib/common}/mem.h +154 -78
  29. data/contrib/zstd/lib/common/pool.c +344 -0
  30. data/contrib/zstd/lib/common/pool.h +84 -0
  31. data/contrib/zstd/lib/common/threading.c +121 -0
  32. data/contrib/zstd/lib/common/threading.h +155 -0
  33. data/contrib/zstd/{common → lib/common}/xxhash.c +85 -75
  34. data/contrib/zstd/{common → lib/common}/xxhash.h +85 -73
  35. data/contrib/zstd/lib/common/zstd_common.c +83 -0
  36. data/contrib/zstd/lib/common/zstd_errors.h +94 -0
  37. data/contrib/zstd/lib/common/zstd_internal.h +447 -0
  38. data/contrib/zstd/{compress → lib/compress}/fse_compress.c +194 -303
  39. data/contrib/zstd/lib/compress/hist.c +183 -0
  40. data/contrib/zstd/lib/compress/hist.h +75 -0
  41. data/contrib/zstd/lib/compress/huf_compress.c +798 -0
  42. data/contrib/zstd/lib/compress/zstd_compress.c +4278 -0
  43. data/contrib/zstd/lib/compress/zstd_compress_internal.h +1125 -0
  44. data/contrib/zstd/lib/compress/zstd_compress_literals.c +158 -0
  45. data/contrib/zstd/lib/compress/zstd_compress_literals.h +29 -0
  46. data/contrib/zstd/lib/compress/zstd_compress_sequences.c +419 -0
  47. data/contrib/zstd/lib/compress/zstd_compress_sequences.h +54 -0
  48. data/contrib/zstd/lib/compress/zstd_compress_superblock.c +845 -0
  49. data/contrib/zstd/lib/compress/zstd_compress_superblock.h +32 -0
  50. data/contrib/zstd/lib/compress/zstd_cwksp.h +525 -0
  51. data/contrib/zstd/lib/compress/zstd_double_fast.c +521 -0
  52. data/contrib/zstd/lib/compress/zstd_double_fast.h +38 -0
  53. data/contrib/zstd/lib/compress/zstd_fast.c +496 -0
  54. data/contrib/zstd/lib/compress/zstd_fast.h +37 -0
  55. data/contrib/zstd/lib/compress/zstd_lazy.c +1138 -0
  56. data/contrib/zstd/lib/compress/zstd_lazy.h +67 -0
  57. data/contrib/zstd/lib/compress/zstd_ldm.c +619 -0
  58. data/contrib/zstd/lib/compress/zstd_ldm.h +110 -0
  59. data/contrib/zstd/lib/compress/zstd_opt.c +1200 -0
  60. data/contrib/zstd/lib/compress/zstd_opt.h +56 -0
  61. data/contrib/zstd/lib/compress/zstdmt_compress.c +2143 -0
  62. data/contrib/zstd/lib/compress/zstdmt_compress.h +192 -0
  63. data/contrib/zstd/lib/decompress/huf_decompress.c +1248 -0
  64. data/contrib/zstd/lib/decompress/zstd_ddict.c +244 -0
  65. data/contrib/zstd/lib/decompress/zstd_ddict.h +44 -0
  66. data/contrib/zstd/lib/decompress/zstd_decompress.c +1885 -0
  67. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1432 -0
  68. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +59 -0
  69. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +189 -0
  70. data/contrib/zstd/{common → lib/deprecated}/zbuff.h +86 -69
  71. data/contrib/zstd/lib/deprecated/zbuff_common.c +26 -0
  72. data/contrib/zstd/lib/deprecated/zbuff_compress.c +147 -0
  73. data/contrib/zstd/lib/deprecated/zbuff_decompress.c +75 -0
  74. data/contrib/zstd/lib/dictBuilder/cover.c +1236 -0
  75. data/contrib/zstd/lib/dictBuilder/cover.h +157 -0
  76. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.c +3 -3
  77. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/divsufsort.h +5 -5
  78. data/contrib/zstd/lib/dictBuilder/fastcover.c +757 -0
  79. data/contrib/zstd/{dictBuilder → lib/dictBuilder}/zdict.c +437 -347
  80. data/contrib/zstd/lib/dictBuilder/zdict.h +305 -0
  81. data/contrib/zstd/lib/legacy/zstd_legacy.h +415 -0
  82. data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.c +272 -292
  83. data/contrib/zstd/{legacy → lib/legacy}/zstd_v01.h +26 -32
  84. data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.c +162 -392
  85. data/contrib/zstd/{legacy → lib/legacy}/zstd_v02.h +26 -32
  86. data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.c +162 -391
  87. data/contrib/zstd/{legacy → lib/legacy}/zstd_v03.h +27 -33
  88. data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.c +195 -604
  89. data/contrib/zstd/{legacy → lib/legacy}/zstd_v04.h +26 -32
  90. data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.c +300 -575
  91. data/contrib/zstd/{legacy → lib/legacy}/zstd_v05.h +22 -31
  92. data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.c +165 -592
  93. data/contrib/zstd/{legacy → lib/legacy}/zstd_v06.h +54 -67
  94. data/contrib/zstd/lib/legacy/zstd_v07.c +4541 -0
  95. data/contrib/zstd/lib/legacy/zstd_v07.h +187 -0
  96. data/contrib/zstd/lib/libzstd.pc.in +15 -0
  97. data/contrib/zstd/lib/zstd.h +2090 -0
  98. data/ext/depend +2 -0
  99. data/ext/extconf.rb +18 -5
  100. data/ext/extzstd.c +296 -214
  101. data/ext/extzstd.h +81 -36
  102. data/ext/extzstd_nogvls.h +0 -117
  103. data/ext/extzstd_stream.c +622 -0
  104. data/ext/libzstd_conf.h +8 -0
  105. data/ext/zstd_common.c +11 -0
  106. data/ext/zstd_compress.c +15 -0
  107. data/ext/zstd_decompress.c +6 -0
  108. data/ext/zstd_dictbuilder.c +10 -0
  109. data/ext/zstd_dictbuilder_fastcover.c +3 -0
  110. data/ext/zstd_legacy_v01.c +3 -1
  111. data/ext/zstd_legacy_v02.c +3 -1
  112. data/ext/zstd_legacy_v03.c +3 -1
  113. data/ext/zstd_legacy_v04.c +3 -1
  114. data/ext/zstd_legacy_v05.c +3 -1
  115. data/ext/zstd_legacy_v06.c +3 -1
  116. data/ext/zstd_legacy_v07.c +3 -0
  117. data/gemstub.rb +27 -21
  118. data/lib/extzstd.rb +82 -161
  119. data/lib/extzstd/version.rb +1 -1
  120. data/test/test_basic.rb +19 -6
  121. metadata +127 -59
  122. data/contrib/zstd/common/error_private.h +0 -125
  123. data/contrib/zstd/common/error_public.h +0 -77
  124. data/contrib/zstd/common/huf.h +0 -228
  125. data/contrib/zstd/common/zstd.h +0 -475
  126. data/contrib/zstd/common/zstd_common.c +0 -91
  127. data/contrib/zstd/common/zstd_internal.h +0 -238
  128. data/contrib/zstd/compress/huf_compress.c +0 -577
  129. data/contrib/zstd/compress/zbuff_compress.c +0 -327
  130. data/contrib/zstd/compress/zstd_compress.c +0 -3074
  131. data/contrib/zstd/compress/zstd_opt.h +0 -1046
  132. data/contrib/zstd/decompress/huf_decompress.c +0 -894
  133. data/contrib/zstd/decompress/zbuff_decompress.c +0 -294
  134. data/contrib/zstd/decompress/zstd_decompress.c +0 -1362
  135. data/contrib/zstd/dictBuilder/zdict.h +0 -113
  136. data/contrib/zstd/legacy/zstd_legacy.h +0 -140
  137. data/ext/extzstd_buffered.c +0 -265
  138. data/ext/zstd_amalgam.c +0 -18
@@ -0,0 +1,157 @@
1
+ /*
2
+ * Copyright (c) 2017-2020, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
11
+ #include <stdio.h> /* fprintf */
12
+ #include <stdlib.h> /* malloc, free, qsort */
13
+ #include <string.h> /* memset */
14
+ #include <time.h> /* clock */
15
+ #include "../common/mem.h" /* read */
16
+ #include "../common/pool.h"
17
+ #include "../common/threading.h"
18
+ #include "../common/zstd_internal.h" /* includes zstd.h */
19
+ #ifndef ZDICT_STATIC_LINKING_ONLY
20
+ #define ZDICT_STATIC_LINKING_ONLY
21
+ #endif
22
+ #include "zdict.h"
23
+
24
+ /**
25
+ * COVER_best_t is used for two purposes:
26
+ * 1. Synchronizing threads.
27
+ * 2. Saving the best parameters and dictionary.
28
+ *
29
+ * All of the methods except COVER_best_init() are thread safe if zstd is
30
+ * compiled with multithreaded support.
31
+ */
32
+ typedef struct COVER_best_s {
33
+ ZSTD_pthread_mutex_t mutex;
34
+ ZSTD_pthread_cond_t cond;
35
+ size_t liveJobs;
36
+ void *dict;
37
+ size_t dictSize;
38
+ ZDICT_cover_params_t parameters;
39
+ size_t compressedSize;
40
+ } COVER_best_t;
41
+
42
+ /**
43
+ * A segment is a range in the source as well as the score of the segment.
44
+ */
45
+ typedef struct {
46
+ U32 begin;
47
+ U32 end;
48
+ U32 score;
49
+ } COVER_segment_t;
50
+
51
+ /**
52
+ * Number of epochs and size of each epoch.
53
+ */
54
+ typedef struct {
55
+ U32 num;
56
+ U32 size;
57
+ } COVER_epoch_info_t;
58
+
59
+ /**
60
+ * Struct used for the dictionary selection function.
61
+ */
62
+ typedef struct COVER_dictSelection {
63
+ BYTE* dictContent;
64
+ size_t dictSize;
65
+ size_t totalCompressedSize;
66
+ } COVER_dictSelection_t;
67
+
68
+ /**
69
+ * Computes the number of epochs and the size of each epoch.
70
+ * We will make sure that each epoch gets at least 10 * k bytes.
71
+ *
72
+ * The COVER algorithms divide the data up into epochs of equal size and
73
+ * select one segment from each epoch.
74
+ *
75
+ * @param maxDictSize The maximum allowed dictionary size.
76
+ * @param nbDmers The number of dmers we are training on.
77
+ * @param k The parameter k (segment size).
78
+ * @param passes The target number of passes over the dmer corpus.
79
+ * More passes means a better dictionary.
80
+ */
81
+ COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
82
+ U32 k, U32 passes);
83
+
84
+ /**
85
+ * Warns the user when their corpus is too small.
86
+ */
87
+ void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
88
+
89
+ /**
90
+ * Checks total compressed size of a dictionary
91
+ */
92
+ size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
93
+ const size_t *samplesSizes, const BYTE *samples,
94
+ size_t *offsets,
95
+ size_t nbTrainSamples, size_t nbSamples,
96
+ BYTE *const dict, size_t dictBufferCapacity);
97
+
98
+ /**
99
+ * Returns the sum of the sample sizes.
100
+ */
101
+ size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
102
+
103
+ /**
104
+ * Initialize the `COVER_best_t`.
105
+ */
106
+ void COVER_best_init(COVER_best_t *best);
107
+
108
+ /**
109
+ * Wait until liveJobs == 0.
110
+ */
111
+ void COVER_best_wait(COVER_best_t *best);
112
+
113
+ /**
114
+ * Call COVER_best_wait() and then destroy the COVER_best_t.
115
+ */
116
+ void COVER_best_destroy(COVER_best_t *best);
117
+
118
+ /**
119
+ * Called when a thread is about to be launched.
120
+ * Increments liveJobs.
121
+ */
122
+ void COVER_best_start(COVER_best_t *best);
123
+
124
+ /**
125
+ * Called when a thread finishes executing, whether on error or success.
126
+ * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
127
+ * If this dictionary is the best so far save it and its parameters.
128
+ */
129
+ void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
130
+ COVER_dictSelection_t selection);
131
+ /**
132
+ * Error function for COVER_selectDict function. Checks if the return
133
+ * value is an error.
134
+ */
135
+ unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
136
+
137
+ /**
138
+ * Error function for COVER_selectDict function. Returns a struct where
139
+ * return.totalCompressedSize is a ZSTD error.
140
+ */
141
+ COVER_dictSelection_t COVER_dictSelectionError(size_t error);
142
+
143
+ /**
144
+ * Always call after selectDict is called to free up used memory from
145
+ * newly created dictionary.
146
+ */
147
+ void COVER_dictSelectionFree(COVER_dictSelection_t selection);
148
+
149
+ /**
150
+ * Called to finalize the dictionary and select one based on whether or not
151
+ * the shrink-dict flag was enabled. If enabled the dictionary used is the
152
+ * smallest dictionary within a specified regression of the compressed size
153
+ * from the largest dictionary.
154
+ */
155
+ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
156
+ size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
157
+ size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
@@ -1637,7 +1637,7 @@ construct_SA(const unsigned char *T, int *SA,
1637
1637
  if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
1638
1638
  k = SA + BUCKET_B(c2 = c0, c1);
1639
1639
  }
1640
- assert(k < j);
1640
+ assert(k < j); assert(k != NULL);
1641
1641
  *k-- = s;
1642
1642
  } else {
1643
1643
  assert(((s == 0) && (T[s] == c1)) || (s < 0));
@@ -1701,7 +1701,7 @@ construct_BWT(const unsigned char *T, int *SA,
1701
1701
  if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
1702
1702
  k = SA + BUCKET_B(c2 = c0, c1);
1703
1703
  }
1704
- assert(k < j);
1704
+ assert(k < j); assert(k != NULL);
1705
1705
  *k-- = s;
1706
1706
  } else if(s != 0) {
1707
1707
  *j = ~s;
@@ -1785,7 +1785,7 @@ construct_BWT_indexes(const unsigned char *T, int *SA,
1785
1785
  if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
1786
1786
  k = SA + BUCKET_B(c2 = c0, c1);
1787
1787
  }
1788
- assert(k < j);
1788
+ assert(k < j); assert(k != NULL);
1789
1789
  *k-- = s;
1790
1790
  } else if(s != 0) {
1791
1791
  *j = ~s;
@@ -36,8 +36,8 @@ extern "C" {
36
36
 
37
37
  /**
38
38
  * Constructs the suffix array of a given string.
39
- * @param T[0..n-1] The input string.
40
- * @param SA[0..n-1] The output array of suffixes.
39
+ * @param T [0..n-1] The input string.
40
+ * @param SA [0..n-1] The output array of suffixes.
41
41
  * @param n The length of the given string.
42
42
  * @param openMP enables OpenMP optimization.
43
43
  * @return 0 if no error occurred, -1 or -2 otherwise.
@@ -47,9 +47,9 @@ divsufsort(const unsigned char *T, int *SA, int n, int openMP);
47
47
 
48
48
  /**
49
49
  * Constructs the burrows-wheeler transformed string of a given string.
50
- * @param T[0..n-1] The input string.
51
- * @param U[0..n-1] The output string. (can be T)
52
- * @param A[0..n-1] The temporary array. (can be NULL)
50
+ * @param T [0..n-1] The input string.
51
+ * @param U [0..n-1] The output string. (can be T)
52
+ * @param A [0..n-1] The temporary array. (can be NULL)
53
53
  * @param n The length of the given string.
54
54
  * @param num_indexes The length of secondary indexes array. (can be NULL)
55
55
  * @param indexes The secondary indexes array. (can be NULL)
@@ -0,0 +1,757 @@
1
+ /*
2
+ * Copyright (c) 2018-2020, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
11
+ /*-*************************************
12
+ * Dependencies
13
+ ***************************************/
14
+ #include <stdio.h> /* fprintf */
15
+ #include <stdlib.h> /* malloc, free, qsort */
16
+ #include <string.h> /* memset */
17
+ #include <time.h> /* clock */
18
+
19
+ #include "../common/mem.h" /* read */
20
+ #include "../common/pool.h"
21
+ #include "../common/threading.h"
22
+ #include "cover.h"
23
+ #include "../common/zstd_internal.h" /* includes zstd.h */
24
+ #ifndef ZDICT_STATIC_LINKING_ONLY
25
+ #define ZDICT_STATIC_LINKING_ONLY
26
+ #endif
27
+ #include "zdict.h"
28
+
29
+
30
+ /*-*************************************
31
+ * Constants
32
+ ***************************************/
33
+ #define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
34
+ #define FASTCOVER_MAX_F 31
35
+ #define FASTCOVER_MAX_ACCEL 10
36
+ #define DEFAULT_SPLITPOINT 0.75
37
+ #define DEFAULT_F 20
38
+ #define DEFAULT_ACCEL 1
39
+
40
+
41
+ /*-*************************************
42
+ * Console display
43
+ ***************************************/
/* Verbosity threshold consulted by DISPLAYLEVEL() and DISPLAYUPDATE().
 * 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
static int g_displayLevel = 2;

/* Print a formatted message to stderr and flush it immediately.
 * Wrapped in do { } while (0) so that it behaves as a single statement
 * (safe inside an unbraced if/else). */
#define DISPLAY(...)                                                           \
  do {                                                                         \
    fprintf(stderr, __VA_ARGS__);                                              \
    fflush(stderr);                                                            \
  } while (0)

/* Print the message only when `displayLevel` is at least `l`. */
#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
  do {                                                                         \
    if ((displayLevel) >= (l)) {                                               \
      DISPLAY(__VA_ARGS__);                                                    \
    }                                                                          \
  } while (0)
#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)

/* Rate-limited variant: prints only when more than `refreshRate` clock ticks
 * have elapsed since the last update (tracked in g_time), or unconditionally
 * when `displayLevel` is 4 or more. */
#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
  do {                                                                         \
    if ((displayLevel) >= (l)) {                                               \
      if ((clock() - g_time > refreshRate) || ((displayLevel) >= 4)) {         \
        g_time = clock();                                                      \
        DISPLAY(__VA_ARGS__);                                                  \
      }                                                                        \
    }                                                                          \
  } while (0)
#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)

/* Minimum number of clock ticks between two progress updates (15% of a second). */
static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
/* clock() value recorded at the last progress update. */
static clock_t g_time = 0;
66
+
67
+
68
+ /*-*************************************
69
+ * Hash Functions
70
+ ***************************************/
71
+ static const U64 prime6bytes = 227718039650203ULL;
72
+ static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; }
73
+ static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
74
+
75
+ static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
76
+ static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
77
+ static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
78
+
79
+
80
+ /**
81
+ * Hash the d-byte value pointed to by p and mod 2^f
82
+ */
83
+ static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 h, unsigned d) {
84
+ if (d == 6) {
85
+ return ZSTD_hash6Ptr(p, h) & ((1 << h) - 1);
86
+ }
87
+ return ZSTD_hash8Ptr(p, h) & ((1 << h) - 1);
88
+ }
89
+
90
+
91
+ /*-*************************************
92
+ * Acceleration
93
+ ***************************************/
94
+ typedef struct {
95
+ unsigned finalize; /* Percentage of training samples used for ZDICT_finalizeDictionary */
96
+ unsigned skip; /* Number of dmer skipped between each dmer counted in computeFrequency */
97
+ } FASTCOVER_accel_t;
98
+
99
+
100
+ static const FASTCOVER_accel_t FASTCOVER_defaultAccelParameters[FASTCOVER_MAX_ACCEL+1] = {
101
+ { 100, 0 }, /* accel = 0, should not happen because accel = 0 defaults to accel = 1 */
102
+ { 100, 0 }, /* accel = 1 */
103
+ { 50, 1 }, /* accel = 2 */
104
+ { 34, 2 }, /* accel = 3 */
105
+ { 25, 3 }, /* accel = 4 */
106
+ { 20, 4 }, /* accel = 5 */
107
+ { 17, 5 }, /* accel = 6 */
108
+ { 14, 6 }, /* accel = 7 */
109
+ { 13, 7 }, /* accel = 8 */
110
+ { 11, 8 }, /* accel = 9 */
111
+ { 10, 9 }, /* accel = 10 */
112
+ };
113
+
114
+
115
+ /*-*************************************
116
+ * Context
117
+ ***************************************/
118
+ typedef struct {
119
+ const BYTE *samples;
120
+ size_t *offsets;
121
+ const size_t *samplesSizes;
122
+ size_t nbSamples;
123
+ size_t nbTrainSamples;
124
+ size_t nbTestSamples;
125
+ size_t nbDmers;
126
+ U32 *freqs;
127
+ unsigned d;
128
+ unsigned f;
129
+ FASTCOVER_accel_t accelParams;
130
+ } FASTCOVER_ctx_t;
131
+
132
+
133
+ /*-*************************************
134
+ * Helper functions
135
+ ***************************************/
136
+ /**
137
+ * Selects the best segment in an epoch.
138
+ * Segments of are scored according to the function:
139
+ *
140
+ * Let F(d) be the frequency of all dmers with hash value d.
141
+ * Let S_i be hash value of the dmer at position i of segment S which has length k.
142
+ *
143
+ * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
144
+ *
145
+ * Once the dmer with hash value d is in the dictionary we set F(d) = 0.
146
+ */
147
+ static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
148
+ U32 *freqs, U32 begin, U32 end,
149
+ ZDICT_cover_params_t parameters,
150
+ U16* segmentFreqs) {
151
+ /* Constants */
152
+ const U32 k = parameters.k;
153
+ const U32 d = parameters.d;
154
+ const U32 f = ctx->f;
155
+ const U32 dmersInK = k - d + 1;
156
+
157
+ /* Try each segment (activeSegment) and save the best (bestSegment) */
158
+ COVER_segment_t bestSegment = {0, 0, 0};
159
+ COVER_segment_t activeSegment;
160
+
161
+ /* Reset the activeDmers in the segment */
162
+ /* The activeSegment starts at the beginning of the epoch. */
163
+ activeSegment.begin = begin;
164
+ activeSegment.end = begin;
165
+ activeSegment.score = 0;
166
+
167
+ /* Slide the activeSegment through the whole epoch.
168
+ * Save the best segment in bestSegment.
169
+ */
170
+ while (activeSegment.end < end) {
171
+ /* Get hash value of current dmer */
172
+ const size_t idx = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
173
+
174
+ /* Add frequency of this index to score if this is the first occurrence of index in active segment */
175
+ if (segmentFreqs[idx] == 0) {
176
+ activeSegment.score += freqs[idx];
177
+ }
178
+ /* Increment end of segment and segmentFreqs*/
179
+ activeSegment.end += 1;
180
+ segmentFreqs[idx] += 1;
181
+ /* If the window is now too large, drop the first position */
182
+ if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
183
+ /* Get hash value of the dmer to be eliminated from active segment */
184
+ const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
185
+ segmentFreqs[delIndex] -= 1;
186
+ /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */
187
+ if (segmentFreqs[delIndex] == 0) {
188
+ activeSegment.score -= freqs[delIndex];
189
+ }
190
+ /* Increment start of segment */
191
+ activeSegment.begin += 1;
192
+ }
193
+
194
+ /* If this segment is the best so far save it */
195
+ if (activeSegment.score > bestSegment.score) {
196
+ bestSegment = activeSegment;
197
+ }
198
+ }
199
+
200
+ /* Zero out rest of segmentFreqs array */
201
+ while (activeSegment.begin < end) {
202
+ const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
203
+ segmentFreqs[delIndex] -= 1;
204
+ activeSegment.begin += 1;
205
+ }
206
+
207
+ {
208
+ /* Zero the frequency of hash value of each dmer covered by the chosen segment. */
209
+ U32 pos;
210
+ for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
211
+ const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, f, d);
212
+ freqs[i] = 0;
213
+ }
214
+ }
215
+
216
+ return bestSegment;
217
+ }
218
+
219
+
220
+ static int FASTCOVER_checkParameters(ZDICT_cover_params_t parameters,
221
+ size_t maxDictSize, unsigned f,
222
+ unsigned accel) {
223
+ /* k, d, and f are required parameters */
224
+ if (parameters.d == 0 || parameters.k == 0) {
225
+ return 0;
226
+ }
227
+ /* d has to be 6 or 8 */
228
+ if (parameters.d != 6 && parameters.d != 8) {
229
+ return 0;
230
+ }
231
+ /* k <= maxDictSize */
232
+ if (parameters.k > maxDictSize) {
233
+ return 0;
234
+ }
235
+ /* d <= k */
236
+ if (parameters.d > parameters.k) {
237
+ return 0;
238
+ }
239
+ /* 0 < f <= FASTCOVER_MAX_F*/
240
+ if (f > FASTCOVER_MAX_F || f == 0) {
241
+ return 0;
242
+ }
243
+ /* 0 < splitPoint <= 1 */
244
+ if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) {
245
+ return 0;
246
+ }
247
+ /* 0 < accel <= 10 */
248
+ if (accel > 10 || accel == 0) {
249
+ return 0;
250
+ }
251
+ return 1;
252
+ }
253
+
254
+
255
+ /**
256
+ * Clean up a context initialized with `FASTCOVER_ctx_init()`.
257
+ */
258
+ static void
259
+ FASTCOVER_ctx_destroy(FASTCOVER_ctx_t* ctx)
260
+ {
261
+ if (!ctx) return;
262
+
263
+ free(ctx->freqs);
264
+ ctx->freqs = NULL;
265
+
266
+ free(ctx->offsets);
267
+ ctx->offsets = NULL;
268
+ }
269
+
270
+
271
+ /**
272
+ * Calculate for frequency of hash value of each dmer in ctx->samples
273
+ */
274
+ static void
275
+ FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx)
276
+ {
277
+ const unsigned f = ctx->f;
278
+ const unsigned d = ctx->d;
279
+ const unsigned skip = ctx->accelParams.skip;
280
+ const unsigned readLength = MAX(d, 8);
281
+ size_t i;
282
+ assert(ctx->nbTrainSamples >= 5);
283
+ assert(ctx->nbTrainSamples <= ctx->nbSamples);
284
+ for (i = 0; i < ctx->nbTrainSamples; i++) {
285
+ size_t start = ctx->offsets[i]; /* start of current dmer */
286
+ size_t const currSampleEnd = ctx->offsets[i+1];
287
+ while (start + readLength <= currSampleEnd) {
288
+ const size_t dmerIndex = FASTCOVER_hashPtrToIndex(ctx->samples + start, f, d);
289
+ freqs[dmerIndex]++;
290
+ start = start + skip + 1;
291
+ }
292
+ }
293
+ }
294
+
295
+
296
+ /**
297
+ * Prepare a context for dictionary building.
298
+ * The context is only dependent on the parameter `d` and can used multiple
299
+ * times.
300
+ * Returns 0 on success or error code on error.
301
+ * The context must be destroyed with `FASTCOVER_ctx_destroy()`.
302
+ */
303
+ static size_t
304
+ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
305
+ const void* samplesBuffer,
306
+ const size_t* samplesSizes, unsigned nbSamples,
307
+ unsigned d, double splitPoint, unsigned f,
308
+ FASTCOVER_accel_t accelParams)
309
+ {
310
+ const BYTE* const samples = (const BYTE*)samplesBuffer;
311
+ const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
312
+ /* Split samples into testing and training sets */
313
+ const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
314
+ const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
315
+ const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
316
+ const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
317
+
318
+ /* Checks */
319
+ if (totalSamplesSize < MAX(d, sizeof(U64)) ||
320
+ totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
321
+ DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
322
+ (unsigned)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
323
+ return ERROR(srcSize_wrong);
324
+ }
325
+
326
+ /* Check if there are at least 5 training samples */
327
+ if (nbTrainSamples < 5) {
328
+ DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
329
+ return ERROR(srcSize_wrong);
330
+ }
331
+
332
+ /* Check if there's testing sample */
333
+ if (nbTestSamples < 1) {
334
+ DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
335
+ return ERROR(srcSize_wrong);
336
+ }
337
+
338
+ /* Zero the context */
339
+ memset(ctx, 0, sizeof(*ctx));
340
+ DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
341
+ (unsigned)trainingSamplesSize);
342
+ DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
343
+ (unsigned)testSamplesSize);
344
+
345
+ ctx->samples = samples;
346
+ ctx->samplesSizes = samplesSizes;
347
+ ctx->nbSamples = nbSamples;
348
+ ctx->nbTrainSamples = nbTrainSamples;
349
+ ctx->nbTestSamples = nbTestSamples;
350
+ ctx->nbDmers = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
351
+ ctx->d = d;
352
+ ctx->f = f;
353
+ ctx->accelParams = accelParams;
354
+
355
+ /* The offsets of each file */
356
+ ctx->offsets = (size_t*)calloc((nbSamples + 1), sizeof(size_t));
357
+ if (ctx->offsets == NULL) {
358
+ DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n");
359
+ FASTCOVER_ctx_destroy(ctx);
360
+ return ERROR(memory_allocation);
361
+ }
362
+
363
+ /* Fill offsets from the samplesSizes */
364
+ { U32 i;
365
+ ctx->offsets[0] = 0;
366
+ assert(nbSamples >= 5);
367
+ for (i = 1; i <= nbSamples; ++i) {
368
+ ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
369
+ }
370
+ }
371
+
372
+ /* Initialize frequency array of size 2^f */
373
+ ctx->freqs = (U32*)calloc(((U64)1 << f), sizeof(U32));
374
+ if (ctx->freqs == NULL) {
375
+ DISPLAYLEVEL(1, "Failed to allocate frequency table \n");
376
+ FASTCOVER_ctx_destroy(ctx);
377
+ return ERROR(memory_allocation);
378
+ }
379
+
380
+ DISPLAYLEVEL(2, "Computing frequencies\n");
381
+ FASTCOVER_computeFrequency(ctx->freqs, ctx);
382
+
383
+ return 0;
384
+ }
385
+
386
+
387
+ /**
388
+ * Given the prepared context build the dictionary.
389
+ */
390
+ static size_t
391
+ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
392
+ U32* freqs,
393
+ void* dictBuffer, size_t dictBufferCapacity,
394
+ ZDICT_cover_params_t parameters,
395
+ U16* segmentFreqs)
396
+ {
397
+ BYTE *const dict = (BYTE *)dictBuffer;
398
+ size_t tail = dictBufferCapacity;
399
+ /* Divide the data into epochs. We will select one segment from each epoch. */
400
+ const COVER_epoch_info_t epochs = COVER_computeEpochs(
401
+ (U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1);
402
+ const size_t maxZeroScoreRun = 10;
403
+ size_t zeroScoreRun = 0;
404
+ size_t epoch;
405
+ DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
406
+ (U32)epochs.num, (U32)epochs.size);
407
+ /* Loop through the epochs until there are no more segments or the dictionary
408
+ * is full.
409
+ */
410
+ for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
411
+ const U32 epochBegin = (U32)(epoch * epochs.size);
412
+ const U32 epochEnd = epochBegin + epochs.size;
413
+ size_t segmentSize;
414
+ /* Select a segment */
415
+ COVER_segment_t segment = FASTCOVER_selectSegment(
416
+ ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
417
+
418
+ /* If the segment covers no dmers, then we are out of content.
419
+ * There may be new content in other epochs, for continue for some time.
420
+ */
421
+ if (segment.score == 0) {
422
+ if (++zeroScoreRun >= maxZeroScoreRun) {
423
+ break;
424
+ }
425
+ continue;
426
+ }
427
+ zeroScoreRun = 0;
428
+
429
+ /* Trim the segment if necessary and if it is too small then we are done */
430
+ segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
431
+ if (segmentSize < parameters.d) {
432
+ break;
433
+ }
434
+
435
+ /* We fill the dictionary from the back to allow the best segments to be
436
+ * referenced with the smallest offsets.
437
+ */
438
+ tail -= segmentSize;
439
+ memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
440
+ DISPLAYUPDATE(
441
+ 2, "\r%u%% ",
442
+ (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
443
+ }
444
+ DISPLAYLEVEL(2, "\r%79s\r", "");
445
+ return tail;
446
+ }
447
+
448
+ /**
449
+ * Parameters for FASTCOVER_tryParameters().
450
+ */
451
+ typedef struct FASTCOVER_tryParameters_data_s {
452
+ const FASTCOVER_ctx_t* ctx;
453
+ COVER_best_t* best;
454
+ size_t dictBufferCapacity;
455
+ ZDICT_cover_params_t parameters;
456
+ } FASTCOVER_tryParameters_data_t;
457
+
458
+
459
+ /**
460
+ * Tries a set of parameters and updates the COVER_best_t with the results.
461
+ * This function is thread safe if zstd is compiled with multithreaded support.
462
+ * It takes its parameters as an *OWNING* opaque pointer to support threading.
463
+ */
464
+ static void FASTCOVER_tryParameters(void *opaque)
465
+ {
466
+ /* Save parameters as local variables */
467
+ FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t *)opaque;
468
+ const FASTCOVER_ctx_t *const ctx = data->ctx;
469
+ const ZDICT_cover_params_t parameters = data->parameters;
470
+ size_t dictBufferCapacity = data->dictBufferCapacity;
471
+ size_t totalCompressedSize = ERROR(GENERIC);
472
+ /* Initialize array to keep track of frequency of dmer within activeSegment */
473
+ U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
474
+ /* Allocate space for hash table, dict, and freqs */
475
+ BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
476
+ COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
477
+ U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
478
+ if (!segmentFreqs || !dict || !freqs) {
479
+ DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
480
+ goto _cleanup;
481
+ }
482
+ /* Copy the frequencies because we need to modify them */
483
+ memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
484
+ /* Build the dictionary */
485
+ { const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
486
+ parameters, segmentFreqs);
487
+
488
+ const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
489
+ selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
490
+ ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
491
+ totalCompressedSize);
492
+
493
+ if (COVER_dictSelectionIsError(selection)) {
494
+ DISPLAYLEVEL(1, "Failed to select dictionary\n");
495
+ goto _cleanup;
496
+ }
497
+ }
498
+ _cleanup:
499
+ free(dict);
500
+ COVER_best_finish(data->best, parameters, selection);
501
+ free(data);
502
+ free(segmentFreqs);
503
+ COVER_dictSelectionFree(selection);
504
+ free(freqs);
505
+ }
506
+
507
+
508
+ static void
509
+ FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
510
+ ZDICT_cover_params_t* coverParams)
511
+ {
512
+ coverParams->k = fastCoverParams.k;
513
+ coverParams->d = fastCoverParams.d;
514
+ coverParams->steps = fastCoverParams.steps;
515
+ coverParams->nbThreads = fastCoverParams.nbThreads;
516
+ coverParams->splitPoint = fastCoverParams.splitPoint;
517
+ coverParams->zParams = fastCoverParams.zParams;
518
+ coverParams->shrinkDict = fastCoverParams.shrinkDict;
519
+ }
520
+
521
+
522
+ static void
523
+ FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
524
+ ZDICT_fastCover_params_t* fastCoverParams,
525
+ unsigned f, unsigned accel)
526
+ {
527
+ fastCoverParams->k = coverParams.k;
528
+ fastCoverParams->d = coverParams.d;
529
+ fastCoverParams->steps = coverParams.steps;
530
+ fastCoverParams->nbThreads = coverParams.nbThreads;
531
+ fastCoverParams->splitPoint = coverParams.splitPoint;
532
+ fastCoverParams->f = f;
533
+ fastCoverParams->accel = accel;
534
+ fastCoverParams->zParams = coverParams.zParams;
535
+ fastCoverParams->shrinkDict = coverParams.shrinkDict;
536
+ }
537
+
538
+
539
+ ZDICTLIB_API size_t
540
+ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
541
+ const void* samplesBuffer,
542
+ const size_t* samplesSizes, unsigned nbSamples,
543
+ ZDICT_fastCover_params_t parameters)
544
+ {
545
+ BYTE* const dict = (BYTE*)dictBuffer;
546
+ FASTCOVER_ctx_t ctx;
547
+ ZDICT_cover_params_t coverParams;
548
+ FASTCOVER_accel_t accelParams;
549
+ /* Initialize global data */
550
+ g_displayLevel = parameters.zParams.notificationLevel;
551
+ /* Assign splitPoint and f if not provided */
552
+ parameters.splitPoint = 1.0;
553
+ parameters.f = parameters.f == 0 ? DEFAULT_F : parameters.f;
554
+ parameters.accel = parameters.accel == 0 ? DEFAULT_ACCEL : parameters.accel;
555
+ /* Convert to cover parameter */
556
+ memset(&coverParams, 0 , sizeof(coverParams));
557
+ FASTCOVER_convertToCoverParams(parameters, &coverParams);
558
+ /* Checks */
559
+ if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f,
560
+ parameters.accel)) {
561
+ DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
562
+ return ERROR(parameter_outOfBound);
563
+ }
564
+ if (nbSamples == 0) {
565
+ DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
566
+ return ERROR(srcSize_wrong);
567
+ }
568
+ if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
569
+ DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
570
+ ZDICT_DICTSIZE_MIN);
571
+ return ERROR(dstSize_tooSmall);
572
+ }
573
+ /* Assign corresponding FASTCOVER_accel_t to accelParams*/
574
+ accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];
575
+ /* Initialize context */
576
+ {
577
+ size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
578
+ coverParams.d, parameters.splitPoint, parameters.f,
579
+ accelParams);
580
+ if (ZSTD_isError(initVal)) {
581
+ DISPLAYLEVEL(1, "Failed to initialize context\n");
582
+ return initVal;
583
+ }
584
+ }
585
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, g_displayLevel);
586
+ /* Build the dictionary */
587
+ DISPLAYLEVEL(2, "Building dictionary\n");
588
+ {
589
+ /* Initialize array to keep track of frequency of dmer within activeSegment */
590
+ U16* segmentFreqs = (U16 *)calloc(((U64)1 << parameters.f), sizeof(U16));
591
+ const size_t tail = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer,
592
+ dictBufferCapacity, coverParams, segmentFreqs);
593
+ const unsigned nbFinalizeSamples = (unsigned)(ctx.nbTrainSamples * ctx.accelParams.finalize / 100);
594
+ const size_t dictionarySize = ZDICT_finalizeDictionary(
595
+ dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
596
+ samplesBuffer, samplesSizes, nbFinalizeSamples, coverParams.zParams);
597
+ if (!ZSTD_isError(dictionarySize)) {
598
+ DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
599
+ (unsigned)dictionarySize);
600
+ }
601
+ FASTCOVER_ctx_destroy(&ctx);
602
+ free(segmentFreqs);
603
+ return dictionarySize;
604
+ }
605
+ }
606
+
607
+
608
+ ZDICTLIB_API size_t
609
+ ZDICT_optimizeTrainFromBuffer_fastCover(
610
+ void* dictBuffer, size_t dictBufferCapacity,
611
+ const void* samplesBuffer,
612
+ const size_t* samplesSizes, unsigned nbSamples,
613
+ ZDICT_fastCover_params_t* parameters)
614
+ {
615
+ ZDICT_cover_params_t coverParams;
616
+ FASTCOVER_accel_t accelParams;
617
+ /* constants */
618
+ const unsigned nbThreads = parameters->nbThreads;
619
+ const double splitPoint =
620
+ parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
621
+ const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
622
+ const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
623
+ const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
624
+ const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
625
+ const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
626
+ const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
627
+ const unsigned kIterations =
628
+ (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
629
+ const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
630
+ const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
631
+ const unsigned shrinkDict = 0;
632
+ /* Local variables */
633
+ const int displayLevel = parameters->zParams.notificationLevel;
634
+ unsigned iteration = 1;
635
+ unsigned d;
636
+ unsigned k;
637
+ COVER_best_t best;
638
+ POOL_ctx *pool = NULL;
639
+ int warned = 0;
640
+ /* Checks */
641
+ if (splitPoint <= 0 || splitPoint > 1) {
642
+ LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
643
+ return ERROR(parameter_outOfBound);
644
+ }
645
+ if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
646
+ LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
647
+ return ERROR(parameter_outOfBound);
648
+ }
649
+ if (kMinK < kMaxD || kMaxK < kMinK) {
650
+ LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
651
+ return ERROR(parameter_outOfBound);
652
+ }
653
+ if (nbSamples == 0) {
654
+ LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
655
+ return ERROR(srcSize_wrong);
656
+ }
657
+ if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
658
+ LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
659
+ ZDICT_DICTSIZE_MIN);
660
+ return ERROR(dstSize_tooSmall);
661
+ }
662
+ if (nbThreads > 1) {
663
+ pool = POOL_create(nbThreads, 1);
664
+ if (!pool) {
665
+ return ERROR(memory_allocation);
666
+ }
667
+ }
668
+ /* Initialization */
669
+ COVER_best_init(&best);
670
+ memset(&coverParams, 0 , sizeof(coverParams));
671
+ FASTCOVER_convertToCoverParams(*parameters, &coverParams);
672
+ accelParams = FASTCOVER_defaultAccelParameters[accel];
673
+ /* Turn down global display level to clean up display at level 2 and below */
674
+ g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
675
+ /* Loop through d first because each new value needs a new context */
676
+ LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
677
+ kIterations);
678
+ for (d = kMinD; d <= kMaxD; d += 2) {
679
+ /* Initialize the context for this value of d */
680
+ FASTCOVER_ctx_t ctx;
681
+ LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
682
+ {
683
+ size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams);
684
+ if (ZSTD_isError(initVal)) {
685
+ LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
686
+ COVER_best_destroy(&best);
687
+ POOL_free(pool);
688
+ return initVal;
689
+ }
690
+ }
691
+ if (!warned) {
692
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel);
693
+ warned = 1;
694
+ }
695
+ /* Loop through k reusing the same context */
696
+ for (k = kMinK; k <= kMaxK; k += kStepSize) {
697
+ /* Prepare the arguments */
698
+ FASTCOVER_tryParameters_data_t *data = (FASTCOVER_tryParameters_data_t *)malloc(
699
+ sizeof(FASTCOVER_tryParameters_data_t));
700
+ LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
701
+ if (!data) {
702
+ LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
703
+ COVER_best_destroy(&best);
704
+ FASTCOVER_ctx_destroy(&ctx);
705
+ POOL_free(pool);
706
+ return ERROR(memory_allocation);
707
+ }
708
+ data->ctx = &ctx;
709
+ data->best = &best;
710
+ data->dictBufferCapacity = dictBufferCapacity;
711
+ data->parameters = coverParams;
712
+ data->parameters.k = k;
713
+ data->parameters.d = d;
714
+ data->parameters.splitPoint = splitPoint;
715
+ data->parameters.steps = kSteps;
716
+ data->parameters.shrinkDict = shrinkDict;
717
+ data->parameters.zParams.notificationLevel = g_displayLevel;
718
+ /* Check the parameters */
719
+ if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
720
+ data->ctx->f, accel)) {
721
+ DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
722
+ free(data);
723
+ continue;
724
+ }
725
+ /* Call the function and pass ownership of data to it */
726
+ COVER_best_start(&best);
727
+ if (pool) {
728
+ POOL_add(pool, &FASTCOVER_tryParameters, data);
729
+ } else {
730
+ FASTCOVER_tryParameters(data);
731
+ }
732
+ /* Print status */
733
+ LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",
734
+ (unsigned)((iteration * 100) / kIterations));
735
+ ++iteration;
736
+ }
737
+ COVER_best_wait(&best);
738
+ FASTCOVER_ctx_destroy(&ctx);
739
+ }
740
+ LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
741
+ /* Fill the output buffer and parameters with output of the best parameters */
742
+ {
743
+ const size_t dictSize = best.dictSize;
744
+ if (ZSTD_isError(best.compressedSize)) {
745
+ const size_t compressedSize = best.compressedSize;
746
+ COVER_best_destroy(&best);
747
+ POOL_free(pool);
748
+ return compressedSize;
749
+ }
750
+ FASTCOVER_convertToFastCoverParams(best.parameters, parameters, f, accel);
751
+ memcpy(dictBuffer, best.dict, dictSize);
752
+ COVER_best_destroy(&best);
753
+ POOL_free(pool);
754
+ return dictSize;
755
+ }
756
+
757
+ }