zstd-ruby 1.4.0.0 → 1.4.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/zstdruby/libzstd/Makefile +5 -0
- data/ext/zstdruby/libzstd/common/compiler.h +7 -0
- data/ext/zstdruby/libzstd/common/zstd_internal.h +58 -6
- data/ext/zstdruby/libzstd/compress/zstd_compress.c +175 -117
- data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +74 -30
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +56 -36
- data/ext/zstdruby/libzstd/compress/zstd_fast.c +35 -14
- data/ext/zstdruby/libzstd/compress/zstd_lazy.c +10 -5
- data/ext/zstdruby/libzstd/compress/zstd_ldm.c +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_opt.c +45 -32
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +18 -7
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +1 -0
- data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +12 -9
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +20 -9
- data/ext/zstdruby/libzstd/dictBuilder/cover.c +154 -43
- data/ext/zstdruby/libzstd/dictBuilder/cover.h +38 -3
- data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +46 -39
- data/ext/zstdruby/libzstd/dictBuilder/zdict.c +9 -9
- data/ext/zstdruby/libzstd/dictBuilder/zdict.h +5 -0
- data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +4 -0
- data/ext/zstdruby/libzstd/legacy/zstd_v01.c +95 -101
- data/ext/zstdruby/libzstd/legacy/zstd_v02.c +11 -6
- data/ext/zstdruby/libzstd/legacy/zstd_v03.c +11 -6
- data/ext/zstdruby/libzstd/legacy/zstd_v04.c +11 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v05.c +88 -84
- data/ext/zstdruby/libzstd/legacy/zstd_v06.c +2 -4
- data/ext/zstdruby/libzstd/legacy/zstd_v07.c +2 -4
- data/ext/zstdruby/libzstd/zstd.h +53 -21
- data/lib/zstd-ruby/version.rb +1 -1
- metadata +3 -4
@@ -526,10 +526,10 @@ static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
|
|
526
526
|
* Prepare a context for dictionary building.
|
527
527
|
* The context is only dependent on the parameter `d` and can be used multiple
|
528
528
|
* times.
|
529
|
-
* Returns
|
529
|
+
* Returns 0 on success or error code on error.
|
530
530
|
* The context must be destroyed with `COVER_ctx_destroy()`.
|
531
531
|
*/
|
532
|
-
static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
532
|
+
static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
533
533
|
const size_t *samplesSizes, unsigned nbSamples,
|
534
534
|
unsigned d, double splitPoint) {
|
535
535
|
const BYTE *const samples = (const BYTE *)samplesBuffer;
|
@@ -544,17 +544,17 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
|
544
544
|
totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
|
545
545
|
DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
|
546
546
|
(unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
|
547
|
-
return
|
547
|
+
return ERROR(srcSize_wrong);
|
548
548
|
}
|
549
549
|
/* Check if there are at least 5 training samples */
|
550
550
|
if (nbTrainSamples < 5) {
|
551
551
|
DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
|
552
|
-
return
|
552
|
+
return ERROR(srcSize_wrong);
|
553
553
|
}
|
554
554
|
/* Check if there is at least one testing sample */
|
555
555
|
if (nbTestSamples < 1) {
|
556
556
|
DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
|
557
|
-
return
|
557
|
+
return ERROR(srcSize_wrong);
|
558
558
|
}
|
559
559
|
/* Zero the context */
|
560
560
|
memset(ctx, 0, sizeof(*ctx));
|
@@ -577,7 +577,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
|
577
577
|
if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
|
578
578
|
DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
|
579
579
|
COVER_ctx_destroy(ctx);
|
580
|
-
return
|
580
|
+
return ERROR(memory_allocation);
|
581
581
|
}
|
582
582
|
ctx->freqs = NULL;
|
583
583
|
ctx->d = d;
|
@@ -624,7 +624,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
|
624
624
|
(ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
|
625
625
|
ctx->freqs = ctx->suffix;
|
626
626
|
ctx->suffix = NULL;
|
627
|
-
return
|
627
|
+
return 0;
|
628
628
|
}
|
629
629
|
|
630
630
|
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
|
@@ -729,11 +729,11 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
|
|
729
729
|
/* Checks */
|
730
730
|
if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
|
731
731
|
DISPLAYLEVEL(1, "Cover parameters incorrect\n");
|
732
|
-
return ERROR(
|
732
|
+
return ERROR(parameter_outOfBound);
|
733
733
|
}
|
734
734
|
if (nbSamples == 0) {
|
735
735
|
DISPLAYLEVEL(1, "Cover must have at least one input file\n");
|
736
|
-
return ERROR(
|
736
|
+
return ERROR(srcSize_wrong);
|
737
737
|
}
|
738
738
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
739
739
|
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
|
@@ -741,15 +741,18 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
|
|
741
741
|
return ERROR(dstSize_tooSmall);
|
742
742
|
}
|
743
743
|
/* Initialize context and activeDmers */
|
744
|
-
|
745
|
-
|
746
|
-
|
744
|
+
{
|
745
|
+
size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
|
746
|
+
parameters.d, parameters.splitPoint);
|
747
|
+
if (ZSTD_isError(initVal)) {
|
748
|
+
return initVal;
|
749
|
+
}
|
747
750
|
}
|
748
751
|
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
|
749
752
|
if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
|
750
753
|
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
|
751
754
|
COVER_ctx_destroy(&ctx);
|
752
|
-
return ERROR(
|
755
|
+
return ERROR(memory_allocation);
|
753
756
|
}
|
754
757
|
|
755
758
|
DISPLAYLEVEL(2, "Building dictionary\n");
|
@@ -810,7 +813,7 @@ size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
|
|
810
813
|
cctx, dst, dstCapacity, samples + offsets[i],
|
811
814
|
samplesSizes[i], cdict);
|
812
815
|
if (ZSTD_isError(size)) {
|
813
|
-
totalCompressedSize =
|
816
|
+
totalCompressedSize = size;
|
814
817
|
goto _compressCleanup;
|
815
818
|
}
|
816
819
|
totalCompressedSize += size;
|
@@ -886,9 +889,11 @@ void COVER_best_start(COVER_best_t *best) {
|
|
886
889
|
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
|
887
890
|
* If this dictionary is the best so far save it and its parameters.
|
888
891
|
*/
|
889
|
-
void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
|
890
|
-
|
891
|
-
|
892
|
+
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
|
893
|
+
COVER_dictSelection_t selection) {
|
894
|
+
void* dict = selection.dictContent;
|
895
|
+
size_t compressedSize = selection.totalCompressedSize;
|
896
|
+
size_t dictSize = selection.dictSize;
|
892
897
|
if (!best) {
|
893
898
|
return;
|
894
899
|
}
|
@@ -914,6 +919,9 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
|
|
914
919
|
}
|
915
920
|
}
|
916
921
|
/* Save the dictionary, parameters, and size */
|
922
|
+
if (!dict) {
|
923
|
+
return;
|
924
|
+
}
|
917
925
|
memcpy(best->dict, dict, dictSize);
|
918
926
|
best->dictSize = dictSize;
|
919
927
|
best->parameters = parameters;
|
@@ -926,6 +934,111 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
|
|
926
934
|
}
|
927
935
|
}
|
928
936
|
|
937
|
+
COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
|
938
|
+
COVER_dictSelection_t selection = { NULL, 0, error };
|
939
|
+
return selection;
|
940
|
+
}
|
941
|
+
|
942
|
+
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
|
943
|
+
return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent);
|
944
|
+
}
|
945
|
+
|
946
|
+
void COVER_dictSelectionFree(COVER_dictSelection_t selection){
|
947
|
+
free(selection.dictContent);
|
948
|
+
}
|
949
|
+
|
950
|
+
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
|
951
|
+
size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
|
952
|
+
size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {
|
953
|
+
|
954
|
+
size_t largestDict = 0;
|
955
|
+
size_t largestCompressed = 0;
|
956
|
+
BYTE* customDictContentEnd = customDictContent + dictContentSize;
|
957
|
+
|
958
|
+
BYTE * largestDictbuffer = (BYTE *)malloc(dictContentSize);
|
959
|
+
BYTE * candidateDictBuffer = (BYTE *)malloc(dictContentSize);
|
960
|
+
double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;
|
961
|
+
|
962
|
+
if (!largestDictbuffer || !candidateDictBuffer) {
|
963
|
+
free(largestDictbuffer);
|
964
|
+
free(candidateDictBuffer);
|
965
|
+
return COVER_dictSelectionError(dictContentSize);
|
966
|
+
}
|
967
|
+
|
968
|
+
/* Initial dictionary size and compressed size */
|
969
|
+
memcpy(largestDictbuffer, customDictContent, dictContentSize);
|
970
|
+
dictContentSize = ZDICT_finalizeDictionary(
|
971
|
+
largestDictbuffer, dictContentSize, customDictContent, dictContentSize,
|
972
|
+
samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
|
973
|
+
|
974
|
+
if (ZDICT_isError(dictContentSize)) {
|
975
|
+
free(largestDictbuffer);
|
976
|
+
free(candidateDictBuffer);
|
977
|
+
return COVER_dictSelectionError(dictContentSize);
|
978
|
+
}
|
979
|
+
|
980
|
+
totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
|
981
|
+
samplesBuffer, offsets,
|
982
|
+
nbCheckSamples, nbSamples,
|
983
|
+
largestDictbuffer, dictContentSize);
|
984
|
+
|
985
|
+
if (ZSTD_isError(totalCompressedSize)) {
|
986
|
+
free(largestDictbuffer);
|
987
|
+
free(candidateDictBuffer);
|
988
|
+
return COVER_dictSelectionError(totalCompressedSize);
|
989
|
+
}
|
990
|
+
|
991
|
+
if (params.shrinkDict == 0) {
|
992
|
+
COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
|
993
|
+
free(candidateDictBuffer);
|
994
|
+
return selection;
|
995
|
+
}
|
996
|
+
|
997
|
+
largestDict = dictContentSize;
|
998
|
+
largestCompressed = totalCompressedSize;
|
999
|
+
dictContentSize = ZDICT_DICTSIZE_MIN;
|
1000
|
+
|
1001
|
+
/* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
|
1002
|
+
while (dictContentSize < largestDict) {
|
1003
|
+
memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
|
1004
|
+
dictContentSize = ZDICT_finalizeDictionary(
|
1005
|
+
candidateDictBuffer, dictContentSize, customDictContentEnd - dictContentSize, dictContentSize,
|
1006
|
+
samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
|
1007
|
+
|
1008
|
+
if (ZDICT_isError(dictContentSize)) {
|
1009
|
+
free(largestDictbuffer);
|
1010
|
+
free(candidateDictBuffer);
|
1011
|
+
return COVER_dictSelectionError(dictContentSize);
|
1012
|
+
|
1013
|
+
}
|
1014
|
+
|
1015
|
+
totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
|
1016
|
+
samplesBuffer, offsets,
|
1017
|
+
nbCheckSamples, nbSamples,
|
1018
|
+
candidateDictBuffer, dictContentSize);
|
1019
|
+
|
1020
|
+
if (ZSTD_isError(totalCompressedSize)) {
|
1021
|
+
free(largestDictbuffer);
|
1022
|
+
free(candidateDictBuffer);
|
1023
|
+
return COVER_dictSelectionError(totalCompressedSize);
|
1024
|
+
}
|
1025
|
+
|
1026
|
+
if (totalCompressedSize <= largestCompressed * regressionTolerance) {
|
1027
|
+
COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
|
1028
|
+
free(largestDictbuffer);
|
1029
|
+
return selection;
|
1030
|
+
}
|
1031
|
+
dictContentSize *= 2;
|
1032
|
+
}
|
1033
|
+
dictContentSize = largestDict;
|
1034
|
+
totalCompressedSize = largestCompressed;
|
1035
|
+
{
|
1036
|
+
COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
|
1037
|
+
free(candidateDictBuffer);
|
1038
|
+
return selection;
|
1039
|
+
}
|
1040
|
+
}
|
1041
|
+
|
929
1042
|
/**
|
930
1043
|
* Parameters for COVER_tryParameters().
|
931
1044
|
*/
|
@@ -951,6 +1064,7 @@ static void COVER_tryParameters(void *opaque) {
|
|
951
1064
|
/* Allocate space for hash table, dict, and freqs */
|
952
1065
|
COVER_map_t activeDmers;
|
953
1066
|
BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
|
1067
|
+
COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
|
954
1068
|
U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
|
955
1069
|
if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
|
956
1070
|
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
|
@@ -966,29 +1080,21 @@ static void COVER_tryParameters(void *opaque) {
|
|
966
1080
|
{
|
967
1081
|
const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
|
968
1082
|
dictBufferCapacity, parameters);
|
969
|
-
|
970
|
-
|
971
|
-
|
972
|
-
|
973
|
-
if (
|
974
|
-
DISPLAYLEVEL(1, "Failed to
|
1083
|
+
selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
|
1084
|
+
ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
|
1085
|
+
totalCompressedSize);
|
1086
|
+
|
1087
|
+
if (COVER_dictSelectionIsError(selection)) {
|
1088
|
+
DISPLAYLEVEL(1, "Failed to select dictionary\n");
|
975
1089
|
goto _cleanup;
|
976
1090
|
}
|
977
1091
|
}
|
978
|
-
/* Check total compressed size */
|
979
|
-
totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
|
980
|
-
ctx->samples, ctx->offsets,
|
981
|
-
ctx->nbTrainSamples, ctx->nbSamples,
|
982
|
-
dict, dictBufferCapacity);
|
983
|
-
|
984
1092
|
_cleanup:
|
985
|
-
|
986
|
-
|
1093
|
+
free(dict);
|
1094
|
+
COVER_best_finish(data->best, parameters, selection);
|
987
1095
|
free(data);
|
988
1096
|
COVER_map_destroy(&activeDmers);
|
989
|
-
|
990
|
-
free(dict);
|
991
|
-
}
|
1097
|
+
COVER_dictSelectionFree(selection);
|
992
1098
|
if (freqs) {
|
993
1099
|
free(freqs);
|
994
1100
|
}
|
@@ -1010,6 +1116,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
1010
1116
|
const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
|
1011
1117
|
const unsigned kIterations =
|
1012
1118
|
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
|
1119
|
+
const unsigned shrinkDict = 0;
|
1013
1120
|
/* Local variables */
|
1014
1121
|
const int displayLevel = parameters->zParams.notificationLevel;
|
1015
1122
|
unsigned iteration = 1;
|
@@ -1022,15 +1129,15 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
1022
1129
|
/* Checks */
|
1023
1130
|
if (splitPoint <= 0 || splitPoint > 1) {
|
1024
1131
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
|
1025
|
-
return ERROR(
|
1132
|
+
return ERROR(parameter_outOfBound);
|
1026
1133
|
}
|
1027
1134
|
if (kMinK < kMaxD || kMaxK < kMinK) {
|
1028
1135
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
|
1029
|
-
return ERROR(
|
1136
|
+
return ERROR(parameter_outOfBound);
|
1030
1137
|
}
|
1031
1138
|
if (nbSamples == 0) {
|
1032
1139
|
DISPLAYLEVEL(1, "Cover must have at least one input file\n");
|
1033
|
-
return ERROR(
|
1140
|
+
return ERROR(srcSize_wrong);
|
1034
1141
|
}
|
1035
1142
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
1036
1143
|
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
|
@@ -1054,11 +1161,14 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
1054
1161
|
/* Initialize the context for this value of d */
|
1055
1162
|
COVER_ctx_t ctx;
|
1056
1163
|
LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
|
1057
|
-
|
1058
|
-
|
1059
|
-
|
1060
|
-
|
1061
|
-
|
1164
|
+
{
|
1165
|
+
const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);
|
1166
|
+
if (ZSTD_isError(initVal)) {
|
1167
|
+
LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
|
1168
|
+
COVER_best_destroy(&best);
|
1169
|
+
POOL_free(pool);
|
1170
|
+
return initVal;
|
1171
|
+
}
|
1062
1172
|
}
|
1063
1173
|
if (!warned) {
|
1064
1174
|
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
|
@@ -1075,7 +1185,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
1075
1185
|
COVER_best_destroy(&best);
|
1076
1186
|
COVER_ctx_destroy(&ctx);
|
1077
1187
|
POOL_free(pool);
|
1078
|
-
return ERROR(
|
1188
|
+
return ERROR(memory_allocation);
|
1079
1189
|
}
|
1080
1190
|
data->ctx = &ctx;
|
1081
1191
|
data->best = &best;
|
@@ -1085,6 +1195,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
1085
1195
|
data->parameters.d = d;
|
1086
1196
|
data->parameters.splitPoint = splitPoint;
|
1087
1197
|
data->parameters.steps = kSteps;
|
1198
|
+
data->parameters.shrinkDict = shrinkDict;
|
1088
1199
|
data->parameters.zParams.notificationLevel = g_displayLevel;
|
1089
1200
|
/* Check the parameters */
|
1090
1201
|
if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
|
@@ -46,6 +46,15 @@ typedef struct {
|
|
46
46
|
U32 size;
|
47
47
|
} COVER_epoch_info_t;
|
48
48
|
|
49
|
+
/**
|
50
|
+
* Struct used for the dictionary selection function.
|
51
|
+
*/
|
52
|
+
typedef struct COVER_dictSelection {
|
53
|
+
BYTE* dictContent;
|
54
|
+
size_t dictSize;
|
55
|
+
size_t totalCompressedSize;
|
56
|
+
} COVER_dictSelection_t;
|
57
|
+
|
49
58
|
/**
|
50
59
|
* Computes the number of epochs and the size of each epoch.
|
51
60
|
* We will make sure that each epoch gets at least 10 * k bytes.
|
@@ -107,6 +116,32 @@ void COVER_best_start(COVER_best_t *best);
|
|
107
116
|
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
|
108
117
|
* If this dictionary is the best so far save it and its parameters.
|
109
118
|
*/
|
110
|
-
void COVER_best_finish(COVER_best_t *best,
|
111
|
-
|
112
|
-
|
119
|
+
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
|
120
|
+
COVER_dictSelection_t selection);
|
121
|
+
/**
|
122
|
+
* Error function for COVER_selectDict function. Checks if the return
|
123
|
+
* value is an error.
|
124
|
+
*/
|
125
|
+
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
|
126
|
+
|
127
|
+
/**
|
128
|
+
* Error function for COVER_selectDict function. Returns a struct where
|
129
|
+
* return.totalCompressedSize is a ZSTD error.
|
130
|
+
*/
|
131
|
+
COVER_dictSelection_t COVER_dictSelectionError(size_t error);
|
132
|
+
|
133
|
+
/**
|
134
|
+
* Always call after selectDict is called to free up used memory from
|
135
|
+
* newly created dictionary.
|
136
|
+
*/
|
137
|
+
void COVER_dictSelectionFree(COVER_dictSelection_t selection);
|
138
|
+
|
139
|
+
/**
|
140
|
+
* Called to finalize the dictionary and select one based on whether or not
|
141
|
+
* the shrink-dict flag was enabled. If enabled the dictionary used is the
|
142
|
+
* smallest dictionary within a specified regression of the compressed size
|
143
|
+
* from the largest dictionary.
|
144
|
+
*/
|
145
|
+
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
|
146
|
+
size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
|
147
|
+
size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
|
@@ -287,10 +287,10 @@ FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx)
|
|
287
287
|
* Prepare a context for dictionary building.
|
288
288
|
* The context is only dependent on the parameter `d` and can be used multiple
|
289
289
|
* times.
|
290
|
-
* Returns
|
290
|
+
* Returns 0 on success or error code on error.
|
291
291
|
* The context must be destroyed with `FASTCOVER_ctx_destroy()`.
|
292
292
|
*/
|
293
|
-
static
|
293
|
+
static size_t
|
294
294
|
FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
|
295
295
|
const void* samplesBuffer,
|
296
296
|
const size_t* samplesSizes, unsigned nbSamples,
|
@@ -310,19 +310,19 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
|
|
310
310
|
totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
|
311
311
|
DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
|
312
312
|
(unsigned)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
|
313
|
-
return
|
313
|
+
return ERROR(srcSize_wrong);
|
314
314
|
}
|
315
315
|
|
316
316
|
/* Check if there are at least 5 training samples */
|
317
317
|
if (nbTrainSamples < 5) {
|
318
318
|
DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
|
319
|
-
return
|
319
|
+
return ERROR(srcSize_wrong);
|
320
320
|
}
|
321
321
|
|
322
322
|
/* Check if there is at least one testing sample */
|
323
323
|
if (nbTestSamples < 1) {
|
324
324
|
DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
|
325
|
-
return
|
325
|
+
return ERROR(srcSize_wrong);
|
326
326
|
}
|
327
327
|
|
328
328
|
/* Zero the context */
|
@@ -347,7 +347,7 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
|
|
347
347
|
if (ctx->offsets == NULL) {
|
348
348
|
DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n");
|
349
349
|
FASTCOVER_ctx_destroy(ctx);
|
350
|
-
return
|
350
|
+
return ERROR(memory_allocation);
|
351
351
|
}
|
352
352
|
|
353
353
|
/* Fill offsets from the samplesSizes */
|
@@ -364,13 +364,13 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
|
|
364
364
|
if (ctx->freqs == NULL) {
|
365
365
|
DISPLAYLEVEL(1, "Failed to allocate frequency table \n");
|
366
366
|
FASTCOVER_ctx_destroy(ctx);
|
367
|
-
return
|
367
|
+
return ERROR(memory_allocation);
|
368
368
|
}
|
369
369
|
|
370
370
|
DISPLAYLEVEL(2, "Computing frequencies\n");
|
371
371
|
FASTCOVER_computeFrequency(ctx->freqs, ctx);
|
372
372
|
|
373
|
-
return
|
373
|
+
return 0;
|
374
374
|
}
|
375
375
|
|
376
376
|
|
@@ -435,7 +435,6 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
|
|
435
435
|
return tail;
|
436
436
|
}
|
437
437
|
|
438
|
-
|
439
438
|
/**
|
440
439
|
* Parameters for FASTCOVER_tryParameters().
|
441
440
|
*/
|
@@ -464,6 +463,7 @@ static void FASTCOVER_tryParameters(void *opaque)
|
|
464
463
|
U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
|
465
464
|
/* Allocate space for hash table, dict, and freqs */
|
466
465
|
BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
|
466
|
+
COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
|
467
467
|
U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
|
468
468
|
if (!segmentFreqs || !dict || !freqs) {
|
469
469
|
DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
|
@@ -473,27 +473,24 @@ static void FASTCOVER_tryParameters(void *opaque)
|
|
473
473
|
memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
|
474
474
|
/* Build the dictionary */
|
475
475
|
{ const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
|
476
|
-
|
476
|
+
parameters, segmentFreqs);
|
477
|
+
|
477
478
|
const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
479
|
+
selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
|
480
|
+
ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
|
481
|
+
totalCompressedSize);
|
482
|
+
|
483
|
+
if (COVER_dictSelectionIsError(selection)) {
|
484
|
+
DISPLAYLEVEL(1, "Failed to select dictionary\n");
|
483
485
|
goto _cleanup;
|
484
486
|
}
|
485
487
|
}
|
486
|
-
/* Check total compressed size */
|
487
|
-
totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
|
488
|
-
ctx->samples, ctx->offsets,
|
489
|
-
ctx->nbTrainSamples, ctx->nbSamples,
|
490
|
-
dict, dictBufferCapacity);
|
491
488
|
_cleanup:
|
492
|
-
|
493
|
-
|
489
|
+
free(dict);
|
490
|
+
COVER_best_finish(data->best, parameters, selection);
|
494
491
|
free(data);
|
495
492
|
free(segmentFreqs);
|
496
|
-
|
493
|
+
COVER_dictSelectionFree(selection);
|
497
494
|
free(freqs);
|
498
495
|
}
|
499
496
|
|
@@ -508,6 +505,7 @@ FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
|
|
508
505
|
coverParams->nbThreads = fastCoverParams.nbThreads;
|
509
506
|
coverParams->splitPoint = fastCoverParams.splitPoint;
|
510
507
|
coverParams->zParams = fastCoverParams.zParams;
|
508
|
+
coverParams->shrinkDict = fastCoverParams.shrinkDict;
|
511
509
|
}
|
512
510
|
|
513
511
|
|
@@ -524,6 +522,7 @@ FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
|
|
524
522
|
fastCoverParams->f = f;
|
525
523
|
fastCoverParams->accel = accel;
|
526
524
|
fastCoverParams->zParams = coverParams.zParams;
|
525
|
+
fastCoverParams->shrinkDict = coverParams.shrinkDict;
|
527
526
|
}
|
528
527
|
|
529
528
|
|
@@ -550,11 +549,11 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
|
|
550
549
|
if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f,
|
551
550
|
parameters.accel)) {
|
552
551
|
DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
|
553
|
-
return ERROR(
|
552
|
+
return ERROR(parameter_outOfBound);
|
554
553
|
}
|
555
554
|
if (nbSamples == 0) {
|
556
555
|
DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
|
557
|
-
return ERROR(
|
556
|
+
return ERROR(srcSize_wrong);
|
558
557
|
}
|
559
558
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
560
559
|
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
|
@@ -564,11 +563,14 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
|
|
564
563
|
/* Assign corresponding FASTCOVER_accel_t to accelParams*/
|
565
564
|
accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];
|
566
565
|
/* Initialize context */
|
567
|
-
|
566
|
+
{
|
567
|
+
size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
|
568
568
|
coverParams.d, parameters.splitPoint, parameters.f,
|
569
|
-
accelParams)
|
570
|
-
|
571
|
-
|
569
|
+
accelParams);
|
570
|
+
if (ZSTD_isError(initVal)) {
|
571
|
+
DISPLAYLEVEL(1, "Failed to initialize context\n");
|
572
|
+
return initVal;
|
573
|
+
}
|
572
574
|
}
|
573
575
|
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, g_displayLevel);
|
574
576
|
/* Build the dictionary */
|
@@ -616,6 +618,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
616
618
|
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
|
617
619
|
const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
|
618
620
|
const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
|
621
|
+
const unsigned shrinkDict = 0;
|
619
622
|
/* Local variables */
|
620
623
|
const int displayLevel = parameters->zParams.notificationLevel;
|
621
624
|
unsigned iteration = 1;
|
@@ -627,19 +630,19 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
627
630
|
/* Checks */
|
628
631
|
if (splitPoint <= 0 || splitPoint > 1) {
|
629
632
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
|
630
|
-
return ERROR(
|
633
|
+
return ERROR(parameter_outOfBound);
|
631
634
|
}
|
632
635
|
if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
|
633
636
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
|
634
|
-
return ERROR(
|
637
|
+
return ERROR(parameter_outOfBound);
|
635
638
|
}
|
636
639
|
if (kMinK < kMaxD || kMaxK < kMinK) {
|
637
640
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
|
638
|
-
return ERROR(
|
641
|
+
return ERROR(parameter_outOfBound);
|
639
642
|
}
|
640
643
|
if (nbSamples == 0) {
|
641
644
|
LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
|
642
|
-
return ERROR(
|
645
|
+
return ERROR(srcSize_wrong);
|
643
646
|
}
|
644
647
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
645
648
|
LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
|
@@ -666,11 +669,14 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
666
669
|
/* Initialize the context for this value of d */
|
667
670
|
FASTCOVER_ctx_t ctx;
|
668
671
|
LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
672
|
+
{
|
673
|
+
size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams);
|
674
|
+
if (ZSTD_isError(initVal)) {
|
675
|
+
LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
|
676
|
+
COVER_best_destroy(&best);
|
677
|
+
POOL_free(pool);
|
678
|
+
return initVal;
|
679
|
+
}
|
674
680
|
}
|
675
681
|
if (!warned) {
|
676
682
|
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel);
|
@@ -687,7 +693,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
687
693
|
COVER_best_destroy(&best);
|
688
694
|
FASTCOVER_ctx_destroy(&ctx);
|
689
695
|
POOL_free(pool);
|
690
|
-
return ERROR(
|
696
|
+
return ERROR(memory_allocation);
|
691
697
|
}
|
692
698
|
data->ctx = &ctx;
|
693
699
|
data->best = &best;
|
@@ -697,6 +703,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
697
703
|
data->parameters.d = d;
|
698
704
|
data->parameters.splitPoint = splitPoint;
|
699
705
|
data->parameters.steps = kSteps;
|
706
|
+
data->parameters.shrinkDict = shrinkDict;
|
700
707
|
data->parameters.zParams.notificationLevel = g_displayLevel;
|
701
708
|
/* Check the parameters */
|
702
709
|
if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
|