zstd-ruby 1.4.0.0 → 1.4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/zstdruby/libzstd/Makefile +5 -0
- data/ext/zstdruby/libzstd/common/compiler.h +7 -0
- data/ext/zstdruby/libzstd/common/zstd_internal.h +58 -6
- data/ext/zstdruby/libzstd/compress/zstd_compress.c +175 -117
- data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +74 -30
- data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +56 -36
- data/ext/zstdruby/libzstd/compress/zstd_fast.c +35 -14
- data/ext/zstdruby/libzstd/compress/zstd_lazy.c +10 -5
- data/ext/zstdruby/libzstd/compress/zstd_ldm.c +1 -1
- data/ext/zstdruby/libzstd/compress/zstd_opt.c +45 -32
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +18 -7
- data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +1 -0
- data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +12 -9
- data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +20 -9
- data/ext/zstdruby/libzstd/dictBuilder/cover.c +154 -43
- data/ext/zstdruby/libzstd/dictBuilder/cover.h +38 -3
- data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +46 -39
- data/ext/zstdruby/libzstd/dictBuilder/zdict.c +9 -9
- data/ext/zstdruby/libzstd/dictBuilder/zdict.h +5 -0
- data/ext/zstdruby/libzstd/legacy/zstd_legacy.h +4 -0
- data/ext/zstdruby/libzstd/legacy/zstd_v01.c +95 -101
- data/ext/zstdruby/libzstd/legacy/zstd_v02.c +11 -6
- data/ext/zstdruby/libzstd/legacy/zstd_v03.c +11 -6
- data/ext/zstdruby/libzstd/legacy/zstd_v04.c +11 -8
- data/ext/zstdruby/libzstd/legacy/zstd_v05.c +88 -84
- data/ext/zstdruby/libzstd/legacy/zstd_v06.c +2 -4
- data/ext/zstdruby/libzstd/legacy/zstd_v07.c +2 -4
- data/ext/zstdruby/libzstd/zstd.h +53 -21
- data/lib/zstd-ruby/version.rb +1 -1
- metadata +3 -4
@@ -526,10 +526,10 @@ static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
|
|
526
526
|
* Prepare a context for dictionary building.
|
527
527
|
* The context is only dependent on the parameter `d` and can be used multiple
|
528
528
|
* times.
|
529
|
-
* Returns
|
529
|
+
* Returns 0 on success or error code on error.
|
530
530
|
* The context must be destroyed with `COVER_ctx_destroy()`.
|
531
531
|
*/
|
532
|
-
static
|
532
|
+
static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
533
533
|
const size_t *samplesSizes, unsigned nbSamples,
|
534
534
|
unsigned d, double splitPoint) {
|
535
535
|
const BYTE *const samples = (const BYTE *)samplesBuffer;
|
@@ -544,17 +544,17 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
|
544
544
|
totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
|
545
545
|
DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
|
546
546
|
(unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
|
547
|
-
return
|
547
|
+
return ERROR(srcSize_wrong);
|
548
548
|
}
|
549
549
|
/* Check if there are at least 5 training samples */
|
550
550
|
if (nbTrainSamples < 5) {
|
551
551
|
DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
|
552
|
-
return
|
552
|
+
return ERROR(srcSize_wrong);
|
553
553
|
}
|
554
554
|
/* Check if there's testing sample */
|
555
555
|
if (nbTestSamples < 1) {
|
556
556
|
DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
|
557
|
-
return
|
557
|
+
return ERROR(srcSize_wrong);
|
558
558
|
}
|
559
559
|
/* Zero the context */
|
560
560
|
memset(ctx, 0, sizeof(*ctx));
|
@@ -577,7 +577,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
|
577
577
|
if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
|
578
578
|
DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
|
579
579
|
COVER_ctx_destroy(ctx);
|
580
|
-
return
|
580
|
+
return ERROR(memory_allocation);
|
581
581
|
}
|
582
582
|
ctx->freqs = NULL;
|
583
583
|
ctx->d = d;
|
@@ -624,7 +624,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
|
624
624
|
(ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
|
625
625
|
ctx->freqs = ctx->suffix;
|
626
626
|
ctx->suffix = NULL;
|
627
|
-
return
|
627
|
+
return 0;
|
628
628
|
}
|
629
629
|
|
630
630
|
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
|
@@ -729,11 +729,11 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
|
|
729
729
|
/* Checks */
|
730
730
|
if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
|
731
731
|
DISPLAYLEVEL(1, "Cover parameters incorrect\n");
|
732
|
-
return ERROR(
|
732
|
+
return ERROR(parameter_outOfBound);
|
733
733
|
}
|
734
734
|
if (nbSamples == 0) {
|
735
735
|
DISPLAYLEVEL(1, "Cover must have at least one input file\n");
|
736
|
-
return ERROR(
|
736
|
+
return ERROR(srcSize_wrong);
|
737
737
|
}
|
738
738
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
739
739
|
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
|
@@ -741,15 +741,18 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
|
|
741
741
|
return ERROR(dstSize_tooSmall);
|
742
742
|
}
|
743
743
|
/* Initialize context and activeDmers */
|
744
|
-
|
745
|
-
|
746
|
-
|
744
|
+
{
|
745
|
+
size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
|
746
|
+
parameters.d, parameters.splitPoint);
|
747
|
+
if (ZSTD_isError(initVal)) {
|
748
|
+
return initVal;
|
749
|
+
}
|
747
750
|
}
|
748
751
|
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
|
749
752
|
if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
|
750
753
|
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
|
751
754
|
COVER_ctx_destroy(&ctx);
|
752
|
-
return ERROR(
|
755
|
+
return ERROR(memory_allocation);
|
753
756
|
}
|
754
757
|
|
755
758
|
DISPLAYLEVEL(2, "Building dictionary\n");
|
@@ -810,7 +813,7 @@ size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
|
|
810
813
|
cctx, dst, dstCapacity, samples + offsets[i],
|
811
814
|
samplesSizes[i], cdict);
|
812
815
|
if (ZSTD_isError(size)) {
|
813
|
-
totalCompressedSize =
|
816
|
+
totalCompressedSize = size;
|
814
817
|
goto _compressCleanup;
|
815
818
|
}
|
816
819
|
totalCompressedSize += size;
|
@@ -886,9 +889,11 @@ void COVER_best_start(COVER_best_t *best) {
|
|
886
889
|
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
|
887
890
|
* If this dictionary is the best so far save it and its parameters.
|
888
891
|
*/
|
889
|
-
void COVER_best_finish(COVER_best_t *best,
|
890
|
-
|
891
|
-
|
892
|
+
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
|
893
|
+
COVER_dictSelection_t selection) {
|
894
|
+
void* dict = selection.dictContent;
|
895
|
+
size_t compressedSize = selection.totalCompressedSize;
|
896
|
+
size_t dictSize = selection.dictSize;
|
892
897
|
if (!best) {
|
893
898
|
return;
|
894
899
|
}
|
@@ -914,6 +919,9 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
|
|
914
919
|
}
|
915
920
|
}
|
916
921
|
/* Save the dictionary, parameters, and size */
|
922
|
+
if (!dict) {
|
923
|
+
return;
|
924
|
+
}
|
917
925
|
memcpy(best->dict, dict, dictSize);
|
918
926
|
best->dictSize = dictSize;
|
919
927
|
best->parameters = parameters;
|
@@ -926,6 +934,111 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
|
|
926
934
|
}
|
927
935
|
}
|
928
936
|
|
937
|
+
COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
|
938
|
+
COVER_dictSelection_t selection = { NULL, 0, error };
|
939
|
+
return selection;
|
940
|
+
}
|
941
|
+
|
942
|
+
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
|
943
|
+
return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent);
|
944
|
+
}
|
945
|
+
|
946
|
+
void COVER_dictSelectionFree(COVER_dictSelection_t selection){
|
947
|
+
free(selection.dictContent);
|
948
|
+
}
|
949
|
+
|
950
|
+
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
|
951
|
+
size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
|
952
|
+
size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {
|
953
|
+
|
954
|
+
size_t largestDict = 0;
|
955
|
+
size_t largestCompressed = 0;
|
956
|
+
BYTE* customDictContentEnd = customDictContent + dictContentSize;
|
957
|
+
|
958
|
+
BYTE * largestDictbuffer = (BYTE *)malloc(dictContentSize);
|
959
|
+
BYTE * candidateDictBuffer = (BYTE *)malloc(dictContentSize);
|
960
|
+
double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;
|
961
|
+
|
962
|
+
if (!largestDictbuffer || !candidateDictBuffer) {
|
963
|
+
free(largestDictbuffer);
|
964
|
+
free(candidateDictBuffer);
|
965
|
+
return COVER_dictSelectionError(dictContentSize);
|
966
|
+
}
|
967
|
+
|
968
|
+
/* Initial dictionary size and compressed size */
|
969
|
+
memcpy(largestDictbuffer, customDictContent, dictContentSize);
|
970
|
+
dictContentSize = ZDICT_finalizeDictionary(
|
971
|
+
largestDictbuffer, dictContentSize, customDictContent, dictContentSize,
|
972
|
+
samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
|
973
|
+
|
974
|
+
if (ZDICT_isError(dictContentSize)) {
|
975
|
+
free(largestDictbuffer);
|
976
|
+
free(candidateDictBuffer);
|
977
|
+
return COVER_dictSelectionError(dictContentSize);
|
978
|
+
}
|
979
|
+
|
980
|
+
totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
|
981
|
+
samplesBuffer, offsets,
|
982
|
+
nbCheckSamples, nbSamples,
|
983
|
+
largestDictbuffer, dictContentSize);
|
984
|
+
|
985
|
+
if (ZSTD_isError(totalCompressedSize)) {
|
986
|
+
free(largestDictbuffer);
|
987
|
+
free(candidateDictBuffer);
|
988
|
+
return COVER_dictSelectionError(totalCompressedSize);
|
989
|
+
}
|
990
|
+
|
991
|
+
if (params.shrinkDict == 0) {
|
992
|
+
COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
|
993
|
+
free(candidateDictBuffer);
|
994
|
+
return selection;
|
995
|
+
}
|
996
|
+
|
997
|
+
largestDict = dictContentSize;
|
998
|
+
largestCompressed = totalCompressedSize;
|
999
|
+
dictContentSize = ZDICT_DICTSIZE_MIN;
|
1000
|
+
|
1001
|
+
/* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
|
1002
|
+
while (dictContentSize < largestDict) {
|
1003
|
+
memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
|
1004
|
+
dictContentSize = ZDICT_finalizeDictionary(
|
1005
|
+
candidateDictBuffer, dictContentSize, customDictContentEnd - dictContentSize, dictContentSize,
|
1006
|
+
samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
|
1007
|
+
|
1008
|
+
if (ZDICT_isError(dictContentSize)) {
|
1009
|
+
free(largestDictbuffer);
|
1010
|
+
free(candidateDictBuffer);
|
1011
|
+
return COVER_dictSelectionError(dictContentSize);
|
1012
|
+
|
1013
|
+
}
|
1014
|
+
|
1015
|
+
totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
|
1016
|
+
samplesBuffer, offsets,
|
1017
|
+
nbCheckSamples, nbSamples,
|
1018
|
+
candidateDictBuffer, dictContentSize);
|
1019
|
+
|
1020
|
+
if (ZSTD_isError(totalCompressedSize)) {
|
1021
|
+
free(largestDictbuffer);
|
1022
|
+
free(candidateDictBuffer);
|
1023
|
+
return COVER_dictSelectionError(totalCompressedSize);
|
1024
|
+
}
|
1025
|
+
|
1026
|
+
if (totalCompressedSize <= largestCompressed * regressionTolerance) {
|
1027
|
+
COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
|
1028
|
+
free(largestDictbuffer);
|
1029
|
+
return selection;
|
1030
|
+
}
|
1031
|
+
dictContentSize *= 2;
|
1032
|
+
}
|
1033
|
+
dictContentSize = largestDict;
|
1034
|
+
totalCompressedSize = largestCompressed;
|
1035
|
+
{
|
1036
|
+
COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
|
1037
|
+
free(candidateDictBuffer);
|
1038
|
+
return selection;
|
1039
|
+
}
|
1040
|
+
}
|
1041
|
+
|
929
1042
|
/**
|
930
1043
|
* Parameters for COVER_tryParameters().
|
931
1044
|
*/
|
@@ -951,6 +1064,7 @@ static void COVER_tryParameters(void *opaque) {
|
|
951
1064
|
/* Allocate space for hash table, dict, and freqs */
|
952
1065
|
COVER_map_t activeDmers;
|
953
1066
|
BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
|
1067
|
+
COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
|
954
1068
|
U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
|
955
1069
|
if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
|
956
1070
|
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
|
@@ -966,29 +1080,21 @@ static void COVER_tryParameters(void *opaque) {
|
|
966
1080
|
{
|
967
1081
|
const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
|
968
1082
|
dictBufferCapacity, parameters);
|
969
|
-
|
970
|
-
|
971
|
-
|
972
|
-
|
973
|
-
if (
|
974
|
-
DISPLAYLEVEL(1, "Failed to
|
1083
|
+
selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
|
1084
|
+
ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
|
1085
|
+
totalCompressedSize);
|
1086
|
+
|
1087
|
+
if (COVER_dictSelectionIsError(selection)) {
|
1088
|
+
DISPLAYLEVEL(1, "Failed to select dictionary\n");
|
975
1089
|
goto _cleanup;
|
976
1090
|
}
|
977
1091
|
}
|
978
|
-
/* Check total compressed size */
|
979
|
-
totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
|
980
|
-
ctx->samples, ctx->offsets,
|
981
|
-
ctx->nbTrainSamples, ctx->nbSamples,
|
982
|
-
dict, dictBufferCapacity);
|
983
|
-
|
984
1092
|
_cleanup:
|
985
|
-
|
986
|
-
|
1093
|
+
free(dict);
|
1094
|
+
COVER_best_finish(data->best, parameters, selection);
|
987
1095
|
free(data);
|
988
1096
|
COVER_map_destroy(&activeDmers);
|
989
|
-
|
990
|
-
free(dict);
|
991
|
-
}
|
1097
|
+
COVER_dictSelectionFree(selection);
|
992
1098
|
if (freqs) {
|
993
1099
|
free(freqs);
|
994
1100
|
}
|
@@ -1010,6 +1116,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
1010
1116
|
const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
|
1011
1117
|
const unsigned kIterations =
|
1012
1118
|
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
|
1119
|
+
const unsigned shrinkDict = 0;
|
1013
1120
|
/* Local variables */
|
1014
1121
|
const int displayLevel = parameters->zParams.notificationLevel;
|
1015
1122
|
unsigned iteration = 1;
|
@@ -1022,15 +1129,15 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
1022
1129
|
/* Checks */
|
1023
1130
|
if (splitPoint <= 0 || splitPoint > 1) {
|
1024
1131
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
|
1025
|
-
return ERROR(
|
1132
|
+
return ERROR(parameter_outOfBound);
|
1026
1133
|
}
|
1027
1134
|
if (kMinK < kMaxD || kMaxK < kMinK) {
|
1028
1135
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
|
1029
|
-
return ERROR(
|
1136
|
+
return ERROR(parameter_outOfBound);
|
1030
1137
|
}
|
1031
1138
|
if (nbSamples == 0) {
|
1032
1139
|
DISPLAYLEVEL(1, "Cover must have at least one input file\n");
|
1033
|
-
return ERROR(
|
1140
|
+
return ERROR(srcSize_wrong);
|
1034
1141
|
}
|
1035
1142
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
1036
1143
|
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
|
@@ -1054,11 +1161,14 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
1054
1161
|
/* Initialize the context for this value of d */
|
1055
1162
|
COVER_ctx_t ctx;
|
1056
1163
|
LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
|
1057
|
-
|
1058
|
-
|
1059
|
-
|
1060
|
-
|
1061
|
-
|
1164
|
+
{
|
1165
|
+
const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);
|
1166
|
+
if (ZSTD_isError(initVal)) {
|
1167
|
+
LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
|
1168
|
+
COVER_best_destroy(&best);
|
1169
|
+
POOL_free(pool);
|
1170
|
+
return initVal;
|
1171
|
+
}
|
1062
1172
|
}
|
1063
1173
|
if (!warned) {
|
1064
1174
|
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
|
@@ -1075,7 +1185,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
1075
1185
|
COVER_best_destroy(&best);
|
1076
1186
|
COVER_ctx_destroy(&ctx);
|
1077
1187
|
POOL_free(pool);
|
1078
|
-
return ERROR(
|
1188
|
+
return ERROR(memory_allocation);
|
1079
1189
|
}
|
1080
1190
|
data->ctx = &ctx;
|
1081
1191
|
data->best = &best;
|
@@ -1085,6 +1195,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
|
1085
1195
|
data->parameters.d = d;
|
1086
1196
|
data->parameters.splitPoint = splitPoint;
|
1087
1197
|
data->parameters.steps = kSteps;
|
1198
|
+
data->parameters.shrinkDict = shrinkDict;
|
1088
1199
|
data->parameters.zParams.notificationLevel = g_displayLevel;
|
1089
1200
|
/* Check the parameters */
|
1090
1201
|
if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
|
@@ -46,6 +46,15 @@ typedef struct {
|
|
46
46
|
U32 size;
|
47
47
|
} COVER_epoch_info_t;
|
48
48
|
|
49
|
+
/**
|
50
|
+
* Struct used for the dictionary selection function.
|
51
|
+
*/
|
52
|
+
typedef struct COVER_dictSelection {
|
53
|
+
BYTE* dictContent;
|
54
|
+
size_t dictSize;
|
55
|
+
size_t totalCompressedSize;
|
56
|
+
} COVER_dictSelection_t;
|
57
|
+
|
49
58
|
/**
|
50
59
|
* Computes the number of epochs and the size of each epoch.
|
51
60
|
* We will make sure that each epoch gets at least 10 * k bytes.
|
@@ -107,6 +116,32 @@ void COVER_best_start(COVER_best_t *best);
|
|
107
116
|
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
|
108
117
|
* If this dictionary is the best so far save it and its parameters.
|
109
118
|
*/
|
110
|
-
void COVER_best_finish(COVER_best_t *best,
|
111
|
-
|
112
|
-
|
119
|
+
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
|
120
|
+
COVER_dictSelection_t selection);
|
121
|
+
/**
|
122
|
+
* Error function for COVER_selectDict function. Checks if the return
|
123
|
+
* value is an error.
|
124
|
+
*/
|
125
|
+
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
|
126
|
+
|
127
|
+
/**
|
128
|
+
* Error function for COVER_selectDict function. Returns a struct where
|
129
|
+
* return.totalCompressedSize is a ZSTD error.
|
130
|
+
*/
|
131
|
+
COVER_dictSelection_t COVER_dictSelectionError(size_t error);
|
132
|
+
|
133
|
+
/**
|
134
|
+
* Always call after selectDict is called to free up used memory from
|
135
|
+
* newly created dictionary.
|
136
|
+
*/
|
137
|
+
void COVER_dictSelectionFree(COVER_dictSelection_t selection);
|
138
|
+
|
139
|
+
/**
|
140
|
+
* Called to finalize the dictionary and select one based on whether or not
|
141
|
+
* the shrink-dict flag was enabled. If enabled the dictionary used is the
|
142
|
+
* smallest dictionary within a specified regression of the compressed size
|
143
|
+
* from the largest dictionary.
|
144
|
+
*/
|
145
|
+
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
|
146
|
+
size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
|
147
|
+
size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
|
@@ -287,10 +287,10 @@ FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx)
|
|
287
287
|
* Prepare a context for dictionary building.
|
288
288
|
* The context is only dependent on the parameter `d` and can be used multiple
|
289
289
|
* times.
|
290
|
-
* Returns
|
290
|
+
* Returns 0 on success or error code on error.
|
291
291
|
* The context must be destroyed with `FASTCOVER_ctx_destroy()`.
|
292
292
|
*/
|
293
|
-
static
|
293
|
+
static size_t
|
294
294
|
FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
|
295
295
|
const void* samplesBuffer,
|
296
296
|
const size_t* samplesSizes, unsigned nbSamples,
|
@@ -310,19 +310,19 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
|
|
310
310
|
totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
|
311
311
|
DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
|
312
312
|
(unsigned)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
|
313
|
-
return
|
313
|
+
return ERROR(srcSize_wrong);
|
314
314
|
}
|
315
315
|
|
316
316
|
/* Check if there are at least 5 training samples */
|
317
317
|
if (nbTrainSamples < 5) {
|
318
318
|
DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
|
319
|
-
return
|
319
|
+
return ERROR(srcSize_wrong);
|
320
320
|
}
|
321
321
|
|
322
322
|
/* Check if there's testing sample */
|
323
323
|
if (nbTestSamples < 1) {
|
324
324
|
DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
|
325
|
-
return
|
325
|
+
return ERROR(srcSize_wrong);
|
326
326
|
}
|
327
327
|
|
328
328
|
/* Zero the context */
|
@@ -347,7 +347,7 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
|
|
347
347
|
if (ctx->offsets == NULL) {
|
348
348
|
DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n");
|
349
349
|
FASTCOVER_ctx_destroy(ctx);
|
350
|
-
return
|
350
|
+
return ERROR(memory_allocation);
|
351
351
|
}
|
352
352
|
|
353
353
|
/* Fill offsets from the samplesSizes */
|
@@ -364,13 +364,13 @@ FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
|
|
364
364
|
if (ctx->freqs == NULL) {
|
365
365
|
DISPLAYLEVEL(1, "Failed to allocate frequency table \n");
|
366
366
|
FASTCOVER_ctx_destroy(ctx);
|
367
|
-
return
|
367
|
+
return ERROR(memory_allocation);
|
368
368
|
}
|
369
369
|
|
370
370
|
DISPLAYLEVEL(2, "Computing frequencies\n");
|
371
371
|
FASTCOVER_computeFrequency(ctx->freqs, ctx);
|
372
372
|
|
373
|
-
return
|
373
|
+
return 0;
|
374
374
|
}
|
375
375
|
|
376
376
|
|
@@ -435,7 +435,6 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
|
|
435
435
|
return tail;
|
436
436
|
}
|
437
437
|
|
438
|
-
|
439
438
|
/**
|
440
439
|
* Parameters for FASTCOVER_tryParameters().
|
441
440
|
*/
|
@@ -464,6 +463,7 @@ static void FASTCOVER_tryParameters(void *opaque)
|
|
464
463
|
U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
|
465
464
|
/* Allocate space for hash table, dict, and freqs */
|
466
465
|
BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
|
466
|
+
COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
|
467
467
|
U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
|
468
468
|
if (!segmentFreqs || !dict || !freqs) {
|
469
469
|
DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
|
@@ -473,27 +473,24 @@ static void FASTCOVER_tryParameters(void *opaque)
|
|
473
473
|
memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
|
474
474
|
/* Build the dictionary */
|
475
475
|
{ const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
|
476
|
-
|
476
|
+
parameters, segmentFreqs);
|
477
|
+
|
477
478
|
const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
479
|
+
selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
|
480
|
+
ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
|
481
|
+
totalCompressedSize);
|
482
|
+
|
483
|
+
if (COVER_dictSelectionIsError(selection)) {
|
484
|
+
DISPLAYLEVEL(1, "Failed to select dictionary\n");
|
483
485
|
goto _cleanup;
|
484
486
|
}
|
485
487
|
}
|
486
|
-
/* Check total compressed size */
|
487
|
-
totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
|
488
|
-
ctx->samples, ctx->offsets,
|
489
|
-
ctx->nbTrainSamples, ctx->nbSamples,
|
490
|
-
dict, dictBufferCapacity);
|
491
488
|
_cleanup:
|
492
|
-
|
493
|
-
|
489
|
+
free(dict);
|
490
|
+
COVER_best_finish(data->best, parameters, selection);
|
494
491
|
free(data);
|
495
492
|
free(segmentFreqs);
|
496
|
-
|
493
|
+
COVER_dictSelectionFree(selection);
|
497
494
|
free(freqs);
|
498
495
|
}
|
499
496
|
|
@@ -508,6 +505,7 @@ FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
|
|
508
505
|
coverParams->nbThreads = fastCoverParams.nbThreads;
|
509
506
|
coverParams->splitPoint = fastCoverParams.splitPoint;
|
510
507
|
coverParams->zParams = fastCoverParams.zParams;
|
508
|
+
coverParams->shrinkDict = fastCoverParams.shrinkDict;
|
511
509
|
}
|
512
510
|
|
513
511
|
|
@@ -524,6 +522,7 @@ FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
|
|
524
522
|
fastCoverParams->f = f;
|
525
523
|
fastCoverParams->accel = accel;
|
526
524
|
fastCoverParams->zParams = coverParams.zParams;
|
525
|
+
fastCoverParams->shrinkDict = coverParams.shrinkDict;
|
527
526
|
}
|
528
527
|
|
529
528
|
|
@@ -550,11 +549,11 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
|
|
550
549
|
if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f,
|
551
550
|
parameters.accel)) {
|
552
551
|
DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
|
553
|
-
return ERROR(
|
552
|
+
return ERROR(parameter_outOfBound);
|
554
553
|
}
|
555
554
|
if (nbSamples == 0) {
|
556
555
|
DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
|
557
|
-
return ERROR(
|
556
|
+
return ERROR(srcSize_wrong);
|
558
557
|
}
|
559
558
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
560
559
|
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
|
@@ -564,11 +563,14 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
|
|
564
563
|
/* Assign corresponding FASTCOVER_accel_t to accelParams*/
|
565
564
|
accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];
|
566
565
|
/* Initialize context */
|
567
|
-
|
566
|
+
{
|
567
|
+
size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
|
568
568
|
coverParams.d, parameters.splitPoint, parameters.f,
|
569
|
-
accelParams)
|
570
|
-
|
571
|
-
|
569
|
+
accelParams);
|
570
|
+
if (ZSTD_isError(initVal)) {
|
571
|
+
DISPLAYLEVEL(1, "Failed to initialize context\n");
|
572
|
+
return initVal;
|
573
|
+
}
|
572
574
|
}
|
573
575
|
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, g_displayLevel);
|
574
576
|
/* Build the dictionary */
|
@@ -616,6 +618,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
616
618
|
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
|
617
619
|
const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
|
618
620
|
const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
|
621
|
+
const unsigned shrinkDict = 0;
|
619
622
|
/* Local variables */
|
620
623
|
const int displayLevel = parameters->zParams.notificationLevel;
|
621
624
|
unsigned iteration = 1;
|
@@ -627,19 +630,19 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
627
630
|
/* Checks */
|
628
631
|
if (splitPoint <= 0 || splitPoint > 1) {
|
629
632
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
|
630
|
-
return ERROR(
|
633
|
+
return ERROR(parameter_outOfBound);
|
631
634
|
}
|
632
635
|
if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
|
633
636
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
|
634
|
-
return ERROR(
|
637
|
+
return ERROR(parameter_outOfBound);
|
635
638
|
}
|
636
639
|
if (kMinK < kMaxD || kMaxK < kMinK) {
|
637
640
|
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
|
638
|
-
return ERROR(
|
641
|
+
return ERROR(parameter_outOfBound);
|
639
642
|
}
|
640
643
|
if (nbSamples == 0) {
|
641
644
|
LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
|
642
|
-
return ERROR(
|
645
|
+
return ERROR(srcSize_wrong);
|
643
646
|
}
|
644
647
|
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
|
645
648
|
LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
|
@@ -666,11 +669,14 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
666
669
|
/* Initialize the context for this value of d */
|
667
670
|
FASTCOVER_ctx_t ctx;
|
668
671
|
LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
672
|
+
{
|
673
|
+
size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams);
|
674
|
+
if (ZSTD_isError(initVal)) {
|
675
|
+
LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
|
676
|
+
COVER_best_destroy(&best);
|
677
|
+
POOL_free(pool);
|
678
|
+
return initVal;
|
679
|
+
}
|
674
680
|
}
|
675
681
|
if (!warned) {
|
676
682
|
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel);
|
@@ -687,7 +693,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
687
693
|
COVER_best_destroy(&best);
|
688
694
|
FASTCOVER_ctx_destroy(&ctx);
|
689
695
|
POOL_free(pool);
|
690
|
-
return ERROR(
|
696
|
+
return ERROR(memory_allocation);
|
691
697
|
}
|
692
698
|
data->ctx = &ctx;
|
693
699
|
data->best = &best;
|
@@ -697,6 +703,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
|
697
703
|
data->parameters.d = d;
|
698
704
|
data->parameters.splitPoint = splitPoint;
|
699
705
|
data->parameters.steps = kSteps;
|
706
|
+
data->parameters.shrinkDict = shrinkDict;
|
700
707
|
data->parameters.zParams.notificationLevel = g_displayLevel;
|
701
708
|
/* Check the parameters */
|
702
709
|
if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
|