isomorfeus-ferret 0.17.0 → 0.17.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/isomorfeus_ferret_ext/bm_store.c +2 -0
- data/ext/isomorfeus_ferret_ext/frb_index.c +0 -35
- data/ext/isomorfeus_ferret_ext/frt_except.c +42 -0
- data/ext/isomorfeus_ferret_ext/frt_except.h +2 -0
- data/ext/isomorfeus_ferret_ext/frt_in_stream.c +492 -0
- data/ext/isomorfeus_ferret_ext/frt_in_stream.h +240 -0
- data/ext/isomorfeus_ferret_ext/frt_ind.c +0 -11
- data/ext/isomorfeus_ferret_ext/frt_ind.h +0 -1
- data/ext/isomorfeus_ferret_ext/frt_index.c +6 -1069
- data/ext/isomorfeus_ferret_ext/frt_index.h +1 -43
- data/ext/isomorfeus_ferret_ext/frt_lazy_doc.c +29 -0
- data/ext/isomorfeus_ferret_ext/frt_lazy_doc.h +19 -0
- data/ext/isomorfeus_ferret_ext/frt_lazy_doc_field.c +100 -0
- data/ext/isomorfeus_ferret_ext/frt_lazy_doc_field.h +33 -0
- data/ext/isomorfeus_ferret_ext/frt_out_stream.c +346 -0
- data/ext/isomorfeus_ferret_ext/frt_out_stream.h +198 -0
- data/ext/isomorfeus_ferret_ext/frt_ram_store.h +12 -0
- data/ext/isomorfeus_ferret_ext/frt_store.c +2 -513
- data/ext/isomorfeus_ferret_ext/frt_store.h +4 -443
- data/ext/isomorfeus_ferret_ext/frt_stream.h +18 -0
- data/ext/isomorfeus_ferret_ext/test_file_deleter.c +0 -1
- data/ext/isomorfeus_ferret_ext/test_index.c +0 -8
- data/ext/isomorfeus_ferret_ext/test_threading.c +1 -10
- data/lib/isomorfeus/ferret/index/index.rb +0 -11
- data/lib/isomorfeus/ferret/version.rb +1 -1
- metadata +22 -3
@@ -1,4 +1,6 @@
|
|
1
1
|
#include "frt_global.h"
|
2
|
+
#include "frt_lazy_doc_field.h"
|
3
|
+
#include "frt_lazy_doc.h"
|
2
4
|
#include "frt_index.h"
|
3
5
|
#include "frt_similarity.h"
|
4
6
|
#include "frt_helper.h"
|
@@ -6,13 +8,6 @@
|
|
6
8
|
#include <string.h>
|
7
9
|
#include <limits.h>
|
8
10
|
#include <ctype.h>
|
9
|
-
#include "brotli_decode.h"
|
10
|
-
#include "brotli_encode.h"
|
11
|
-
#include "bzlib.h"
|
12
|
-
#include "lz4frame.h"
|
13
|
-
|
14
|
-
// #undef close
|
15
|
-
// #undef read
|
16
11
|
|
17
12
|
extern rb_encoding *utf8_encoding;
|
18
13
|
extern void frt_micro_sleep(const int micro_seconds);
|
@@ -46,9 +41,6 @@ static char *ste_next(FrtTermEnum *te);
|
|
46
41
|
#define FORMAT 15
|
47
42
|
#define SEGMENTS_GEN_FILE_NAME "segments"
|
48
43
|
#define MAX_EXT_LEN 10
|
49
|
-
#define FRT_COMPRESSION_BUFFER_SIZE 16348
|
50
|
-
#define FRT_BROTLI_COMPRESSION_LEVEL 4
|
51
|
-
#define FRT_BZIP_COMPRESSION_LEVEL 9
|
52
44
|
|
53
45
|
/* *** Must be three characters *** */
|
54
46
|
static const char *INDEX_EXTENSIONS[] = {
|
@@ -590,29 +582,6 @@ static char *si_norm_file_name(FrtSegmentInfo *si, char *buf, int field_num)
|
|
590
582
|
|
591
583
|
void frt_deleter_queue_file(FrtDeleter *dlr, const char *file_name);
|
592
584
|
|
593
|
-
static void si_delete_files(FrtSegmentInfo *si, FrtFieldInfos *fis, FrtDeleter *dlr)
|
594
|
-
{
|
595
|
-
int i;
|
596
|
-
char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
|
597
|
-
size_t seg_len = strlen(si->name);
|
598
|
-
char *ext;
|
599
|
-
|
600
|
-
for (i = si->norm_gens_size - 1; i >= 0; i--) {
|
601
|
-
if (0 <= si->norm_gens[i]) {
|
602
|
-
frt_deleter_queue_file(dlr, si_norm_file_name(si, file_name, fis->fields[i]->number));
|
603
|
-
}
|
604
|
-
}
|
605
|
-
|
606
|
-
memcpy(file_name, si->name, seg_len);
|
607
|
-
file_name[seg_len] = '.';
|
608
|
-
ext = file_name + seg_len + 1;
|
609
|
-
|
610
|
-
for (i = FRT_NELEMS(INDEX_EXTENSIONS) - 1; i >= 0; i--) {
|
611
|
-
memcpy(ext, INDEX_EXTENSIONS[i], 4);
|
612
|
-
frt_deleter_queue_file(dlr, file_name);
|
613
|
-
}
|
614
|
-
}
|
615
|
-
|
616
585
|
/****************************************************************************
|
617
586
|
*
|
618
587
|
* SegmentInfos
|
@@ -1044,355 +1013,6 @@ frt_u64 frt_sis_read_current_version(FrtStore *store)
|
|
1044
1013
|
return fsf.ret.uint64;
|
1045
1014
|
}
|
1046
1015
|
|
1047
|
-
/****************************************************************************
|
1048
|
-
*
|
1049
|
-
* LazyDocField
|
1050
|
-
*
|
1051
|
-
****************************************************************************/
|
1052
|
-
|
1053
|
-
static FrtLazyDocField *lazy_df_new(ID name, const int size, FrtCompressionType compression) {
|
1054
|
-
FrtLazyDocField *self = FRT_ALLOC(FrtLazyDocField);
|
1055
|
-
self->name = name;
|
1056
|
-
self->size = size;
|
1057
|
-
self->data = FRT_ALLOC_AND_ZERO_N(FrtLazyDocFieldData, size);
|
1058
|
-
self->compression = compression;
|
1059
|
-
self->decompressed = false;
|
1060
|
-
self->loaded = false;
|
1061
|
-
return self;
|
1062
|
-
}
|
1063
|
-
|
1064
|
-
static void lazy_df_destroy(FrtLazyDocField *self) {
|
1065
|
-
int i;
|
1066
|
-
for (i = self->size - 1; i >= 0; i--) {
|
1067
|
-
if (self->data[i].text) {
|
1068
|
-
free(self->data[i].text);
|
1069
|
-
}
|
1070
|
-
}
|
1071
|
-
free(self->data);
|
1072
|
-
free(self);
|
1073
|
-
}
|
1074
|
-
|
1075
|
-
static void comp_raise(void) {
|
1076
|
-
FRT_RAISE(EXCEPTION, "Compression error");
|
1077
|
-
}
|
1078
|
-
|
1079
|
-
static char *is_read_brotli_compressed_bytes(FrtInStream *is, int compressed_len, int *len) {
|
1080
|
-
int buf_out_idx = 0;
|
1081
|
-
int read_len;
|
1082
|
-
frt_uchar buf_in[FRT_COMPRESSION_BUFFER_SIZE];
|
1083
|
-
const frt_uchar *next_in;
|
1084
|
-
size_t available_in;
|
1085
|
-
frt_uchar *buf_out = NULL;
|
1086
|
-
frt_uchar *next_out;
|
1087
|
-
size_t available_out;
|
1088
|
-
|
1089
|
-
BrotliDecoderState *b_state = BrotliDecoderCreateInstance(NULL, NULL, NULL);
|
1090
|
-
BrotliDecoderResult b_result = BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT;
|
1091
|
-
if (!b_state) { comp_raise(); return NULL; }
|
1092
|
-
|
1093
|
-
do {
|
1094
|
-
read_len = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
|
1095
|
-
frt_is_read_bytes(is, buf_in, read_len);
|
1096
|
-
compressed_len -= read_len;
|
1097
|
-
available_in = read_len;
|
1098
|
-
next_in = buf_in;
|
1099
|
-
available_out = FRT_COMPRESSION_BUFFER_SIZE;
|
1100
|
-
do {
|
1101
|
-
FRT_REALLOC_N(buf_out, frt_uchar, buf_out_idx + FRT_COMPRESSION_BUFFER_SIZE);
|
1102
|
-
next_out = buf_out + buf_out_idx;
|
1103
|
-
b_result = BrotliDecoderDecompressStream(b_state,
|
1104
|
-
&available_in, &next_in,
|
1105
|
-
&available_out, &next_out, NULL);
|
1106
|
-
if (b_result == BROTLI_DECODER_RESULT_ERROR) { comp_raise(); return NULL; }
|
1107
|
-
buf_out_idx += FRT_COMPRESSION_BUFFER_SIZE - available_out;
|
1108
|
-
} while (b_result == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT);
|
1109
|
-
} while (b_result != BROTLI_DECODER_RESULT_SUCCESS && compressed_len > 0);
|
1110
|
-
|
1111
|
-
BrotliDecoderDestroyInstance(b_state);
|
1112
|
-
|
1113
|
-
FRT_REALLOC_N(buf_out, frt_uchar, buf_out_idx + 1);
|
1114
|
-
buf_out[buf_out_idx] = '\0';
|
1115
|
-
*len = buf_out_idx;
|
1116
|
-
return (char *)buf_out;
|
1117
|
-
}
|
1118
|
-
|
1119
|
-
static void zraise(int ret) {
|
1120
|
-
switch (ret) {
|
1121
|
-
case BZ_IO_ERROR:
|
1122
|
-
if (ferror(stdin))
|
1123
|
-
FRT_RAISE(FRT_IO_ERROR, "bzlib: error reading stdin");
|
1124
|
-
if (ferror(stdout))
|
1125
|
-
FRT_RAISE(FRT_IO_ERROR, "bzlib: error writing stdout");
|
1126
|
-
break;
|
1127
|
-
case BZ_CONFIG_ERROR:
|
1128
|
-
FRT_RAISE(FRT_IO_ERROR, "bzlib: system configuration error");
|
1129
|
-
break;
|
1130
|
-
case BZ_SEQUENCE_ERROR: /* shouldn't occur if code is correct */
|
1131
|
-
FRT_RAISE(FRT_IO_ERROR, "bzlib: !!BUG!! sequence error");
|
1132
|
-
break;
|
1133
|
-
case BZ_PARAM_ERROR: /* shouldn't occur if code is correct */
|
1134
|
-
FRT_RAISE(FRT_IO_ERROR, "bzlib: !!BUG!! parameter error");
|
1135
|
-
break;
|
1136
|
-
case BZ_MEM_ERROR:
|
1137
|
-
FRT_RAISE(FRT_IO_ERROR, "bzlib: memory error");
|
1138
|
-
break;
|
1139
|
-
case BZ_DATA_ERROR:
|
1140
|
-
FRT_RAISE(FRT_IO_ERROR, "bzlib: data integrity check error");
|
1141
|
-
break;
|
1142
|
-
case BZ_DATA_ERROR_MAGIC:
|
1143
|
-
FRT_RAISE(FRT_IO_ERROR, "bzlib: data integrity check - non-matching magic");
|
1144
|
-
break;
|
1145
|
-
case BZ_UNEXPECTED_EOF:
|
1146
|
-
FRT_RAISE(FRT_IO_ERROR, "bzlib: unexpected end-of-file");
|
1147
|
-
break;
|
1148
|
-
case BZ_OUTBUFF_FULL:
|
1149
|
-
FRT_RAISE(FRT_IO_ERROR, "bzlib: output buffer full");
|
1150
|
-
break;
|
1151
|
-
default:
|
1152
|
-
FRT_RAISE(FRT_EXCEPTION, "bzlib: unknown error");
|
1153
|
-
}
|
1154
|
-
}
|
1155
|
-
|
1156
|
-
static char *is_read_bz2_compressed_bytes(FrtInStream *is, int compressed_len, int *len) {
|
1157
|
-
int buf_out_idx = 0, ret, read_len;
|
1158
|
-
char *buf_out = NULL;
|
1159
|
-
char buf_in[FRT_COMPRESSION_BUFFER_SIZE];
|
1160
|
-
bz_stream zstrm;
|
1161
|
-
zstrm.bzalloc = NULL;
|
1162
|
-
zstrm.bzfree = NULL;
|
1163
|
-
zstrm.opaque = NULL;
|
1164
|
-
zstrm.next_in = NULL;
|
1165
|
-
zstrm.avail_in = 0;
|
1166
|
-
if ((ret = BZ2_bzDecompressInit(&zstrm, 0, 0)) != BZ_OK) zraise(ret);
|
1167
|
-
|
1168
|
-
do {
|
1169
|
-
read_len = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
|
1170
|
-
frt_is_read_bytes(is, (frt_uchar *)buf_in, read_len);
|
1171
|
-
compressed_len -= read_len;
|
1172
|
-
zstrm.avail_in = read_len;
|
1173
|
-
zstrm.next_in = buf_in;
|
1174
|
-
zstrm.avail_out = FRT_COMPRESSION_BUFFER_SIZE;
|
1175
|
-
|
1176
|
-
do {
|
1177
|
-
REALLOC_N(buf_out, char, buf_out_idx + FRT_COMPRESSION_BUFFER_SIZE);
|
1178
|
-
zstrm.next_out = buf_out + buf_out_idx;
|
1179
|
-
ret = BZ2_bzDecompress(&zstrm);
|
1180
|
-
assert(ret != BZ_SEQUENCE_ERROR); /* state not clobbered */
|
1181
|
-
if (ret != BZ_OK && ret != BZ_STREAM_END) {
|
1182
|
-
(void)BZ2_bzDecompressEnd(&zstrm);
|
1183
|
-
zraise(ret);
|
1184
|
-
}
|
1185
|
-
buf_out_idx += FRT_COMPRESSION_BUFFER_SIZE - zstrm.avail_out;
|
1186
|
-
} while (zstrm.avail_out == 0);
|
1187
|
-
} while (ret != BZ_STREAM_END && compressed_len != 0);
|
1188
|
-
|
1189
|
-
(void)BZ2_bzDecompressEnd(&zstrm);
|
1190
|
-
|
1191
|
-
FRT_REALLOC_N(buf_out, char, buf_out_idx + 1);
|
1192
|
-
buf_out[buf_out_idx] = '\0';
|
1193
|
-
|
1194
|
-
*len = buf_out_idx;
|
1195
|
-
return (char *)buf_out;
|
1196
|
-
}
|
1197
|
-
|
1198
|
-
static char *is_read_lz4_compressed_bytes(FrtInStream *is, int compressed_len, int *length) {
|
1199
|
-
frt_uchar buf_in[FRT_COMPRESSION_BUFFER_SIZE];
|
1200
|
-
char *buf_out = NULL;
|
1201
|
-
int dc_length = 0;
|
1202
|
-
LZ4F_dctx *dctx;
|
1203
|
-
LZ4F_frameInfo_t frame_info;
|
1204
|
-
LZ4F_errorCode_t dctx_status = LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION);
|
1205
|
-
if (LZ4F_isError(dctx_status)) { *length = -1; return NULL; }
|
1206
|
-
|
1207
|
-
/* header and buffer */
|
1208
|
-
int read_length = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
|
1209
|
-
frt_is_read_bytes(is, buf_in, read_length);
|
1210
|
-
compressed_len -= read_length;
|
1211
|
-
|
1212
|
-
size_t consumed_size = read_length;
|
1213
|
-
size_t res = LZ4F_getFrameInfo(dctx, &frame_info, buf_in, &consumed_size);
|
1214
|
-
if (LZ4F_isError(res)) { *length = -1; return NULL; }
|
1215
|
-
size_t buf_out_length;
|
1216
|
-
switch(frame_info.blockSizeID) {
|
1217
|
-
case LZ4F_default:
|
1218
|
-
case LZ4F_max64KB:
|
1219
|
-
buf_out_length = 1 << 16;
|
1220
|
-
break;
|
1221
|
-
case LZ4F_max256KB:
|
1222
|
-
buf_out_length = 1 << 18;
|
1223
|
-
break;
|
1224
|
-
case LZ4F_max1MB:
|
1225
|
-
buf_out_length = 1 << 20;
|
1226
|
-
break;
|
1227
|
-
case LZ4F_max4MB:
|
1228
|
-
buf_out_length = 1 << 22;
|
1229
|
-
break;
|
1230
|
-
default:
|
1231
|
-
buf_out_length = 0;
|
1232
|
-
}
|
1233
|
-
|
1234
|
-
res = 1;
|
1235
|
-
int first_chunk = 1;
|
1236
|
-
|
1237
|
-
/* decompress data */
|
1238
|
-
while (res != 0) {
|
1239
|
-
if (!first_chunk) {
|
1240
|
-
read_length = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
|
1241
|
-
frt_is_read_bytes(is, buf_in, read_length);
|
1242
|
-
compressed_len -= read_length;
|
1243
|
-
consumed_size = 0;
|
1244
|
-
}
|
1245
|
-
first_chunk = 0;
|
1246
|
-
|
1247
|
-
char *src = (char *)(buf_in + consumed_size);
|
1248
|
-
char *src_end = (char *)buf_in + read_length;
|
1249
|
-
|
1250
|
-
while (src < src_end && res != 0){
|
1251
|
-
size_t dest_length = buf_out_length;
|
1252
|
-
size_t consumed_size = read_length;
|
1253
|
-
FRT_REALLOC_N(buf_out, char, dc_length + buf_out_length);
|
1254
|
-
res = LZ4F_decompress(dctx, buf_out + dc_length, &dest_length, src, &consumed_size, NULL);
|
1255
|
-
if (LZ4F_isError(res)) { *length = -1; return NULL; }
|
1256
|
-
dc_length += dest_length;
|
1257
|
-
src = src + consumed_size;
|
1258
|
-
}
|
1259
|
-
}
|
1260
|
-
|
1261
|
-
/* finish up */
|
1262
|
-
LZ4F_freeDecompressionContext(dctx);
|
1263
|
-
|
1264
|
-
FRT_REALLOC_N(buf_out, char, dc_length + 1);
|
1265
|
-
buf_out[dc_length] = '\0';
|
1266
|
-
|
1267
|
-
*length = dc_length;
|
1268
|
-
return buf_out;
|
1269
|
-
}
|
1270
|
-
|
1271
|
-
static char *is_read_compressed_bytes(FrtInStream *is, int compressed_len, int *len, FrtCompressionType compression) {
|
1272
|
-
switch (compression) {
|
1273
|
-
case FRT_COMPRESSION_BROTLI:
|
1274
|
-
return is_read_brotli_compressed_bytes(is, compressed_len, len);
|
1275
|
-
case FRT_COMPRESSION_BZ2:
|
1276
|
-
return is_read_bz2_compressed_bytes(is, compressed_len, len);
|
1277
|
-
case FRT_COMPRESSION_LZ4:
|
1278
|
-
return is_read_lz4_compressed_bytes(is, compressed_len, len);
|
1279
|
-
default:
|
1280
|
-
return NULL;
|
1281
|
-
}
|
1282
|
-
}
|
1283
|
-
|
1284
|
-
char *frt_lazy_df_get_data(FrtLazyDocField *self, int i) {
|
1285
|
-
char *text = NULL;
|
1286
|
-
if (i < self->size && i >= 0) {
|
1287
|
-
text = self->data[i].text;
|
1288
|
-
if (NULL == text) {
|
1289
|
-
const int read_len = self->data[i].length + 1;
|
1290
|
-
frt_is_seek(self->doc->fields_in, self->data[i].start);
|
1291
|
-
if (self->data[i].compression != FRT_COMPRESSION_NONE) {
|
1292
|
-
self->data[i].text = text = is_read_compressed_bytes(self->doc->fields_in, read_len, &(self->data[i].length), self->data[i].compression);
|
1293
|
-
} else {
|
1294
|
-
self->data[i].text = text = FRT_ALLOC_N(char, read_len);
|
1295
|
-
frt_is_read_bytes(self->doc->fields_in, (frt_uchar *)text, read_len);
|
1296
|
-
text[read_len - 1] = '\0';
|
1297
|
-
}
|
1298
|
-
self->loaded = true;
|
1299
|
-
}
|
1300
|
-
}
|
1301
|
-
|
1302
|
-
return text;
|
1303
|
-
}
|
1304
|
-
|
1305
|
-
void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len) {
|
1306
|
-
if (self->compression != FRT_COMPRESSION_NONE && !self->decompressed) {
|
1307
|
-
int i;
|
1308
|
-
self->len = 0;
|
1309
|
-
for (i = self->size-1; i >= 0; i--) {
|
1310
|
-
(void)frt_lazy_df_get_data(self, i);
|
1311
|
-
self->len += self->data[i].length + 1;
|
1312
|
-
}
|
1313
|
-
self->len--; /* each field separated by ' ' but no need to add to end */
|
1314
|
-
self->decompressed = true;
|
1315
|
-
}
|
1316
|
-
if (start < 0 || start >= self->len) {
|
1317
|
-
FRT_RAISE(FRT_IO_ERROR, "start out of range in LazyDocField#get_bytes. %d "
|
1318
|
-
"is not between 0 and %d", start, self->len);
|
1319
|
-
}
|
1320
|
-
if (len <= 0) {
|
1321
|
-
FRT_RAISE(FRT_IO_ERROR, "len = %d, but should be greater than 0", len);
|
1322
|
-
}
|
1323
|
-
if (start + len > self->len) {
|
1324
|
-
FRT_RAISE(FRT_IO_ERROR, "Tried to read past end of field. Field is only %d "
|
1325
|
-
"bytes long but tried to read to %d", self->len, start + len);
|
1326
|
-
}
|
1327
|
-
if (self->compression != FRT_COMPRESSION_NONE) {
|
1328
|
-
int cur_start = 0, buf_start = 0, cur_end, i, copy_start, copy_len;
|
1329
|
-
for (i = 0; i < self->size; i++) {
|
1330
|
-
cur_end = cur_start + self->data[i].length;
|
1331
|
-
if (start < cur_end) {
|
1332
|
-
copy_start = start > cur_start ? start - cur_start : 0;
|
1333
|
-
copy_len = cur_end - cur_start - copy_start;
|
1334
|
-
if (copy_len >= len) {
|
1335
|
-
copy_len = len;
|
1336
|
-
len = 0;
|
1337
|
-
}
|
1338
|
-
else {
|
1339
|
-
len -= copy_len;
|
1340
|
-
}
|
1341
|
-
memcpy(buf + buf_start,
|
1342
|
-
self->data[i].text + copy_start,
|
1343
|
-
copy_len);
|
1344
|
-
buf_start += copy_len;
|
1345
|
-
if (len > 0) {
|
1346
|
-
buf[buf_start++] = ' ';
|
1347
|
-
len--;
|
1348
|
-
}
|
1349
|
-
if (len == 0) break;
|
1350
|
-
}
|
1351
|
-
cur_start = cur_end + 1;
|
1352
|
-
}
|
1353
|
-
} else {
|
1354
|
-
frt_is_seek(self->doc->fields_in, self->data[0].start + start);
|
1355
|
-
frt_is_read_bytes(self->doc->fields_in, (frt_uchar *)buf, len);
|
1356
|
-
}
|
1357
|
-
}
|
1358
|
-
|
1359
|
-
/****************************************************************************
|
1360
|
-
*
|
1361
|
-
* LazyDoc
|
1362
|
-
*
|
1363
|
-
****************************************************************************/
|
1364
|
-
|
1365
|
-
static FrtLazyDoc *lazy_doc_new(int size, FrtInStream *fdt_in)
|
1366
|
-
{
|
1367
|
-
FrtLazyDoc *self = FRT_ALLOC(FrtLazyDoc);
|
1368
|
-
self->field_dictionary = frt_h_new_ptr((frt_free_ft)&lazy_df_destroy);
|
1369
|
-
self->size = size;
|
1370
|
-
self->fields = FRT_ALLOC_AND_ZERO_N(FrtLazyDocField *, size);
|
1371
|
-
self->fields_in = frt_is_clone(fdt_in);
|
1372
|
-
self->loaded = false;
|
1373
|
-
return self;
|
1374
|
-
}
|
1375
|
-
|
1376
|
-
void frt_lazy_doc_close(FrtLazyDoc *self)
|
1377
|
-
{
|
1378
|
-
frt_h_destroy(self->field_dictionary);
|
1379
|
-
frt_is_close(self->fields_in);
|
1380
|
-
free(self->fields);
|
1381
|
-
free(self);
|
1382
|
-
}
|
1383
|
-
|
1384
|
-
static void lazy_doc_add_field(FrtLazyDoc *self, FrtLazyDocField *lazy_df, int i)
|
1385
|
-
{
|
1386
|
-
self->fields[i] = lazy_df;
|
1387
|
-
|
1388
|
-
frt_h_set(self->field_dictionary, (void *)lazy_df->name, lazy_df);
|
1389
|
-
lazy_df->doc = self;
|
1390
|
-
}
|
1391
|
-
|
1392
|
-
FrtLazyDocField *frt_lazy_doc_get(FrtLazyDoc *self, ID field) {
|
1393
|
-
return (FrtLazyDocField *)frt_h_get(self->field_dictionary, (void *)field);
|
1394
|
-
}
|
1395
|
-
|
1396
1016
|
/****************************************************************************
|
1397
1017
|
* FrtFieldsReader
|
1398
1018
|
****************************************************************************/
|
@@ -1457,7 +1077,7 @@ static void frt_fr_read_compressed_fields(FrtFieldsReader *fr, FrtDocField *df,
|
|
1457
1077
|
|
1458
1078
|
for (i = 0; i < df_size; i++) {
|
1459
1079
|
const int compressed_len = df->lengths[i] + 1;
|
1460
|
-
df->data[i] =
|
1080
|
+
df->data[i] = frt_is_read_compressed_bytes(fdt_in, compressed_len, &(df->lengths[i]), compression);
|
1461
1081
|
}
|
1462
1082
|
}
|
1463
1083
|
|
@@ -1522,11 +1142,11 @@ FrtLazyDoc *frt_fr_get_lazy_doc(FrtFieldsReader *fr, int doc_num)
|
|
1522
1142
|
frt_is_seek(fdt_in, pos);
|
1523
1143
|
stored_cnt = frt_is_read_vint(fdt_in);
|
1524
1144
|
|
1525
|
-
lazy_doc =
|
1145
|
+
lazy_doc = frt_lazy_doc_new(stored_cnt, fdt_in);
|
1526
1146
|
for (i = 0; i < stored_cnt; i++) {
|
1527
1147
|
FrtFieldInfo *fi = fr->fis->fields[frt_is_read_vint(fdt_in)];
|
1528
1148
|
const int df_size = frt_is_read_vint(fdt_in);
|
1529
|
-
FrtLazyDocField *lazy_df =
|
1149
|
+
FrtLazyDocField *lazy_df = frt_lazy_df_new(fi->name, df_size, frt_fi_get_compression(fi));
|
1530
1150
|
const int field_start = start;
|
1531
1151
|
/* get the starts relative positions this time around */
|
1532
1152
|
|
@@ -1538,7 +1158,7 @@ FrtLazyDoc *frt_fr_get_lazy_doc(FrtFieldsReader *fr, int doc_num)
|
|
1538
1158
|
}
|
1539
1159
|
|
1540
1160
|
lazy_df->len = start - field_start - 1;
|
1541
|
-
|
1161
|
+
frt_lazy_doc_add_field(lazy_doc, lazy_df, i);
|
1542
1162
|
}
|
1543
1163
|
/* correct the starts to their correct absolute positions */
|
1544
1164
|
const frt_off_t abs_start = frt_is_pos(fdt_in);
|
@@ -1720,145 +1340,6 @@ void frt_fw_close(FrtFieldsWriter *fw) {
|
|
1720
1340
|
free(fw);
|
1721
1341
|
}
|
1722
1342
|
|
1723
|
-
static int frt_os_write_brotli_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length) {
|
1724
|
-
size_t compressed_length = 0;
|
1725
|
-
const frt_uchar *next_in = data;
|
1726
|
-
size_t available_in = length;
|
1727
|
-
size_t available_out;
|
1728
|
-
frt_uchar compression_buffer[FRT_COMPRESSION_BUFFER_SIZE];
|
1729
|
-
frt_uchar *next_out;
|
1730
|
-
BrotliEncoderState *b_state = BrotliEncoderCreateInstance(NULL, NULL, NULL);
|
1731
|
-
if (!b_state) { comp_raise(); return -1; }
|
1732
|
-
|
1733
|
-
BrotliEncoderSetParameter(b_state, BROTLI_PARAM_QUALITY, FRT_BROTLI_COMPRESSION_LEVEL);
|
1734
|
-
|
1735
|
-
do {
|
1736
|
-
available_out = FRT_COMPRESSION_BUFFER_SIZE;
|
1737
|
-
next_out = compression_buffer;
|
1738
|
-
if (!BrotliEncoderCompressStream(b_state, BROTLI_OPERATION_FINISH,
|
1739
|
-
&available_in, &next_in,
|
1740
|
-
&available_out, &next_out, &compressed_length)) {
|
1741
|
-
BrotliEncoderDestroyInstance(b_state);
|
1742
|
-
comp_raise();
|
1743
|
-
return -1;
|
1744
|
-
}
|
1745
|
-
frt_os_write_bytes(out_stream, compression_buffer, FRT_COMPRESSION_BUFFER_SIZE - available_out);
|
1746
|
-
} while (!BrotliEncoderIsFinished(b_state));
|
1747
|
-
|
1748
|
-
BrotliEncoderDestroyInstance(b_state);
|
1749
|
-
|
1750
|
-
return (int)compressed_length;
|
1751
|
-
}
|
1752
|
-
|
1753
|
-
static int frt_os_write_bz2_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length) {
|
1754
|
-
int ret, buf_size, compressed_len = 0;
|
1755
|
-
char out_buffer[FRT_COMPRESSION_BUFFER_SIZE];
|
1756
|
-
bz_stream zstrm;
|
1757
|
-
zstrm.bzalloc = NULL;
|
1758
|
-
zstrm.bzfree = NULL;
|
1759
|
-
zstrm.opaque = NULL;
|
1760
|
-
if ((ret = BZ2_bzCompressInit(&zstrm, FRT_BZIP_COMPRESSION_LEVEL, 0, 0)) != BZ_OK) zraise(ret);
|
1761
|
-
|
1762
|
-
zstrm.avail_in = length;
|
1763
|
-
zstrm.next_in = (char *)data;
|
1764
|
-
zstrm.avail_out = FRT_COMPRESSION_BUFFER_SIZE;
|
1765
|
-
zstrm.next_out = out_buffer;
|
1766
|
-
|
1767
|
-
do {
|
1768
|
-
ret = BZ2_bzCompress(&zstrm, BZ_FINISH); /* no bad return value */
|
1769
|
-
assert(ret != BZ_SEQUENCE_ERROR); /* state not clobbered */
|
1770
|
-
compressed_len += buf_size = FRT_COMPRESSION_BUFFER_SIZE - zstrm.avail_out;
|
1771
|
-
frt_os_write_bytes(out_stream, (frt_uchar *)out_buffer, buf_size);
|
1772
|
-
} while (zstrm.avail_out == 0);
|
1773
|
-
assert(zstrm.avail_in == 0); /* all input will be used */
|
1774
|
-
|
1775
|
-
(void)BZ2_bzCompressEnd(&zstrm);
|
1776
|
-
return compressed_len;
|
1777
|
-
}
|
1778
|
-
|
1779
|
-
static const LZ4F_preferences_t lz4_prefs = {
|
1780
|
-
{
|
1781
|
-
LZ4F_default,
|
1782
|
-
LZ4F_blockLinked,
|
1783
|
-
LZ4F_noContentChecksum,
|
1784
|
-
LZ4F_frame,
|
1785
|
-
0, /* unknown content size */
|
1786
|
-
0, /* no dictID */
|
1787
|
-
LZ4F_noBlockChecksum
|
1788
|
-
},
|
1789
|
-
0,
|
1790
|
-
1,
|
1791
|
-
1,
|
1792
|
-
{0,0,0}
|
1793
|
-
};
|
1794
|
-
|
1795
|
-
static int frt_os_write_lz4_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length) {
|
1796
|
-
int compressed_length = 0;
|
1797
|
-
int remaining_length = length;
|
1798
|
-
size_t ccmp_length = 0;
|
1799
|
-
LZ4F_compressionContext_t ctx;
|
1800
|
-
size_t out_buf_length = LZ4F_compressBound(FRT_COMPRESSION_BUFFER_SIZE, &lz4_prefs);
|
1801
|
-
frt_uchar *out_buf = frt_ecalloc(out_buf_length);
|
1802
|
-
|
1803
|
-
size_t ctx_creation = LZ4F_createCompressionContext(&ctx, LZ4F_VERSION);
|
1804
|
-
if (LZ4F_isError(ctx_creation)) {
|
1805
|
-
compressed_length = -1;
|
1806
|
-
goto finish;
|
1807
|
-
}
|
1808
|
-
|
1809
|
-
/* create header */
|
1810
|
-
ccmp_length = LZ4F_compressBegin(ctx, out_buf, out_buf_length, &lz4_prefs);
|
1811
|
-
if (LZ4F_isError(ccmp_length)) {
|
1812
|
-
compressed_length = -1;
|
1813
|
-
goto finish;
|
1814
|
-
}
|
1815
|
-
compressed_length = ccmp_length;
|
1816
|
-
frt_os_write_bytes(out_stream, out_buf, ccmp_length);
|
1817
|
-
|
1818
|
-
/* compress data */
|
1819
|
-
do {
|
1820
|
-
int read_length = (FRT_COMPRESSION_BUFFER_SIZE > remaining_length) ? remaining_length : FRT_COMPRESSION_BUFFER_SIZE;
|
1821
|
-
ccmp_length = LZ4F_compressUpdate(ctx, out_buf, out_buf_length, data + (length - remaining_length), read_length, NULL);
|
1822
|
-
if (LZ4F_isError(ccmp_length)) {
|
1823
|
-
compressed_length = -1;
|
1824
|
-
goto finish;
|
1825
|
-
}
|
1826
|
-
frt_os_write_bytes(out_stream, out_buf, ccmp_length);
|
1827
|
-
compressed_length += ccmp_length;
|
1828
|
-
remaining_length -= read_length;
|
1829
|
-
} while (remaining_length > 0);
|
1830
|
-
|
1831
|
-
/* finish up */
|
1832
|
-
ccmp_length = LZ4F_compressEnd(ctx, out_buf, out_buf_length, NULL);
|
1833
|
-
if (LZ4F_isError(ccmp_length)) {
|
1834
|
-
compressed_length = -1;
|
1835
|
-
goto finish;
|
1836
|
-
}
|
1837
|
-
|
1838
|
-
frt_os_write_bytes(out_stream, out_buf, ccmp_length);
|
1839
|
-
compressed_length += ccmp_length;
|
1840
|
-
|
1841
|
-
finish:
|
1842
|
-
LZ4F_freeCompressionContext(ctx);
|
1843
|
-
free(out_buf);
|
1844
|
-
|
1845
|
-
return compressed_length;
|
1846
|
-
}
|
1847
|
-
|
1848
|
-
static int frt_os_write_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length, FrtCompressionType compression) {
|
1849
|
-
switch (compression) {
|
1850
|
-
case FRT_COMPRESSION_BROTLI:
|
1851
|
-
return frt_os_write_brotli_compressed_bytes(out_stream, data, length);
|
1852
|
-
case FRT_COMPRESSION_BZ2:
|
1853
|
-
return frt_os_write_bz2_compressed_bytes(out_stream, data, length);
|
1854
|
-
case FRT_COMPRESSION_LZ4:
|
1855
|
-
return frt_os_write_lz4_compressed_bytes(out_stream, data, length);
|
1856
|
-
default:
|
1857
|
-
return -1;
|
1858
|
-
}
|
1859
|
-
|
1860
|
-
}
|
1861
|
-
|
1862
1343
|
void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc) {
|
1863
1344
|
int i, j, stored_cnt = 0;
|
1864
1345
|
FrtDocField *df;
|
@@ -5384,465 +4865,6 @@ void frt_dw_add_doc(FrtDocWriter *dw, FrtDocument *doc) {
|
|
5384
4865
|
* IndexWriter
|
5385
4866
|
*
|
5386
4867
|
****************************************************************************/
|
5387
|
-
/****************************************************************************
|
5388
|
-
* SegmentMergeInfo
|
5389
|
-
****************************************************************************/
|
5390
|
-
|
5391
|
-
typedef struct SegmentMergeInfo {
|
5392
|
-
int base;
|
5393
|
-
int max_doc;
|
5394
|
-
int doc_cnt;
|
5395
|
-
FrtSegmentInfo *si;
|
5396
|
-
FrtStore *store;
|
5397
|
-
FrtStore *orig_store;
|
5398
|
-
FrtBitVector *deleted_docs;
|
5399
|
-
FrtSegmentFieldIndex *sfi;
|
5400
|
-
FrtTermEnum *te;
|
5401
|
-
FrtTermDocEnum *tde;
|
5402
|
-
char *term;
|
5403
|
-
int *doc_map;
|
5404
|
-
FrtInStream *frq_in;
|
5405
|
-
FrtInStream *prx_in;
|
5406
|
-
} SegmentMergeInfo;
|
5407
|
-
|
5408
|
-
static bool smi_lt(const SegmentMergeInfo *smi1, const SegmentMergeInfo *smi2)
|
5409
|
-
{
|
5410
|
-
int cmpres = strcmp(smi1->term, smi2->term);
|
5411
|
-
if (0 == cmpres) {
|
5412
|
-
return smi1->base < smi2->base;
|
5413
|
-
}
|
5414
|
-
else {
|
5415
|
-
return cmpres < 0;
|
5416
|
-
}
|
5417
|
-
}
|
5418
|
-
|
5419
|
-
static void smi_load_doc_map(SegmentMergeInfo *smi)
|
5420
|
-
{
|
5421
|
-
FrtBitVector *deleted_docs = smi->deleted_docs;
|
5422
|
-
const int max_doc = smi->max_doc;
|
5423
|
-
int j = 0, i;
|
5424
|
-
|
5425
|
-
smi->doc_map = FRT_ALLOC_N(int, max_doc);
|
5426
|
-
for (i = 0; i < max_doc; i++) {
|
5427
|
-
if (frt_bv_get(deleted_docs, i)) {
|
5428
|
-
smi->doc_map[i] = -1;
|
5429
|
-
}
|
5430
|
-
else {
|
5431
|
-
smi->doc_map[i] = j++;
|
5432
|
-
}
|
5433
|
-
}
|
5434
|
-
smi->doc_cnt = j;
|
5435
|
-
}
|
5436
|
-
|
5437
|
-
static SegmentMergeInfo *smi_new(int base, FrtStore *store, FrtSegmentInfo *si)
|
5438
|
-
{
|
5439
|
-
SegmentMergeInfo *smi = FRT_ALLOC_AND_ZERO(SegmentMergeInfo);
|
5440
|
-
char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
|
5441
|
-
char *segment = si->name;
|
5442
|
-
smi->base = base;
|
5443
|
-
smi->si = si;
|
5444
|
-
smi->orig_store = smi->store = store;
|
5445
|
-
FRT_REF(smi->orig_store);
|
5446
|
-
|
5447
|
-
sprintf(file_name, "%s.fdx", segment);
|
5448
|
-
smi->doc_cnt = smi->max_doc
|
5449
|
-
= smi->store->length(smi->store, file_name) / FIELDS_IDX_PTR_SIZE;
|
5450
|
-
|
5451
|
-
if (si->del_gen >= 0) {
|
5452
|
-
frt_fn_for_generation(file_name, segment, "del", si->del_gen);
|
5453
|
-
smi->deleted_docs = bv_read(store, file_name);
|
5454
|
-
smi_load_doc_map(smi);
|
5455
|
-
}
|
5456
|
-
return smi;
|
5457
|
-
}
|
5458
|
-
|
5459
|
-
static void smi_load_term_input(SegmentMergeInfo *smi)
|
5460
|
-
{
|
5461
|
-
FrtStore *store = smi->store;
|
5462
|
-
char *segment = smi->si->name;
|
5463
|
-
char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
|
5464
|
-
smi->sfi = frt_sfi_open(store, segment);
|
5465
|
-
sprintf(file_name, "%s.tis", segment);
|
5466
|
-
FrtInStream *is = store->open_input(store, file_name);
|
5467
|
-
FRT_DEREF(is);
|
5468
|
-
smi->te = TE(frt_ste_new(is, smi->sfi));
|
5469
|
-
sprintf(file_name, "%s.frq", segment);
|
5470
|
-
smi->frq_in = store->open_input(store, file_name);
|
5471
|
-
sprintf(file_name, "%s.prx", segment);
|
5472
|
-
smi->prx_in = store->open_input(store, file_name);
|
5473
|
-
smi->tde = frt_stpe_new(NULL, smi->frq_in, smi->prx_in, smi->deleted_docs,
|
5474
|
-
STE(smi->te)->skip_interval);
|
5475
|
-
}
|
5476
|
-
|
5477
|
-
static void smi_close_term_input(SegmentMergeInfo *smi)
|
5478
|
-
{
|
5479
|
-
frt_ste_close(smi->te);
|
5480
|
-
frt_sfi_close(smi->sfi);
|
5481
|
-
stpe_close(smi->tde);
|
5482
|
-
frt_is_close(smi->frq_in);
|
5483
|
-
frt_is_close(smi->prx_in);
|
5484
|
-
}
|
5485
|
-
|
5486
|
-
static void smi_destroy(SegmentMergeInfo *smi)
|
5487
|
-
{
|
5488
|
-
if (smi->store != smi->orig_store) {
|
5489
|
-
frt_store_close(smi->store);
|
5490
|
-
}
|
5491
|
-
frt_store_close(smi->orig_store);
|
5492
|
-
if (smi->deleted_docs) {
|
5493
|
-
frt_bv_destroy(smi->deleted_docs);
|
5494
|
-
free(smi->doc_map);
|
5495
|
-
}
|
5496
|
-
free(smi);
|
5497
|
-
}
|
5498
|
-
|
5499
|
-
static char *smi_next(SegmentMergeInfo *smi)
|
5500
|
-
{
|
5501
|
-
return (smi->term = ste_next(smi->te));
|
5502
|
-
}
|
5503
|
-
|
5504
|
-
/****************************************************************************
|
5505
|
-
* SegmentMerger
|
5506
|
-
****************************************************************************/
|
5507
|
-
|
5508
|
-
typedef struct SegmentMerger {
|
5509
|
-
FrtTermInfo ti;
|
5510
|
-
FrtStore *store;
|
5511
|
-
FrtFieldInfos *fis;
|
5512
|
-
FrtSegmentInfo *si;
|
5513
|
-
SegmentMergeInfo **smis;
|
5514
|
-
int seg_cnt;
|
5515
|
-
int doc_cnt;
|
5516
|
-
FrtConfig *config;
|
5517
|
-
FrtTermInfosWriter *tiw;
|
5518
|
-
char *term_buf;
|
5519
|
-
int term_buf_ptr;
|
5520
|
-
int term_buf_size;
|
5521
|
-
FrtPriorityQueue *queue;
|
5522
|
-
SkipBuffer *skip_buf;
|
5523
|
-
FrtOutStream *frq_out;
|
5524
|
-
FrtOutStream *prx_out;
|
5525
|
-
} SegmentMerger;
|
5526
|
-
|
5527
|
-
static SegmentMerger *sm_create(FrtIndexWriter *iw, FrtSegmentInfo *si, FrtSegmentInfo **seg_infos, const int seg_cnt)
|
5528
|
-
{
|
5529
|
-
int i;
|
5530
|
-
SegmentMerger *sm = FRT_ALLOC_AND_ZERO_N(SegmentMerger, seg_cnt);
|
5531
|
-
sm->store = iw->store;
|
5532
|
-
FRT_REF(sm->store);
|
5533
|
-
sm->fis = iw->fis;
|
5534
|
-
sm->si = si;
|
5535
|
-
sm->doc_cnt = 0;
|
5536
|
-
sm->smis = FRT_ALLOC_N(SegmentMergeInfo *, seg_cnt);
|
5537
|
-
for (i = 0; i < seg_cnt; i++) {
|
5538
|
-
sm->smis[i] = smi_new(sm->doc_cnt, seg_infos[i]->store, seg_infos[i]);
|
5539
|
-
sm->doc_cnt += sm->smis[i]->doc_cnt;
|
5540
|
-
}
|
5541
|
-
sm->seg_cnt = seg_cnt;
|
5542
|
-
sm->config = &iw->config;
|
5543
|
-
return sm;
|
5544
|
-
}
|
5545
|
-
|
5546
|
-
static void sm_destroy(SegmentMerger *sm)
|
5547
|
-
{
|
5548
|
-
int i;
|
5549
|
-
const int seg_cnt = sm->seg_cnt;
|
5550
|
-
for (i = 0; i < seg_cnt; i++) {
|
5551
|
-
smi_destroy(sm->smis[i]);
|
5552
|
-
}
|
5553
|
-
frt_store_close(sm->store);
|
5554
|
-
free(sm->smis);
|
5555
|
-
free(sm);
|
5556
|
-
}
|
5557
|
-
|
5558
|
-
/*
 * Merge the stored-field files (".fdt" data and ".fdx" index) of every input
 * segment into the new segment.
 *
 * As read here, each ".fdx" record is a 64-bit offset into ".fdt" followed
 * by a 32-bit value that is copied through unchanged. A document's data runs
 * from its own offset to the next document's offset, or to the end of the
 * ".fdt" file for the last document. Documents flagged in deleted_docs are
 * skipped.
 */
static void sm_merge_fields(SegmentMerger *sm)
{
    int i, j;
    frt_off_t start, end = 0;
    char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
    FrtOutStream *fdt_out, *fdx_out;
    FrtStore *store = sm->store;
    const int seg_cnt = sm->seg_cnt;

    /* snprintf bounds every write to file_name, whatever the segment name. */
    snprintf(file_name, sizeof(file_name), "%s.fdt", sm->si->name);
    fdt_out = store->new_output(store, file_name);

    snprintf(file_name, sizeof(file_name), "%s.fdx", sm->si->name);
    fdx_out = store->new_output(store, file_name);

    for (i = 0; i < seg_cnt; i++) {
        SegmentMergeInfo *smi = sm->smis[i];
        const int max_doc = smi->max_doc;
        FrtInStream *fdt_in, *fdx_in;
        char *segment = smi->si->name;

        /* input segments may live in a different store than the output */
        store = smi->store;
        snprintf(file_name, sizeof(file_name), "%s.fdt", segment);
        fdt_in = store->open_input(store, file_name);
        snprintf(file_name, sizeof(file_name), "%s.fdx", segment);
        fdx_in = store->open_input(store, file_name);

        if (max_doc > 0) {
            /* convert to frt_off_t, the type of start/end, so the 64-bit
             * offset is not narrowed where off_t is smaller */
            end = (frt_off_t)frt_is_read_u64(fdx_in);
        }
        for (j = 0; j < max_doc; j++) {
            frt_u32 tv_idx_offset = frt_is_read_u32(fdx_in);
            start = end;
            if (j == max_doc - 1) {
                end = frt_is_length(fdt_in);
            }
            else {
                end = (frt_off_t)frt_is_read_u64(fdx_in);
            }
            /* skip deleted docs */
            if (!smi->deleted_docs || !frt_bv_get(smi->deleted_docs, j)) {
                frt_os_write_u64(fdx_out, frt_os_pos(fdt_out));
                frt_os_write_u32(fdx_out, tv_idx_offset);
                frt_is_seek(fdt_in, start);
                frt_is2os_copy_bytes(fdt_in, fdt_out, end - start);
            }
        }
        frt_is_close(fdt_in);
        frt_is_close(fdx_in);
    }
    frt_os_close(fdt_out);
    frt_os_close(fdx_out);
}
|
5610
|
-
|
5611
|
-
/*
 * Append the postings of one term, gathered from every segment in `matches`,
 * to the merged ".frq" and ".prx" outputs.
 *
 * Document numbers are remapped through the segment's doc_map (when it has
 * one) and shifted by the segment's base. Each posting is written as a
 * delta-encoded document code whose low bit flags freq == 1. A skip entry is
 * recorded every skip_interval postings.
 *
 * Returns the number of postings written (the term's document frequency).
 */
static int sm_append_postings(SegmentMerger *sm, SegmentMergeInfo **matches,
                              const int match_size)
{
    const int skip_interval = sm->config->skip_interval;
    SkipBuffer *skip_buf = sm->skip_buf;
    int prev_doc = 0;
    int doc_freq = 0;               /* number of docs w/ term */
    int m;

    skip_buf_reset(skip_buf);

    for (m = 0; m < match_size; m++) {
        SegmentMergeInfo *smi = matches[m];
        const int base = smi->base;
        int *doc_map = smi->doc_map;
        FrtTermDocEnum *tde = smi->tde;

        stpe_seek_ti(STDE(tde), &smi->te->curr_ti);

        /* positions are copied wholesale below, so iterate with stde_next
         * rather than stpe_next */
        while (stde_next(tde)) {
            int doc = stde_doc_num(tde);
            int doc_code;
            int freq;

            if (doc_map != NULL) {
                doc = doc_map[doc];     /* work around deletions */
            }
            doc += base;                /* convert to merged space */
            assert(doc == 0 || doc > prev_doc);

            doc_freq++;
            if ((doc_freq % skip_interval) == 0) {
                skip_buf_add(skip_buf, prev_doc);
            }

            doc_code = (doc - prev_doc) << 1;   /* low bit flags freq == 1 */
            prev_doc = doc;

            freq = stde_freq(tde);
            if (freq != 1) {
                frt_os_write_vint(sm->frq_out, doc_code);       /* write doc */
                frt_os_write_vint(sm->frq_out, freq);           /* write frequency in doc */
            }
            else {
                frt_os_write_vint(sm->frq_out, doc_code | 1);   /* doc & freq=1 */
            }

            /* copy position deltas */
            frt_is2os_copy_vints(STDE(tde)->prx_in, sm->prx_out, freq);
        }
    }

    return doc_freq;
}
|
5664
|
-
|
5665
|
-
/*
 * Copy `term` (term_len bytes plus one more, presumably the terminating NUL)
 * into the merger's term buffer and return a pointer to the copy.
 *
 * The write offset advances past the copy and wraps back to 0 once it has
 * gone beyond term_buf_size, so older copies are eventually overwritten.
 */
static char *sm_cache_term(SegmentMerger *sm, char *term, int term_len)
{
    const int copy_len = term_len + 1;
    char *slot = sm->term_buf + sm->term_buf_ptr;

    memcpy(slot, term, copy_len);

    sm->term_buf_ptr += copy_len;
    if (sm->term_buf_ptr > sm->term_buf_size) {
        sm->term_buf_ptr = 0;
    }
    return slot;
}
|
5674
|
-
|
5675
|
-
/*
 * Merge one term that occurs in each of the `match_size` segments in
 * `matches`: append its postings, write the skip data, and, if any posting
 * was written, add the term to the term dictionary.
 */
static void sm_merge_term_info(SegmentMerger *sm, SegmentMergeInfo **matches,
                               int match_size)
{
    /* positions in the output files before this term's data is appended */
    const frt_off_t frq_start = frt_os_pos(sm->frq_out);
    const frt_off_t prx_start = frt_os_pos(sm->prx_out);

    const int doc_freq = sm_append_postings(sm, matches, match_size);

    /* the skip data is written even when no posting survived */
    const frt_off_t skip_start = skip_buf_write(sm->skip_buf);

    if (doc_freq <= 0) {
        return;
    }

    /* add an entry to the dictionary with ptrs to prox and freq files */
    {
        SegmentMergeInfo *first_match = matches[0];
        int term_len = first_match->te->curr_term_len;
        char *cached_term;

        frt_ti_set(sm->ti, doc_freq, frq_start, prx_start,
                   (skip_start - frq_start));
        cached_term = sm_cache_term(sm, first_match->term, term_len);
        frt_tiw_add(sm->tiw, cached_term, term_len, &sm->ti);
    }
}
|
5696
|
-
|
5697
|
-
/*
 * Merge the term dictionaries of all input segments, one field at a time.
 *
 * For each field, every segment that has a term for it is pushed onto the
 * priority queue. The queue is then drained: all segments positioned on the
 * same term are popped together, merged by sm_merge_term_info, advanced to
 * their next term, and pushed back if they still have one.
 */
static void sm_merge_term_infos(SegmentMerger *sm)
{
    const int seg_cnt = sm->seg_cnt;
    const int fis_size = sm->fis->size;
    SegmentMergeInfo **matches = FRT_ALLOC_N(SegmentMergeInfo *, seg_cnt);
    int field_num, seg;

    for (seg = 0; seg < seg_cnt; seg++) {
        smi_load_term_input(sm->smis[seg]);
    }

    for (field_num = 0; field_num < fis_size; field_num++) {
        frt_tiw_start_field(sm->tiw, field_num);

        /* initialize the queue with every segment that has terms here */
        for (seg = 0; seg < seg_cnt; seg++) {
            SegmentMergeInfo *smi = sm->smis[seg];
            ste_set_field(smi->te, field_num);
            if (smi_next(smi) != NULL) {
                frt_pq_push(sm->queue, smi);
            }
        }

        while (sm->queue->size > 0) {
            SegmentMergeInfo *top;
            char *term;
            int match_size = 0;
            int k;

            /* pop matching terms */
            matches[match_size++] = (SegmentMergeInfo *)frt_pq_pop(sm->queue);
            term = matches[0]->term;
            for (top = (SegmentMergeInfo *)frt_pq_top(sm->queue);
                 (top != NULL) && (strcmp(term, top->term) == 0);
                 top = (SegmentMergeInfo *)frt_pq_top(sm->queue)) {
                matches[match_size++] = (SegmentMergeInfo *)frt_pq_pop(sm->queue);
            }

            sm_merge_term_info(sm, matches, match_size); /* add new FrtTermInfo */

            /* restore queue, last popped first */
            for (k = match_size - 1; k >= 0; k--) {
                SegmentMergeInfo *smi = matches[k];
                if (smi_next(smi) != NULL) {
                    frt_pq_push(sm->queue, smi);
                }
            }
        }
    }

    free(matches);
    for (seg = 0; seg < seg_cnt; seg++) {
        smi_close_term_input(sm->smis[seg]);
    }
}
|
5748
|
-
|
5749
|
-
/*
 * Write the merged term dictionary together with the ".frq" and ".prx"
 * posting files of the new segment.
 *
 * Everything the term merge needs is created here (output streams, term
 * infos writer, skip buffer, term buffer, priority queue) and released
 * again before returning.
 */
static void sm_merge_terms(SegmentMerger *sm)
{
    char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];

    /* snprintf bounds every write to file_name, whatever the segment name. */
    snprintf(file_name, sizeof(file_name), "%s.frq", sm->si->name);
    sm->frq_out = sm->store->new_output(sm->store, file_name);
    snprintf(file_name, sizeof(file_name), "%s.prx", sm->si->name);
    sm->prx_out = sm->store->new_output(sm->store, file_name);

    sm->tiw = frt_tiw_open(sm->store, sm->si->name, sm->config->index_interval,
                           sm->config->skip_interval);
    sm->skip_buf = skip_buf_new(sm->frq_out, sm->prx_out);

    /* term_buf holds copies of recently written terms, since the
     * FrtTermInfosWriter needs to keep the last index_interval terms so that
     * it can compare the last term put in the index with the next one.
     * sm_cache_term wraps only after the write offset has passed
     * term_buf_size, so one extra FRT_MAX_WORD_SIZE of slack is allocated:
     * the buffer has room for index_interval + 2 maximum-size terms. */
    sm->term_buf_ptr = 0;
    sm->term_buf_size = (sm->config->index_interval + 1) * FRT_MAX_WORD_SIZE;
    sm->term_buf = FRT_ALLOC_N(char, sm->term_buf_size + FRT_MAX_WORD_SIZE);

    sm->queue = frt_pq_new(sm->seg_cnt, (frt_lt_ft)&smi_lt, NULL);

    sm_merge_term_infos(sm);

    frt_os_close(sm->frq_out);
    frt_os_close(sm->prx_out);
    frt_tiw_close(sm->tiw);
    frt_pq_destroy(sm->queue);
    skip_buf_destroy(sm->skip_buf);
    free(sm->term_buf);
}
|
5781
|
-
|
5782
|
-
/*
 * Write one merged norms file for every field that has norms.
 *
 * For each such field, the norm bytes of every input segment are
 * concatenated in segment order, dropping the bytes of deleted documents.
 * A segment for which si_norm_file_name reports no norms file contributes
 * one zero byte per document in its doc_cnt instead.
 */
static void sm_merge_norms(SegmentMerger *sm)
{
    char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
    const int seg_cnt = sm->seg_cnt;
    int field_num;

    for (field_num = sm->fis->size - 1; field_num >= 0; field_num--) {
        FrtFieldInfo *fi = sm->fis->fields[field_num];
        FrtOutStream *norms_out;
        int seg;

        if (!bits_has_norms(fi->bits)) {
            continue;
        }

        frt_si_advance_norm_gen(sm->si, field_num);
        si_norm_file_name(sm->si, file_name, field_num);
        norms_out = sm->store->new_output(sm->store, file_name);

        for (seg = 0; seg < seg_cnt; seg++) {
            SegmentMergeInfo *smi = sm->smis[seg];
            FrtBitVector *deleted_docs;
            FrtStore *in_store;
            FrtInStream *norms_in;
            int max_doc;
            int doc;

            if (!si_norm_file_name(smi->si, file_name, field_num)) {
                /* no norms file for this segment: pad with zero bytes */
                const int doc_cnt = smi->doc_cnt;
                for (doc = 0; doc < doc_cnt; doc++) {
                    frt_os_write_byte(norms_out, '\0');
                }
                continue;
            }

            max_doc = smi->max_doc;
            deleted_docs = smi->deleted_docs;
            in_store = smi->store;
            norms_in = in_store->open_input(in_store, file_name);

            if (!deleted_docs) {
                /* nothing deleted: copy the whole run of norm bytes */
                frt_is2os_copy_bytes(norms_in, norms_out, max_doc);
            }
            else {
                for (doc = 0; doc < max_doc; doc++) {
                    frt_uchar norm = frt_is_read_byte(norms_in);
                    if (!frt_bv_get(deleted_docs, doc)) {
                        frt_os_write_byte(norms_out, norm);
                    }
                }
            }
            frt_is_close(norms_in);
        }

        frt_os_close(norms_out);
    }
}
|
5833
|
-
|
5834
|
-
/*
 * Run the whole merge: stored fields first, then terms and postings, then
 * norms. Returns the document count accumulated in sm_create.
 */
static int sm_merge(SegmentMerger *sm)
{
    sm_merge_fields(sm);    /* stored field data and index */
    sm_merge_terms(sm);     /* term dictionary, frequencies, positions */
    sm_merge_norms(sm);     /* per-field norms */

    return sm->doc_cnt;
}
|
5841
|
-
|
5842
|
-
|
5843
|
-
/****************************************************************************
|
5844
|
-
* IndexWriter
|
5845
|
-
****************************************************************************/
|
5846
4868
|
|
5847
4869
|
/* prepare an index ready for writing */
|
5848
4870
|
void frt_index_create(FrtStore *store, FrtFieldInfos *fis)
|
@@ -5874,68 +4896,6 @@ int frt_iw_doc_count(FrtIndexWriter *iw)
|
|
5874
4896
|
return doc_cnt;
|
5875
4897
|
}
|
5876
4898
|
|
5877
|
-
/*
 * Merge the segments sis->segs[min_seg .. max_seg - 1] into one new segment.
 *
 * The new segment is registered first and filled by the merger. Then, under
 * the store mutex, the files of the merged segments are deleted, the
 * segments are removed from the list, the segment infos are written out and
 * pending deletions are committed.
 */
static void iw_merge_segments(FrtIndexWriter *iw, const int min_seg, const int max_seg) {
    FrtSegmentInfos *sis = iw->sis;
    FrtSegmentInfo *merged_si = frt_sis_new_segment(sis, 0, iw->store);
    SegmentMerger *merger = sm_create(iw, merged_si, &sis->segs[min_seg], max_seg - min_seg);
    int seg;

    /* This is where all the action happens. */
    merged_si->doc_cnt = sm_merge(merger);

    pthread_mutex_lock(&iw->store->mutex);

    /* delete merged segments */
    seg = min_seg;
    while (seg < max_seg) {
        si_delete_files(sis->segs[seg], iw->fis, iw->deleter);
        seg++;
    }
    frt_sis_del_from_to(sis, min_seg, max_seg);

    frt_sis_write(sis, iw->store, iw->deleter);
    deleter_commit_pending_deletions(iw->deleter);

    pthread_mutex_unlock(&iw->store->mutex);

    sm_destroy(merger);
}
|
5902
|
-
|
5903
|
-
/*
 * Merge every segment from index `min_segment` to the end of the writer's
 * segment list.
 */
static void iw_merge_segments_from(FrtIndexWriter *iw, int min_segment)
{
    const int end_segment = iw->sis->size;

    iw_merge_segments(iw, min_segment, end_segment);
}
|
5907
|
-
|
5908
|
-
/*
 * Merge trailing segments whenever enough small ones have piled up.
 *
 * The target size starts at merge_factor and is multiplied by merge_factor
 * each round, up to max_merge_docs. In each round the trailing run of
 * segments smaller than the target is measured; if together they hold at
 * least the target number of documents they are merged into one segment.
 */
static void iw_maybe_merge_segments(FrtIndexWriter *iw)
{
    const int merge_factor = iw->config.merge_factor;
    int target_merge_docs = merge_factor;
    int min_segment, merge_docs;
    FrtSegmentInfo *si;

    /* A merge_factor of 1 never grows the target, so the loop below could
     * spin forever without merging anything; values below 1 never entered
     * the loop in the first place. */
    if (merge_factor < 2) {
        return;
    }

    while (target_merge_docs > 0
           && target_merge_docs <= iw->config.max_merge_docs) {
        /* find segments smaller than current target size */
        min_segment = iw->sis->size - 1;
        merge_docs = 0;
        while (min_segment >= 0) {
            si = iw->sis->segs[min_segment];
            if (si->doc_cnt >= target_merge_docs) {
                break;
            }
            merge_docs += si->doc_cnt;
            min_segment--;
        }

        if (merge_docs >= target_merge_docs) { /* found a merge to do */
            iw_merge_segments_from(iw, min_segment + 1);
        }
        else if (min_segment <= 0) {
            break;
        }

        /* The next target would exceed INT_MAX, and therefore also
         * max_merge_docs; stop here instead of overflowing a signed int. */
        if (target_merge_docs > INT_MAX / merge_factor) {
            break;
        }
        target_merge_docs *= merge_factor;
    }
}
|
5938
|
-
|
5939
4899
|
static void iw_flush_ram_segment(FrtIndexWriter *iw) {
|
5940
4900
|
FrtSegmentInfos *sis = iw->sis;
|
5941
4901
|
FrtSegmentInfo *si;
|
@@ -5947,7 +4907,6 @@ static void iw_flush_ram_segment(FrtIndexWriter *iw) {
|
|
5947
4907
|
frt_sis_write(iw->sis, iw->store, iw->deleter);
|
5948
4908
|
deleter_commit_pending_deletions(iw->deleter);
|
5949
4909
|
pthread_mutex_unlock(&iw->store->mutex);
|
5950
|
-
iw_maybe_merge_segments(iw);
|
5951
4910
|
}
|
5952
4911
|
|
5953
4912
|
void frt_iw_add_doc(FrtIndexWriter *iw, FrtDocument *doc)
|
@@ -6051,26 +5010,6 @@ void frt_iw_delete_terms(FrtIndexWriter *iw, ID field, char **terms, const int t
|
|
6051
5010
|
}
|
6052
5011
|
}
|
6053
5012
|
|
6054
|
-
/*
 * Commit the writer, then keep merging the last merge_factor segments until
 * a single segment remains that has no deletions and lives in the writer's
 * own store. The caller (see frt_iw_optimize) holds iw->mutex.
 */
static void iw_optimize_i(FrtIndexWriter *iw)
{
    iw_commit_i(iw);

    for (;;) {
        const int seg_cnt = iw->sis->size;
        int first_segment;

        if (seg_cnt < 1) {
            break;                      /* nothing to optimize */
        }
        if (seg_cnt == 1
            && !frt_si_has_deletions(iw->sis->segs[0])
            && iw->sis->segs[0]->store == iw->store) {
            break;                      /* already fully optimized */
        }

        first_segment = seg_cnt - iw->config.merge_factor;
        if (first_segment < 0) {
            first_segment = 0;
        }
        iw_merge_segments_from(iw, first_segment);
    }
}
|
6066
|
-
|
6067
|
-
/*
 * Public entry point for optimizing the index: runs iw_optimize_i while
 * holding the writer's mutex.
 */
void frt_iw_optimize(FrtIndexWriter *iw)
{
    pthread_mutex_t *const writer_lock = &iw->mutex;

    pthread_mutex_lock(writer_lock);
    iw_optimize_i(iw);
    pthread_mutex_unlock(writer_lock);
}
|
6073
|
-
|
6074
5013
|
void frt_iw_close(FrtIndexWriter *iw)
|
6075
5014
|
{
|
6076
5015
|
pthread_mutex_lock(&iw->mutex);
|
@@ -6398,7 +5337,6 @@ static void iw_add_segments(FrtIndexWriter *iw, FrtIndexReader *ir) {
|
|
6398
5337
|
void frt_iw_add_readers(FrtIndexWriter *iw, FrtIndexReader **readers, const int r_cnt) {
|
6399
5338
|
int i;
|
6400
5339
|
pthread_mutex_lock(&iw->mutex);
|
6401
|
-
iw_optimize_i(iw);
|
6402
5340
|
|
6403
5341
|
for (i = 0; i < r_cnt; i++) {
|
6404
5342
|
iw_add_segments(iw, readers[i]);
|
@@ -6410,6 +5348,5 @@ void frt_iw_add_readers(FrtIndexWriter *iw, FrtIndexReader **readers, const int
|
|
6410
5348
|
frt_sis_write(iw->sis, iw->store, iw->deleter);
|
6411
5349
|
pthread_mutex_unlock(&iw->store->mutex);
|
6412
5350
|
|
6413
|
-
iw_optimize_i(iw);
|
6414
5351
|
pthread_mutex_unlock(&iw->mutex);
|
6415
5352
|
}
|