isomorfeus-ferret 0.17.1 → 0.17.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,6 @@
1
1
  #include "frt_global.h"
2
+ #include "frt_lazy_doc_field.h"
3
+ #include "frt_lazy_doc.h"
2
4
  #include "frt_index.h"
3
5
  #include "frt_similarity.h"
4
6
  #include "frt_helper.h"
@@ -6,13 +8,6 @@
6
8
  #include <string.h>
7
9
  #include <limits.h>
8
10
  #include <ctype.h>
9
- #include "brotli_decode.h"
10
- #include "brotli_encode.h"
11
- #include "bzlib.h"
12
- #include "lz4frame.h"
13
-
14
- // #undef close
15
- // #undef read
16
11
 
17
12
  extern rb_encoding *utf8_encoding;
18
13
  extern void frt_micro_sleep(const int micro_seconds);
@@ -46,9 +41,6 @@ static char *ste_next(FrtTermEnum *te);
46
41
  #define FORMAT 15
47
42
  #define SEGMENTS_GEN_FILE_NAME "segments"
48
43
  #define MAX_EXT_LEN 10
49
- #define FRT_COMPRESSION_BUFFER_SIZE 16348
50
- #define FRT_BROTLI_COMPRESSION_LEVEL 4
51
- #define FRT_BZIP_COMPRESSION_LEVEL 9
52
44
 
53
45
  /* *** Must be three characters *** */
54
46
  static const char *INDEX_EXTENSIONS[] = {
@@ -590,29 +582,6 @@ static char *si_norm_file_name(FrtSegmentInfo *si, char *buf, int field_num)
590
582
 
591
583
  void frt_deleter_queue_file(FrtDeleter *dlr, const char *file_name);
592
584
 
593
- static void si_delete_files(FrtSegmentInfo *si, FrtFieldInfos *fis, FrtDeleter *dlr)
594
- {
595
- int i;
596
- char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
597
- size_t seg_len = strlen(si->name);
598
- char *ext;
599
-
600
- for (i = si->norm_gens_size - 1; i >= 0; i--) {
601
- if (0 <= si->norm_gens[i]) {
602
- frt_deleter_queue_file(dlr, si_norm_file_name(si, file_name, fis->fields[i]->number));
603
- }
604
- }
605
-
606
- memcpy(file_name, si->name, seg_len);
607
- file_name[seg_len] = '.';
608
- ext = file_name + seg_len + 1;
609
-
610
- for (i = FRT_NELEMS(INDEX_EXTENSIONS) - 1; i >= 0; i--) {
611
- memcpy(ext, INDEX_EXTENSIONS[i], 4);
612
- frt_deleter_queue_file(dlr, file_name);
613
- }
614
- }
615
-
616
585
  /****************************************************************************
617
586
  *
618
587
  * SegmentInfos
@@ -1044,355 +1013,6 @@ frt_u64 frt_sis_read_current_version(FrtStore *store)
1044
1013
  return fsf.ret.uint64;
1045
1014
  }
1046
1015
 
1047
- /****************************************************************************
1048
- *
1049
- * LazyDocField
1050
- *
1051
- ****************************************************************************/
1052
-
1053
- static FrtLazyDocField *lazy_df_new(ID name, const int size, FrtCompressionType compression) {
1054
- FrtLazyDocField *self = FRT_ALLOC(FrtLazyDocField);
1055
- self->name = name;
1056
- self->size = size;
1057
- self->data = FRT_ALLOC_AND_ZERO_N(FrtLazyDocFieldData, size);
1058
- self->compression = compression;
1059
- self->decompressed = false;
1060
- self->loaded = false;
1061
- return self;
1062
- }
1063
-
1064
- static void lazy_df_destroy(FrtLazyDocField *self) {
1065
- int i;
1066
- for (i = self->size - 1; i >= 0; i--) {
1067
- if (self->data[i].text) {
1068
- free(self->data[i].text);
1069
- }
1070
- }
1071
- free(self->data);
1072
- free(self);
1073
- }
1074
-
1075
- static void comp_raise(void) {
1076
- FRT_RAISE(EXCEPTION, "Compression error");
1077
- }
1078
-
1079
- static char *is_read_brotli_compressed_bytes(FrtInStream *is, int compressed_len, int *len) {
1080
- int buf_out_idx = 0;
1081
- int read_len;
1082
- frt_uchar buf_in[FRT_COMPRESSION_BUFFER_SIZE];
1083
- const frt_uchar *next_in;
1084
- size_t available_in;
1085
- frt_uchar *buf_out = NULL;
1086
- frt_uchar *next_out;
1087
- size_t available_out;
1088
-
1089
- BrotliDecoderState *b_state = BrotliDecoderCreateInstance(NULL, NULL, NULL);
1090
- BrotliDecoderResult b_result = BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT;
1091
- if (!b_state) { comp_raise(); return NULL; }
1092
-
1093
- do {
1094
- read_len = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
1095
- frt_is_read_bytes(is, buf_in, read_len);
1096
- compressed_len -= read_len;
1097
- available_in = read_len;
1098
- next_in = buf_in;
1099
- available_out = FRT_COMPRESSION_BUFFER_SIZE;
1100
- do {
1101
- FRT_REALLOC_N(buf_out, frt_uchar, buf_out_idx + FRT_COMPRESSION_BUFFER_SIZE);
1102
- next_out = buf_out + buf_out_idx;
1103
- b_result = BrotliDecoderDecompressStream(b_state,
1104
- &available_in, &next_in,
1105
- &available_out, &next_out, NULL);
1106
- if (b_result == BROTLI_DECODER_RESULT_ERROR) { comp_raise(); return NULL; }
1107
- buf_out_idx += FRT_COMPRESSION_BUFFER_SIZE - available_out;
1108
- } while (b_result == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT);
1109
- } while (b_result != BROTLI_DECODER_RESULT_SUCCESS && compressed_len > 0);
1110
-
1111
- BrotliDecoderDestroyInstance(b_state);
1112
-
1113
- FRT_REALLOC_N(buf_out, frt_uchar, buf_out_idx + 1);
1114
- buf_out[buf_out_idx] = '\0';
1115
- *len = buf_out_idx;
1116
- return (char *)buf_out;
1117
- }
1118
-
1119
- static void zraise(int ret) {
1120
- switch (ret) {
1121
- case BZ_IO_ERROR:
1122
- if (ferror(stdin))
1123
- FRT_RAISE(FRT_IO_ERROR, "bzlib: error reading stdin");
1124
- if (ferror(stdout))
1125
- FRT_RAISE(FRT_IO_ERROR, "bzlib: error writing stdout");
1126
- break;
1127
- case BZ_CONFIG_ERROR:
1128
- FRT_RAISE(FRT_IO_ERROR, "bzlib: system configuration error");
1129
- break;
1130
- case BZ_SEQUENCE_ERROR: /* shouldn't occur if code is correct */
1131
- FRT_RAISE(FRT_IO_ERROR, "bzlib: !!BUG!! sequence error");
1132
- break;
1133
- case BZ_PARAM_ERROR: /* shouldn't occur if code is correct */
1134
- FRT_RAISE(FRT_IO_ERROR, "bzlib: !!BUG!! parameter error");
1135
- break;
1136
- case BZ_MEM_ERROR:
1137
- FRT_RAISE(FRT_IO_ERROR, "bzlib: memory error");
1138
- break;
1139
- case BZ_DATA_ERROR:
1140
- FRT_RAISE(FRT_IO_ERROR, "bzlib: data integrity check error");
1141
- break;
1142
- case BZ_DATA_ERROR_MAGIC:
1143
- FRT_RAISE(FRT_IO_ERROR, "bzlib: data integrity check - non-matching magic");
1144
- break;
1145
- case BZ_UNEXPECTED_EOF:
1146
- FRT_RAISE(FRT_IO_ERROR, "bzlib: unexpected end-of-file");
1147
- break;
1148
- case BZ_OUTBUFF_FULL:
1149
- FRT_RAISE(FRT_IO_ERROR, "bzlib: output buffer full");
1150
- break;
1151
- default:
1152
- FRT_RAISE(FRT_EXCEPTION, "bzlib: unknown error");
1153
- }
1154
- }
1155
-
1156
- static char *is_read_bz2_compressed_bytes(FrtInStream *is, int compressed_len, int *len) {
1157
- int buf_out_idx = 0, ret, read_len;
1158
- char *buf_out = NULL;
1159
- char buf_in[FRT_COMPRESSION_BUFFER_SIZE];
1160
- bz_stream zstrm;
1161
- zstrm.bzalloc = NULL;
1162
- zstrm.bzfree = NULL;
1163
- zstrm.opaque = NULL;
1164
- zstrm.next_in = NULL;
1165
- zstrm.avail_in = 0;
1166
- if ((ret = BZ2_bzDecompressInit(&zstrm, 0, 0)) != BZ_OK) zraise(ret);
1167
-
1168
- do {
1169
- read_len = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
1170
- frt_is_read_bytes(is, (frt_uchar *)buf_in, read_len);
1171
- compressed_len -= read_len;
1172
- zstrm.avail_in = read_len;
1173
- zstrm.next_in = buf_in;
1174
- zstrm.avail_out = FRT_COMPRESSION_BUFFER_SIZE;
1175
-
1176
- do {
1177
- REALLOC_N(buf_out, char, buf_out_idx + FRT_COMPRESSION_BUFFER_SIZE);
1178
- zstrm.next_out = buf_out + buf_out_idx;
1179
- ret = BZ2_bzDecompress(&zstrm);
1180
- assert(ret != BZ_SEQUENCE_ERROR); /* state not clobbered */
1181
- if (ret != BZ_OK && ret != BZ_STREAM_END) {
1182
- (void)BZ2_bzDecompressEnd(&zstrm);
1183
- zraise(ret);
1184
- }
1185
- buf_out_idx += FRT_COMPRESSION_BUFFER_SIZE - zstrm.avail_out;
1186
- } while (zstrm.avail_out == 0);
1187
- } while (ret != BZ_STREAM_END && compressed_len != 0);
1188
-
1189
- (void)BZ2_bzDecompressEnd(&zstrm);
1190
-
1191
- FRT_REALLOC_N(buf_out, char, buf_out_idx + 1);
1192
- buf_out[buf_out_idx] = '\0';
1193
-
1194
- *len = buf_out_idx;
1195
- return (char *)buf_out;
1196
- }
1197
-
1198
- static char *is_read_lz4_compressed_bytes(FrtInStream *is, int compressed_len, int *length) {
1199
- frt_uchar buf_in[FRT_COMPRESSION_BUFFER_SIZE];
1200
- char *buf_out = NULL;
1201
- int dc_length = 0;
1202
- LZ4F_dctx *dctx;
1203
- LZ4F_frameInfo_t frame_info;
1204
- LZ4F_errorCode_t dctx_status = LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION);
1205
- if (LZ4F_isError(dctx_status)) { *length = -1; return NULL; }
1206
-
1207
- /* header and buffer */
1208
- int read_length = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
1209
- frt_is_read_bytes(is, buf_in, read_length);
1210
- compressed_len -= read_length;
1211
-
1212
- size_t consumed_size = read_length;
1213
- size_t res = LZ4F_getFrameInfo(dctx, &frame_info, buf_in, &consumed_size);
1214
- if (LZ4F_isError(res)) { *length = -1; return NULL; }
1215
- size_t buf_out_length;
1216
- switch(frame_info.blockSizeID) {
1217
- case LZ4F_default:
1218
- case LZ4F_max64KB:
1219
- buf_out_length = 1 << 16;
1220
- break;
1221
- case LZ4F_max256KB:
1222
- buf_out_length = 1 << 18;
1223
- break;
1224
- case LZ4F_max1MB:
1225
- buf_out_length = 1 << 20;
1226
- break;
1227
- case LZ4F_max4MB:
1228
- buf_out_length = 1 << 22;
1229
- break;
1230
- default:
1231
- buf_out_length = 0;
1232
- }
1233
-
1234
- res = 1;
1235
- int first_chunk = 1;
1236
-
1237
- /* decompress data */
1238
- while (res != 0) {
1239
- if (!first_chunk) {
1240
- read_length = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
1241
- frt_is_read_bytes(is, buf_in, read_length);
1242
- compressed_len -= read_length;
1243
- consumed_size = 0;
1244
- }
1245
- first_chunk = 0;
1246
-
1247
- char *src = (char *)(buf_in + consumed_size);
1248
- char *src_end = (char *)buf_in + read_length;
1249
-
1250
- while (src < src_end && res != 0){
1251
- size_t dest_length = buf_out_length;
1252
- size_t consumed_size = read_length;
1253
- FRT_REALLOC_N(buf_out, char, dc_length + buf_out_length);
1254
- res = LZ4F_decompress(dctx, buf_out + dc_length, &dest_length, src, &consumed_size, NULL);
1255
- if (LZ4F_isError(res)) { *length = -1; return NULL; }
1256
- dc_length += dest_length;
1257
- src = src + consumed_size;
1258
- }
1259
- }
1260
-
1261
- /* finish up */
1262
- LZ4F_freeDecompressionContext(dctx);
1263
-
1264
- FRT_REALLOC_N(buf_out, char, dc_length + 1);
1265
- buf_out[dc_length] = '\0';
1266
-
1267
- *length = dc_length;
1268
- return buf_out;
1269
- }
1270
-
1271
- static char *is_read_compressed_bytes(FrtInStream *is, int compressed_len, int *len, FrtCompressionType compression) {
1272
- switch (compression) {
1273
- case FRT_COMPRESSION_BROTLI:
1274
- return is_read_brotli_compressed_bytes(is, compressed_len, len);
1275
- case FRT_COMPRESSION_BZ2:
1276
- return is_read_bz2_compressed_bytes(is, compressed_len, len);
1277
- case FRT_COMPRESSION_LZ4:
1278
- return is_read_lz4_compressed_bytes(is, compressed_len, len);
1279
- default:
1280
- return NULL;
1281
- }
1282
- }
1283
-
1284
- char *frt_lazy_df_get_data(FrtLazyDocField *self, int i) {
1285
- char *text = NULL;
1286
- if (i < self->size && i >= 0) {
1287
- text = self->data[i].text;
1288
- if (NULL == text) {
1289
- const int read_len = self->data[i].length + 1;
1290
- frt_is_seek(self->doc->fields_in, self->data[i].start);
1291
- if (self->data[i].compression != FRT_COMPRESSION_NONE) {
1292
- self->data[i].text = text = is_read_compressed_bytes(self->doc->fields_in, read_len, &(self->data[i].length), self->data[i].compression);
1293
- } else {
1294
- self->data[i].text = text = FRT_ALLOC_N(char, read_len);
1295
- frt_is_read_bytes(self->doc->fields_in, (frt_uchar *)text, read_len);
1296
- text[read_len - 1] = '\0';
1297
- }
1298
- self->loaded = true;
1299
- }
1300
- }
1301
-
1302
- return text;
1303
- }
1304
-
1305
- void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len) {
1306
- if (self->compression != FRT_COMPRESSION_NONE && !self->decompressed) {
1307
- int i;
1308
- self->len = 0;
1309
- for (i = self->size-1; i >= 0; i--) {
1310
- (void)frt_lazy_df_get_data(self, i);
1311
- self->len += self->data[i].length + 1;
1312
- }
1313
- self->len--; /* each field separated by ' ' but no need to add to end */
1314
- self->decompressed = true;
1315
- }
1316
- if (start < 0 || start >= self->len) {
1317
- FRT_RAISE(FRT_IO_ERROR, "start out of range in LazyDocField#get_bytes. %d "
1318
- "is not between 0 and %d", start, self->len);
1319
- }
1320
- if (len <= 0) {
1321
- FRT_RAISE(FRT_IO_ERROR, "len = %d, but should be greater than 0", len);
1322
- }
1323
- if (start + len > self->len) {
1324
- FRT_RAISE(FRT_IO_ERROR, "Tried to read past end of field. Field is only %d "
1325
- "bytes long but tried to read to %d", self->len, start + len);
1326
- }
1327
- if (self->compression != FRT_COMPRESSION_NONE) {
1328
- int cur_start = 0, buf_start = 0, cur_end, i, copy_start, copy_len;
1329
- for (i = 0; i < self->size; i++) {
1330
- cur_end = cur_start + self->data[i].length;
1331
- if (start < cur_end) {
1332
- copy_start = start > cur_start ? start - cur_start : 0;
1333
- copy_len = cur_end - cur_start - copy_start;
1334
- if (copy_len >= len) {
1335
- copy_len = len;
1336
- len = 0;
1337
- }
1338
- else {
1339
- len -= copy_len;
1340
- }
1341
- memcpy(buf + buf_start,
1342
- self->data[i].text + copy_start,
1343
- copy_len);
1344
- buf_start += copy_len;
1345
- if (len > 0) {
1346
- buf[buf_start++] = ' ';
1347
- len--;
1348
- }
1349
- if (len == 0) break;
1350
- }
1351
- cur_start = cur_end + 1;
1352
- }
1353
- } else {
1354
- frt_is_seek(self->doc->fields_in, self->data[0].start + start);
1355
- frt_is_read_bytes(self->doc->fields_in, (frt_uchar *)buf, len);
1356
- }
1357
- }
1358
-
1359
- /****************************************************************************
1360
- *
1361
- * LazyDoc
1362
- *
1363
- ****************************************************************************/
1364
-
1365
- static FrtLazyDoc *lazy_doc_new(int size, FrtInStream *fdt_in)
1366
- {
1367
- FrtLazyDoc *self = FRT_ALLOC(FrtLazyDoc);
1368
- self->field_dictionary = frt_h_new_ptr((frt_free_ft)&lazy_df_destroy);
1369
- self->size = size;
1370
- self->fields = FRT_ALLOC_AND_ZERO_N(FrtLazyDocField *, size);
1371
- self->fields_in = frt_is_clone(fdt_in);
1372
- self->loaded = false;
1373
- return self;
1374
- }
1375
-
1376
- void frt_lazy_doc_close(FrtLazyDoc *self)
1377
- {
1378
- frt_h_destroy(self->field_dictionary);
1379
- frt_is_close(self->fields_in);
1380
- free(self->fields);
1381
- free(self);
1382
- }
1383
-
1384
- static void lazy_doc_add_field(FrtLazyDoc *self, FrtLazyDocField *lazy_df, int i)
1385
- {
1386
- self->fields[i] = lazy_df;
1387
-
1388
- frt_h_set(self->field_dictionary, (void *)lazy_df->name, lazy_df);
1389
- lazy_df->doc = self;
1390
- }
1391
-
1392
- FrtLazyDocField *frt_lazy_doc_get(FrtLazyDoc *self, ID field) {
1393
- return (FrtLazyDocField *)frt_h_get(self->field_dictionary, (void *)field);
1394
- }
1395
-
1396
1016
  /****************************************************************************
1397
1017
  * FrtFieldsReader
1398
1018
  ****************************************************************************/
@@ -1457,7 +1077,7 @@ static void frt_fr_read_compressed_fields(FrtFieldsReader *fr, FrtDocField *df,
1457
1077
 
1458
1078
  for (i = 0; i < df_size; i++) {
1459
1079
  const int compressed_len = df->lengths[i] + 1;
1460
- df->data[i] = is_read_compressed_bytes(fdt_in, compressed_len, &(df->lengths[i]), compression);
1080
+ df->data[i] = frt_is_read_compressed_bytes(fdt_in, compressed_len, &(df->lengths[i]), compression);
1461
1081
  }
1462
1082
  }
1463
1083
 
@@ -1522,11 +1142,11 @@ FrtLazyDoc *frt_fr_get_lazy_doc(FrtFieldsReader *fr, int doc_num)
1522
1142
  frt_is_seek(fdt_in, pos);
1523
1143
  stored_cnt = frt_is_read_vint(fdt_in);
1524
1144
 
1525
- lazy_doc = lazy_doc_new(stored_cnt, fdt_in);
1145
+ lazy_doc = frt_lazy_doc_new(stored_cnt, fdt_in);
1526
1146
  for (i = 0; i < stored_cnt; i++) {
1527
1147
  FrtFieldInfo *fi = fr->fis->fields[frt_is_read_vint(fdt_in)];
1528
1148
  const int df_size = frt_is_read_vint(fdt_in);
1529
- FrtLazyDocField *lazy_df = lazy_df_new(fi->name, df_size, frt_fi_get_compression(fi));
1149
+ FrtLazyDocField *lazy_df = frt_lazy_df_new(fi->name, df_size, frt_fi_get_compression(fi));
1530
1150
  const int field_start = start;
1531
1151
  /* get the starts relative positions this time around */
1532
1152
 
@@ -1538,7 +1158,7 @@ FrtLazyDoc *frt_fr_get_lazy_doc(FrtFieldsReader *fr, int doc_num)
1538
1158
  }
1539
1159
 
1540
1160
  lazy_df->len = start - field_start - 1;
1541
- lazy_doc_add_field(lazy_doc, lazy_df, i);
1161
+ frt_lazy_doc_add_field(lazy_doc, lazy_df, i);
1542
1162
  }
1543
1163
  /* correct the starts to their correct absolute positions */
1544
1164
  const frt_off_t abs_start = frt_is_pos(fdt_in);
@@ -1720,145 +1340,6 @@ void frt_fw_close(FrtFieldsWriter *fw) {
1720
1340
  free(fw);
1721
1341
  }
1722
1342
 
1723
- static int frt_os_write_brotli_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length) {
1724
- size_t compressed_length = 0;
1725
- const frt_uchar *next_in = data;
1726
- size_t available_in = length;
1727
- size_t available_out;
1728
- frt_uchar compression_buffer[FRT_COMPRESSION_BUFFER_SIZE];
1729
- frt_uchar *next_out;
1730
- BrotliEncoderState *b_state = BrotliEncoderCreateInstance(NULL, NULL, NULL);
1731
- if (!b_state) { comp_raise(); return -1; }
1732
-
1733
- BrotliEncoderSetParameter(b_state, BROTLI_PARAM_QUALITY, FRT_BROTLI_COMPRESSION_LEVEL);
1734
-
1735
- do {
1736
- available_out = FRT_COMPRESSION_BUFFER_SIZE;
1737
- next_out = compression_buffer;
1738
- if (!BrotliEncoderCompressStream(b_state, BROTLI_OPERATION_FINISH,
1739
- &available_in, &next_in,
1740
- &available_out, &next_out, &compressed_length)) {
1741
- BrotliEncoderDestroyInstance(b_state);
1742
- comp_raise();
1743
- return -1;
1744
- }
1745
- frt_os_write_bytes(out_stream, compression_buffer, FRT_COMPRESSION_BUFFER_SIZE - available_out);
1746
- } while (!BrotliEncoderIsFinished(b_state));
1747
-
1748
- BrotliEncoderDestroyInstance(b_state);
1749
-
1750
- return (int)compressed_length;
1751
- }
1752
-
1753
- static int frt_os_write_bz2_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length) {
1754
- int ret, buf_size, compressed_len = 0;
1755
- char out_buffer[FRT_COMPRESSION_BUFFER_SIZE];
1756
- bz_stream zstrm;
1757
- zstrm.bzalloc = NULL;
1758
- zstrm.bzfree = NULL;
1759
- zstrm.opaque = NULL;
1760
- if ((ret = BZ2_bzCompressInit(&zstrm, FRT_BZIP_COMPRESSION_LEVEL, 0, 0)) != BZ_OK) zraise(ret);
1761
-
1762
- zstrm.avail_in = length;
1763
- zstrm.next_in = (char *)data;
1764
- zstrm.avail_out = FRT_COMPRESSION_BUFFER_SIZE;
1765
- zstrm.next_out = out_buffer;
1766
-
1767
- do {
1768
- ret = BZ2_bzCompress(&zstrm, BZ_FINISH); /* no bad return value */
1769
- assert(ret != BZ_SEQUENCE_ERROR); /* state not clobbered */
1770
- compressed_len += buf_size = FRT_COMPRESSION_BUFFER_SIZE - zstrm.avail_out;
1771
- frt_os_write_bytes(out_stream, (frt_uchar *)out_buffer, buf_size);
1772
- } while (zstrm.avail_out == 0);
1773
- assert(zstrm.avail_in == 0); /* all input will be used */
1774
-
1775
- (void)BZ2_bzCompressEnd(&zstrm);
1776
- return compressed_len;
1777
- }
1778
-
1779
- static const LZ4F_preferences_t lz4_prefs = {
1780
- {
1781
- LZ4F_default,
1782
- LZ4F_blockLinked,
1783
- LZ4F_noContentChecksum,
1784
- LZ4F_frame,
1785
- 0, /* unknown content size */
1786
- 0, /* no dictID */
1787
- LZ4F_noBlockChecksum
1788
- },
1789
- 0,
1790
- 1,
1791
- 1,
1792
- {0,0,0}
1793
- };
1794
-
1795
- static int frt_os_write_lz4_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length) {
1796
- int compressed_length = 0;
1797
- int remaining_length = length;
1798
- size_t ccmp_length = 0;
1799
- LZ4F_compressionContext_t ctx;
1800
- size_t out_buf_length = LZ4F_compressBound(FRT_COMPRESSION_BUFFER_SIZE, &lz4_prefs);
1801
- frt_uchar *out_buf = frt_ecalloc(out_buf_length);
1802
-
1803
- size_t ctx_creation = LZ4F_createCompressionContext(&ctx, LZ4F_VERSION);
1804
- if (LZ4F_isError(ctx_creation)) {
1805
- compressed_length = -1;
1806
- goto finish;
1807
- }
1808
-
1809
- /* create header */
1810
- ccmp_length = LZ4F_compressBegin(ctx, out_buf, out_buf_length, &lz4_prefs);
1811
- if (LZ4F_isError(ccmp_length)) {
1812
- compressed_length = -1;
1813
- goto finish;
1814
- }
1815
- compressed_length = ccmp_length;
1816
- frt_os_write_bytes(out_stream, out_buf, ccmp_length);
1817
-
1818
- /* compress data */
1819
- do {
1820
- int read_length = (FRT_COMPRESSION_BUFFER_SIZE > remaining_length) ? remaining_length : FRT_COMPRESSION_BUFFER_SIZE;
1821
- ccmp_length = LZ4F_compressUpdate(ctx, out_buf, out_buf_length, data + (length - remaining_length), read_length, NULL);
1822
- if (LZ4F_isError(ccmp_length)) {
1823
- compressed_length = -1;
1824
- goto finish;
1825
- }
1826
- frt_os_write_bytes(out_stream, out_buf, ccmp_length);
1827
- compressed_length += ccmp_length;
1828
- remaining_length -= read_length;
1829
- } while (remaining_length > 0);
1830
-
1831
- /* finish up */
1832
- ccmp_length = LZ4F_compressEnd(ctx, out_buf, out_buf_length, NULL);
1833
- if (LZ4F_isError(ccmp_length)) {
1834
- compressed_length = -1;
1835
- goto finish;
1836
- }
1837
-
1838
- frt_os_write_bytes(out_stream, out_buf, ccmp_length);
1839
- compressed_length += ccmp_length;
1840
-
1841
- finish:
1842
- LZ4F_freeCompressionContext(ctx);
1843
- free(out_buf);
1844
-
1845
- return compressed_length;
1846
- }
1847
-
1848
- static int frt_os_write_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length, FrtCompressionType compression) {
1849
- switch (compression) {
1850
- case FRT_COMPRESSION_BROTLI:
1851
- return frt_os_write_brotli_compressed_bytes(out_stream, data, length);
1852
- case FRT_COMPRESSION_BZ2:
1853
- return frt_os_write_bz2_compressed_bytes(out_stream, data, length);
1854
- case FRT_COMPRESSION_LZ4:
1855
- return frt_os_write_lz4_compressed_bytes(out_stream, data, length);
1856
- default:
1857
- return -1;
1858
- }
1859
-
1860
- }
1861
-
1862
1343
  void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc) {
1863
1344
  int i, j, stored_cnt = 0;
1864
1345
  FrtDocField *df;
@@ -5384,465 +4865,6 @@ void frt_dw_add_doc(FrtDocWriter *dw, FrtDocument *doc) {
5384
4865
  * IndexWriter
5385
4866
  *
5386
4867
  ****************************************************************************/
5387
- /****************************************************************************
5388
- * SegmentMergeInfo
5389
- ****************************************************************************/
5390
-
5391
- typedef struct SegmentMergeInfo {
5392
- int base;
5393
- int max_doc;
5394
- int doc_cnt;
5395
- FrtSegmentInfo *si;
5396
- FrtStore *store;
5397
- FrtStore *orig_store;
5398
- FrtBitVector *deleted_docs;
5399
- FrtSegmentFieldIndex *sfi;
5400
- FrtTermEnum *te;
5401
- FrtTermDocEnum *tde;
5402
- char *term;
5403
- int *doc_map;
5404
- FrtInStream *frq_in;
5405
- FrtInStream *prx_in;
5406
- } SegmentMergeInfo;
5407
-
5408
- static bool smi_lt(const SegmentMergeInfo *smi1, const SegmentMergeInfo *smi2)
5409
- {
5410
- int cmpres = strcmp(smi1->term, smi2->term);
5411
- if (0 == cmpres) {
5412
- return smi1->base < smi2->base;
5413
- }
5414
- else {
5415
- return cmpres < 0;
5416
- }
5417
- }
5418
-
5419
- static void smi_load_doc_map(SegmentMergeInfo *smi)
5420
- {
5421
- FrtBitVector *deleted_docs = smi->deleted_docs;
5422
- const int max_doc = smi->max_doc;
5423
- int j = 0, i;
5424
-
5425
- smi->doc_map = FRT_ALLOC_N(int, max_doc);
5426
- for (i = 0; i < max_doc; i++) {
5427
- if (frt_bv_get(deleted_docs, i)) {
5428
- smi->doc_map[i] = -1;
5429
- }
5430
- else {
5431
- smi->doc_map[i] = j++;
5432
- }
5433
- }
5434
- smi->doc_cnt = j;
5435
- }
5436
-
5437
- static SegmentMergeInfo *smi_new(int base, FrtStore *store, FrtSegmentInfo *si)
5438
- {
5439
- SegmentMergeInfo *smi = FRT_ALLOC_AND_ZERO(SegmentMergeInfo);
5440
- char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
5441
- char *segment = si->name;
5442
- smi->base = base;
5443
- smi->si = si;
5444
- smi->orig_store = smi->store = store;
5445
- FRT_REF(smi->orig_store);
5446
-
5447
- sprintf(file_name, "%s.fdx", segment);
5448
- smi->doc_cnt = smi->max_doc
5449
- = smi->store->length(smi->store, file_name) / FIELDS_IDX_PTR_SIZE;
5450
-
5451
- if (si->del_gen >= 0) {
5452
- frt_fn_for_generation(file_name, segment, "del", si->del_gen);
5453
- smi->deleted_docs = bv_read(store, file_name);
5454
- smi_load_doc_map(smi);
5455
- }
5456
- return smi;
5457
- }
5458
-
5459
- static void smi_load_term_input(SegmentMergeInfo *smi)
5460
- {
5461
- FrtStore *store = smi->store;
5462
- char *segment = smi->si->name;
5463
- char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
5464
- smi->sfi = frt_sfi_open(store, segment);
5465
- sprintf(file_name, "%s.tis", segment);
5466
- FrtInStream *is = store->open_input(store, file_name);
5467
- FRT_DEREF(is);
5468
- smi->te = TE(frt_ste_new(is, smi->sfi));
5469
- sprintf(file_name, "%s.frq", segment);
5470
- smi->frq_in = store->open_input(store, file_name);
5471
- sprintf(file_name, "%s.prx", segment);
5472
- smi->prx_in = store->open_input(store, file_name);
5473
- smi->tde = frt_stpe_new(NULL, smi->frq_in, smi->prx_in, smi->deleted_docs,
5474
- STE(smi->te)->skip_interval);
5475
- }
5476
-
5477
- static void smi_close_term_input(SegmentMergeInfo *smi)
5478
- {
5479
- frt_ste_close(smi->te);
5480
- frt_sfi_close(smi->sfi);
5481
- stpe_close(smi->tde);
5482
- frt_is_close(smi->frq_in);
5483
- frt_is_close(smi->prx_in);
5484
- }
5485
-
5486
- static void smi_destroy(SegmentMergeInfo *smi)
5487
- {
5488
- if (smi->store != smi->orig_store) {
5489
- frt_store_close(smi->store);
5490
- }
5491
- frt_store_close(smi->orig_store);
5492
- if (smi->deleted_docs) {
5493
- frt_bv_destroy(smi->deleted_docs);
5494
- free(smi->doc_map);
5495
- }
5496
- free(smi);
5497
- }
5498
-
5499
- static char *smi_next(SegmentMergeInfo *smi)
5500
- {
5501
- return (smi->term = ste_next(smi->te));
5502
- }
5503
-
5504
- /****************************************************************************
5505
- * SegmentMerger
5506
- ****************************************************************************/
5507
-
5508
- typedef struct SegmentMerger {
5509
- FrtTermInfo ti;
5510
- FrtStore *store;
5511
- FrtFieldInfos *fis;
5512
- FrtSegmentInfo *si;
5513
- SegmentMergeInfo **smis;
5514
- int seg_cnt;
5515
- int doc_cnt;
5516
- FrtConfig *config;
5517
- FrtTermInfosWriter *tiw;
5518
- char *term_buf;
5519
- int term_buf_ptr;
5520
- int term_buf_size;
5521
- FrtPriorityQueue *queue;
5522
- SkipBuffer *skip_buf;
5523
- FrtOutStream *frq_out;
5524
- FrtOutStream *prx_out;
5525
- } SegmentMerger;
5526
-
5527
- static SegmentMerger *sm_create(FrtIndexWriter *iw, FrtSegmentInfo *si, FrtSegmentInfo **seg_infos, const int seg_cnt)
5528
- {
5529
- int i;
5530
- SegmentMerger *sm = FRT_ALLOC_AND_ZERO_N(SegmentMerger, seg_cnt);
5531
- sm->store = iw->store;
5532
- FRT_REF(sm->store);
5533
- sm->fis = iw->fis;
5534
- sm->si = si;
5535
- sm->doc_cnt = 0;
5536
- sm->smis = FRT_ALLOC_N(SegmentMergeInfo *, seg_cnt);
5537
- for (i = 0; i < seg_cnt; i++) {
5538
- sm->smis[i] = smi_new(sm->doc_cnt, seg_infos[i]->store, seg_infos[i]);
5539
- sm->doc_cnt += sm->smis[i]->doc_cnt;
5540
- }
5541
- sm->seg_cnt = seg_cnt;
5542
- sm->config = &iw->config;
5543
- return sm;
5544
- }
5545
-
5546
- static void sm_destroy(SegmentMerger *sm)
5547
- {
5548
- int i;
5549
- const int seg_cnt = sm->seg_cnt;
5550
- for (i = 0; i < seg_cnt; i++) {
5551
- smi_destroy(sm->smis[i]);
5552
- }
5553
- frt_store_close(sm->store);
5554
- free(sm->smis);
5555
- free(sm);
5556
- }
5557
-
5558
- static void sm_merge_fields(SegmentMerger *sm)
5559
- {
5560
- int i, j;
5561
- frt_off_t start, end = 0;
5562
- char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
5563
- FrtOutStream *fdt_out, *fdx_out;
5564
- FrtStore *store = sm->store;
5565
- const int seg_cnt = sm->seg_cnt;
5566
-
5567
- sprintf(file_name, "%s.fdt", sm->si->name);
5568
- fdt_out = store->new_output(store, file_name);
5569
-
5570
- sprintf(file_name, "%s.fdx", sm->si->name);
5571
- fdx_out = store->new_output(store, file_name);
5572
-
5573
- for (i = 0; i < seg_cnt; i++) {
5574
- SegmentMergeInfo *smi = sm->smis[i];
5575
- const int max_doc = smi->max_doc;
5576
- FrtInStream *fdt_in, *fdx_in;
5577
- char *segment = smi->si->name;
5578
- store = smi->store;
5579
- sprintf(file_name, "%s.fdt", segment);
5580
- fdt_in = store->open_input(store, file_name);
5581
- sprintf(file_name, "%s.fdx", segment);
5582
- fdx_in = store->open_input(store, file_name);
5583
-
5584
- if (max_doc > 0) {
5585
- end = (off_t)frt_is_read_u64(fdx_in);
5586
- }
5587
- for (j = 0; j < max_doc; j++) {
5588
- frt_u32 tv_idx_offset = frt_is_read_u32(fdx_in);
5589
- start = end;
5590
- if (j == max_doc - 1) {
5591
- end = frt_is_length(fdt_in);
5592
- }
5593
- else {
5594
- end = (off_t)frt_is_read_u64(fdx_in);
5595
- }
5596
- /* skip deleted docs */
5597
- if (!smi->deleted_docs || !frt_bv_get(smi->deleted_docs, j)) {
5598
- frt_os_write_u64(fdx_out, frt_os_pos(fdt_out));
5599
- frt_os_write_u32(fdx_out, tv_idx_offset);
5600
- frt_is_seek(fdt_in, start);
5601
- frt_is2os_copy_bytes(fdt_in, fdt_out, end - start);
5602
- }
5603
- }
5604
- frt_is_close(fdt_in);
5605
- frt_is_close(fdx_in);
5606
- }
5607
- frt_os_close(fdt_out);
5608
- frt_os_close(fdx_out);
5609
- }
5610
-
5611
- static int sm_append_postings(SegmentMerger *sm, SegmentMergeInfo **matches,
5612
- const int match_size)
5613
- {
5614
- int i;
5615
- int last_doc = 0, base, doc, doc_code, freq;
5616
- int skip_interval = sm->config->skip_interval;
5617
- int *doc_map = NULL;
5618
- int df = 0; /* number of docs w/ term */
5619
- FrtTermDocEnum *tde;
5620
- SegmentMergeInfo *smi;
5621
- SkipBuffer *skip_buf = sm->skip_buf;
5622
- skip_buf_reset(skip_buf);
5623
-
5624
- for (i = 0; i < match_size; i++) {
5625
- smi = matches[i];
5626
- base = smi->base;
5627
- doc_map = smi->doc_map;
5628
- tde = smi->tde;
5629
- stpe_seek_ti(STDE(tde), &smi->te->curr_ti);
5630
-
5631
- /* since we are using copy_bytes below to copy the proximities we use
5632
- * stde_next rather than stpe_next here */
5633
- while (stde_next(tde)) {
5634
- doc = stde_doc_num(tde);
5635
- if (NULL != doc_map) {
5636
- doc = doc_map[doc]; /* work around deletions */
5637
- }
5638
- doc += base; /* convert to merged space */
5639
- assert(doc == 0 || doc > last_doc);
5640
-
5641
- df++;
5642
- if (0 == (df % skip_interval)) {
5643
- skip_buf_add(skip_buf, last_doc);
5644
- }
5645
-
5646
- doc_code = (doc - last_doc) << 1; /* use low bit to flag freq=1 */
5647
- last_doc = doc;
5648
-
5649
- freq = stde_freq(tde);
5650
- if (freq == 1) {
5651
- frt_os_write_vint(sm->frq_out, doc_code | 1); /* doc & freq=1 */
5652
- }
5653
- else {
5654
- frt_os_write_vint(sm->frq_out, doc_code); /* write doc */
5655
- frt_os_write_vint(sm->frq_out, freq); /* write freqency in doc */
5656
- }
5657
-
5658
- /* copy position deltas */
5659
- frt_is2os_copy_vints(STDE(tde)->prx_in, sm->prx_out, freq);
5660
- }
5661
- }
5662
- return df;
5663
- }
5664
-
5665
- static char *sm_cache_term(SegmentMerger *sm, char *term, int term_len)
5666
- {
5667
- term = (char *)memcpy(sm->term_buf + sm->term_buf_ptr, term, term_len + 1);
5668
- sm->term_buf_ptr += term_len + 1;
5669
- if (sm->term_buf_ptr > sm->term_buf_size) {
5670
- sm->term_buf_ptr = 0;
5671
- }
5672
- return term;
5673
- }
5674
-
5675
- static void sm_merge_term_info(SegmentMerger *sm, SegmentMergeInfo **matches,
5676
- int match_size)
5677
- {
5678
- frt_off_t frq_ptr = frt_os_pos(sm->frq_out);
5679
- frt_off_t prx_ptr = frt_os_pos(sm->prx_out);
5680
-
5681
- int df = sm_append_postings(sm, matches, match_size); /* append posting data */
5682
-
5683
- frt_off_t skip_ptr = skip_buf_write(sm->skip_buf);
5684
-
5685
- if (df > 0) {
5686
- /* add an entry to the dictionary with ptrs to prox and freq files */
5687
- SegmentMergeInfo *first_match = matches[0];
5688
- int term_len = first_match->te->curr_term_len;
5689
-
5690
- frt_ti_set(sm->ti, df, frq_ptr, prx_ptr,
5691
- (skip_ptr - frq_ptr));
5692
- frt_tiw_add(sm->tiw, sm_cache_term(sm, first_match->term, term_len),
5693
- term_len, &sm->ti);
5694
- }
5695
- }
5696
-
5697
- static void sm_merge_term_infos(SegmentMerger *sm)
5698
- {
5699
- int i, j, match_size;
5700
- SegmentMergeInfo *smi, *top, **matches;
5701
- char *term;
5702
- const int seg_cnt = sm->seg_cnt;
5703
- const int fis_size = sm->fis->size;
5704
-
5705
- matches = FRT_ALLOC_N(SegmentMergeInfo *, seg_cnt);
5706
-
5707
- for (j = 0; j < seg_cnt; j++) {
5708
- smi_load_term_input(sm->smis[j]);
5709
- }
5710
-
5711
- for (i = 0; i < fis_size; i++) {
5712
- frt_tiw_start_field(sm->tiw, i);
5713
- for (j = 0; j < seg_cnt; j++) {
5714
- smi = sm->smis[j];
5715
- ste_set_field(smi->te, i);
5716
- if (NULL != smi_next(smi)) {
5717
- frt_pq_push(sm->queue, smi); /* initialize @queue */
5718
- }
5719
- }
5720
- while (sm->queue->size > 0) {
5721
- match_size = 0; /* pop matching terms */
5722
- matches[0] = (SegmentMergeInfo *)frt_pq_pop(sm->queue);
5723
- match_size++;
5724
- term = matches[0]->term;
5725
- top = (SegmentMergeInfo *)frt_pq_top(sm->queue);
5726
- while ((NULL != top) && (0 == strcmp(term, top->term))) {
5727
- matches[match_size] = (SegmentMergeInfo *)frt_pq_pop(sm->queue);
5728
- match_size++;
5729
- top = (SegmentMergeInfo *)frt_pq_top(sm->queue);
5730
- }
5731
-
5732
- sm_merge_term_info(sm, matches, match_size);/* add new FrtTermInfo */
5733
-
5734
- while (match_size > 0) {
5735
- match_size--;
5736
- smi = matches[match_size];
5737
- if (NULL != smi_next(smi)) {
5738
- frt_pq_push(sm->queue, smi); /* restore queue */
5739
- }
5740
- }
5741
- }
5742
- }
5743
- free(matches);
5744
- for (j = 0; j < seg_cnt; j++) {
5745
- smi_close_term_input(sm->smis[j]);
5746
- }
5747
- }
5748
-
5749
- static void sm_merge_terms(SegmentMerger *sm)
5750
- {
5751
- char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
5752
-
5753
- sprintf(file_name, "%s.frq", sm->si->name);
5754
- sm->frq_out = sm->store->new_output(sm->store, file_name);
5755
- sprintf(file_name, "%s.prx", sm->si->name);
5756
- sm->prx_out = sm->store->new_output(sm->store, file_name);
5757
-
5758
- sm->tiw = frt_tiw_open(sm->store, sm->si->name, sm->config->index_interval,
5759
- sm->config->skip_interval);
5760
- sm->skip_buf = skip_buf_new(sm->frq_out, sm->prx_out);
5761
-
5762
- /* terms_buf_ptr holds a buffer of terms since the FrtTermInfosWriter needs
5763
- * to keep the last index_interval terms so that it can compare the last
5764
- * term put in the index with the next one. So the size of the buffer must
5765
- * by index_interval + 2. */
5766
- sm->term_buf_ptr = 0;
5767
- sm->term_buf_size = (sm->config->index_interval + 1) * FRT_MAX_WORD_SIZE;
5768
- sm->term_buf = FRT_ALLOC_N(char, sm->term_buf_size + FRT_MAX_WORD_SIZE);
5769
-
5770
- sm->queue = frt_pq_new(sm->seg_cnt, (frt_lt_ft)&smi_lt, NULL);
5771
-
5772
- sm_merge_term_infos(sm);
5773
-
5774
- frt_os_close(sm->frq_out);
5775
- frt_os_close(sm->prx_out);
5776
- frt_tiw_close(sm->tiw);
5777
- frt_pq_destroy(sm->queue);
5778
- skip_buf_destroy(sm->skip_buf);
5779
- free(sm->term_buf);
5780
- }
5781
-
5782
- static void sm_merge_norms(SegmentMerger *sm)
5783
- {
5784
- FrtSegmentInfo *si;
5785
- int i, j, k;
5786
- FrtStore *store;
5787
- frt_uchar byte;
5788
- FrtFieldInfo *fi;
5789
- FrtOutStream *os;
5790
- FrtInStream *is;
5791
- char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
5792
- SegmentMergeInfo *smi;
5793
- const int seg_cnt = sm->seg_cnt;
5794
- for (i = sm->fis->size - 1; i >= 0; i--) {
5795
- fi = sm->fis->fields[i];
5796
- if (bits_has_norms(fi->bits)) {
5797
- si = sm->si;
5798
- frt_si_advance_norm_gen(si, i);
5799
- si_norm_file_name(si, file_name, i);
5800
- os = sm->store->new_output(sm->store, file_name);
5801
- for (j = 0; j < seg_cnt; j++) {
5802
- smi = sm->smis[j];
5803
- si = smi->si;
5804
- if (si_norm_file_name(si, file_name, i)) {
5805
- const int max_doc = smi->max_doc;
5806
- FrtBitVector *deleted_docs = smi->deleted_docs;
5807
- store = smi->store;
5808
- is = store->open_input(store, file_name);
5809
- if (deleted_docs) {
5810
- for (k = 0; k < max_doc; k++) {
5811
- byte = frt_is_read_byte(is);
5812
- if (!frt_bv_get(deleted_docs, k)) {
5813
- frt_os_write_byte(os, byte);
5814
- }
5815
- }
5816
- }
5817
- else {
5818
- frt_is2os_copy_bytes(is, os, max_doc);
5819
- }
5820
- frt_is_close(is);
5821
- }
5822
- else {
5823
- const int doc_cnt = smi->doc_cnt;
5824
- for (k = 0; k < doc_cnt; k++) {
5825
- frt_os_write_byte(os, '\0');
5826
- }
5827
- }
5828
- }
5829
- frt_os_close(os);
5830
- }
5831
- }
5832
- }
5833
-
5834
- static int sm_merge(SegmentMerger *sm)
5835
- {
5836
- sm_merge_fields(sm);
5837
- sm_merge_terms(sm);
5838
- sm_merge_norms(sm);
5839
- return sm->doc_cnt;
5840
- }
5841
-
5842
-
5843
- /****************************************************************************
5844
- * IndexWriter
5845
- ****************************************************************************/
5846
4868
 
5847
4869
  /* prepare an index ready for writing */
5848
4870
  void frt_index_create(FrtStore *store, FrtFieldInfos *fis)
@@ -5874,68 +4896,6 @@ int frt_iw_doc_count(FrtIndexWriter *iw)
5874
4896
  return doc_cnt;
5875
4897
  }
5876
4898
 
5877
- static void iw_merge_segments(FrtIndexWriter *iw, const int min_seg, const int max_seg) {
5878
- int i;
5879
- FrtSegmentInfos *sis = iw->sis;
5880
- FrtSegmentInfo *si = frt_sis_new_segment(sis, 0, iw->store);
5881
-
5882
- SegmentMerger *merger = sm_create(iw, si, &sis->segs[min_seg], max_seg - min_seg);
5883
-
5884
- /* This is where all the action happens. */
5885
- si->doc_cnt = sm_merge(merger);
5886
-
5887
- pthread_mutex_lock(&iw->store->mutex);
5888
- /* delete merged segments */
5889
- for (i = min_seg; i < max_seg; i++) {
5890
- si_delete_files(sis->segs[i], iw->fis, iw->deleter);
5891
- }
5892
-
5893
- frt_sis_del_from_to(sis, min_seg, max_seg);
5894
-
5895
- frt_sis_write(sis, iw->store, iw->deleter);
5896
- deleter_commit_pending_deletions(iw->deleter);
5897
-
5898
- pthread_mutex_unlock(&iw->store->mutex);
5899
-
5900
- sm_destroy(merger);
5901
- }
5902
-
5903
- static void iw_merge_segments_from(FrtIndexWriter *iw, int min_segment)
5904
- {
5905
- iw_merge_segments(iw, min_segment, iw->sis->size);
5906
- }
5907
-
5908
- static void iw_maybe_merge_segments(FrtIndexWriter *iw)
5909
- {
5910
- int target_merge_docs = iw->config.merge_factor;
5911
- int min_segment, merge_docs;
5912
- FrtSegmentInfo *si;
5913
-
5914
- while (target_merge_docs > 0
5915
- && target_merge_docs <= iw->config.max_merge_docs) {
5916
- /* find segments smaller than current target size */
5917
- min_segment = iw->sis->size - 1;
5918
- merge_docs = 0;
5919
- while (min_segment >= 0) {
5920
- si = iw->sis->segs[min_segment];
5921
- if (si->doc_cnt >= target_merge_docs) {
5922
- break;
5923
- }
5924
- merge_docs += si->doc_cnt;
5925
- min_segment--;
5926
- }
5927
-
5928
- if (merge_docs >= target_merge_docs) { /* found a merge to do */
5929
- iw_merge_segments_from(iw, min_segment + 1);
5930
- }
5931
- else if (min_segment <= 0) {
5932
- break;
5933
- }
5934
-
5935
- target_merge_docs *= iw->config.merge_factor;
5936
- }
5937
- }
5938
-
5939
4899
  static void iw_flush_ram_segment(FrtIndexWriter *iw) {
5940
4900
  FrtSegmentInfos *sis = iw->sis;
5941
4901
  FrtSegmentInfo *si;
@@ -5947,7 +4907,6 @@ static void iw_flush_ram_segment(FrtIndexWriter *iw) {
5947
4907
  frt_sis_write(iw->sis, iw->store, iw->deleter);
5948
4908
  deleter_commit_pending_deletions(iw->deleter);
5949
4909
  pthread_mutex_unlock(&iw->store->mutex);
5950
- // iw_maybe_merge_segments(iw);
5951
4910
  }
5952
4911
 
5953
4912
  void frt_iw_add_doc(FrtIndexWriter *iw, FrtDocument *doc)
@@ -6051,26 +5010,6 @@ void frt_iw_delete_terms(FrtIndexWriter *iw, ID field, char **terms, const int t
6051
5010
  }
6052
5011
  }
6053
5012
 
6054
- static void iw_optimize_i(FrtIndexWriter *iw)
6055
- {
6056
- int min_segment;
6057
- iw_commit_i(iw);
6058
- // while (iw->sis->size > 1
6059
- // || (iw->sis->size == 1
6060
- // && (frt_si_has_deletions(iw->sis->segs[0])
6061
- // || (iw->sis->segs[0]->store != iw->store)))) {
6062
- // min_segment = iw->sis->size - iw->config.merge_factor;
6063
- // iw_merge_segments_from(iw, min_segment < 0 ? 0 : min_segment);
6064
- // }
6065
- }
6066
-
6067
- void frt_iw_optimize(FrtIndexWriter *iw)
6068
- {
6069
- pthread_mutex_lock(&iw->mutex);
6070
- iw_optimize_i(iw);
6071
- pthread_mutex_unlock(&iw->mutex);
6072
- }
6073
-
6074
5013
  void frt_iw_close(FrtIndexWriter *iw)
6075
5014
  {
6076
5015
  pthread_mutex_lock(&iw->mutex);
@@ -6398,7 +5337,6 @@ static void iw_add_segments(FrtIndexWriter *iw, FrtIndexReader *ir) {
6398
5337
  void frt_iw_add_readers(FrtIndexWriter *iw, FrtIndexReader **readers, const int r_cnt) {
6399
5338
  int i;
6400
5339
  pthread_mutex_lock(&iw->mutex);
6401
- iw_optimize_i(iw);
6402
5340
 
6403
5341
  for (i = 0; i < r_cnt; i++) {
6404
5342
  iw_add_segments(iw, readers[i]);
@@ -6410,6 +5348,5 @@ void frt_iw_add_readers(FrtIndexWriter *iw, FrtIndexReader **readers, const int
6410
5348
  frt_sis_write(iw->sis, iw->store, iw->deleter);
6411
5349
  pthread_mutex_unlock(&iw->store->mutex);
6412
5350
 
6413
- iw_optimize_i(iw);
6414
5351
  pthread_mutex_unlock(&iw->mutex);
6415
5352
  }