isomorfeus-ferret 0.12.7 → 0.13.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +101 -19
- data/README.md +54 -1
- data/ext/isomorfeus_ferret_ext/bm_bitvector.c +22 -30
- data/ext/isomorfeus_ferret_ext/bm_hash.c +6 -12
- data/ext/isomorfeus_ferret_ext/bm_micro_string.c +3 -6
- data/ext/isomorfeus_ferret_ext/bm_store.c +11 -22
- data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +1 -1
- data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +1 -1
- data/ext/isomorfeus_ferret_ext/bzip_blocksort.c +1094 -0
- data/ext/isomorfeus_ferret_ext/bzip_huffman.c +205 -0
- data/ext/isomorfeus_ferret_ext/bzlib.c +1572 -0
- data/ext/isomorfeus_ferret_ext/bzlib.h +282 -0
- data/ext/isomorfeus_ferret_ext/bzlib_compress.c +672 -0
- data/ext/isomorfeus_ferret_ext/bzlib_crctable.c +104 -0
- data/ext/isomorfeus_ferret_ext/bzlib_decompress.c +652 -0
- data/ext/isomorfeus_ferret_ext/bzlib_private.h +509 -0
- data/ext/isomorfeus_ferret_ext/bzlib_randtable.c +84 -0
- data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -53
- data/ext/isomorfeus_ferret_ext/frb_analysis.c +785 -1192
- data/ext/isomorfeus_ferret_ext/frb_index.c +492 -474
- data/ext/isomorfeus_ferret_ext/frb_qparser.c +48 -60
- data/ext/isomorfeus_ferret_ext/frb_search.c +1520 -1002
- data/ext/isomorfeus_ferret_ext/frb_store.c +96 -96
- data/ext/isomorfeus_ferret_ext/frb_threading.h +0 -1
- data/ext/isomorfeus_ferret_ext/frb_utils.c +147 -196
- data/ext/isomorfeus_ferret_ext/frt_analysis.c +695 -1090
- data/ext/isomorfeus_ferret_ext/frt_analysis.h +174 -170
- data/ext/isomorfeus_ferret_ext/frt_array.c +2 -4
- data/ext/isomorfeus_ferret_ext/frt_bitvector.c +9 -16
- data/ext/isomorfeus_ferret_ext/frt_bitvector.h +32 -81
- data/ext/isomorfeus_ferret_ext/frt_document.c +15 -20
- data/ext/isomorfeus_ferret_ext/frt_document.h +10 -10
- data/ext/isomorfeus_ferret_ext/frt_except.c +5 -12
- data/ext/isomorfeus_ferret_ext/frt_field_index.c +3 -3
- data/ext/isomorfeus_ferret_ext/frt_field_index.h +6 -7
- data/ext/isomorfeus_ferret_ext/frt_filter.c +35 -46
- data/ext/isomorfeus_ferret_ext/frt_fs_store.c +1 -0
- data/ext/isomorfeus_ferret_ext/frt_global.c +105 -63
- data/ext/isomorfeus_ferret_ext/frt_global.h +7 -3
- data/ext/isomorfeus_ferret_ext/frt_hash.c +1 -2
- data/ext/isomorfeus_ferret_ext/frt_ind.c +32 -35
- data/ext/isomorfeus_ferret_ext/frt_ind.h +9 -9
- data/ext/isomorfeus_ferret_ext/frt_index.c +580 -399
- data/ext/isomorfeus_ferret_ext/frt_index.h +272 -291
- data/ext/isomorfeus_ferret_ext/frt_mempool.c +1 -2
- data/ext/isomorfeus_ferret_ext/frt_multimapper.c +4 -7
- data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +67 -91
- data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +35 -38
- data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +53 -72
- data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +25 -32
- data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +21 -23
- data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +66 -103
- data/ext/isomorfeus_ferret_ext/frt_q_parser.c +207 -195
- data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +20 -16
- data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +17 -14
- data/ext/isomorfeus_ferret_ext/frt_q_range.c +102 -131
- data/ext/isomorfeus_ferret_ext/frt_q_span.c +179 -178
- data/ext/isomorfeus_ferret_ext/frt_q_term.c +47 -60
- data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +18 -16
- data/ext/isomorfeus_ferret_ext/frt_ram_store.c +45 -84
- data/ext/isomorfeus_ferret_ext/frt_search.c +105 -146
- data/ext/isomorfeus_ferret_ext/frt_search.h +331 -320
- data/ext/isomorfeus_ferret_ext/frt_similarity.c +5 -13
- data/ext/isomorfeus_ferret_ext/frt_similarity.h +7 -12
- data/ext/isomorfeus_ferret_ext/frt_sort.c +105 -149
- data/ext/isomorfeus_ferret_ext/frt_store.c +13 -7
- data/ext/isomorfeus_ferret_ext/frt_store.h +10 -2
- data/ext/isomorfeus_ferret_ext/frt_threading.h +0 -1
- data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +21 -109
- data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +2 -32
- data/ext/isomorfeus_ferret_ext/lz4.c +2495 -0
- data/ext/isomorfeus_ferret_ext/lz4.h +774 -0
- data/ext/isomorfeus_ferret_ext/lz4frame.c +1899 -0
- data/ext/isomorfeus_ferret_ext/lz4frame.h +623 -0
- data/ext/isomorfeus_ferret_ext/lz4hc.c +1615 -0
- data/ext/isomorfeus_ferret_ext/lz4hc.h +413 -0
- data/ext/isomorfeus_ferret_ext/lz4xxhash.c +1030 -0
- data/ext/isomorfeus_ferret_ext/lz4xxhash.h +328 -0
- data/ext/isomorfeus_ferret_ext/stem_modules.h +0 -86
- data/ext/isomorfeus_ferret_ext/test.c +1 -2
- data/ext/isomorfeus_ferret_ext/test_1710.c +11 -12
- data/ext/isomorfeus_ferret_ext/test_analysis.c +590 -583
- data/ext/isomorfeus_ferret_ext/test_compound_io.c +1 -1
- data/ext/isomorfeus_ferret_ext/test_document.c +19 -15
- data/ext/isomorfeus_ferret_ext/test_except.c +1 -2
- data/ext/isomorfeus_ferret_ext/test_fields.c +59 -60
- data/ext/isomorfeus_ferret_ext/test_file_deleter.c +10 -27
- data/ext/isomorfeus_ferret_ext/test_filter.c +11 -8
- data/ext/isomorfeus_ferret_ext/test_hash.c +2 -2
- data/ext/isomorfeus_ferret_ext/test_hashset.c +1 -1
- data/ext/isomorfeus_ferret_ext/test_highlighter.c +15 -11
- data/ext/isomorfeus_ferret_ext/test_index.c +372 -365
- data/ext/isomorfeus_ferret_ext/test_q_const_score.c +5 -3
- data/ext/isomorfeus_ferret_ext/test_q_filtered.c +5 -3
- data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +13 -10
- data/ext/isomorfeus_ferret_ext/test_q_parser.c +45 -7
- data/ext/isomorfeus_ferret_ext/test_q_span.c +15 -12
- data/ext/isomorfeus_ferret_ext/test_ram_store.c +3 -3
- data/ext/isomorfeus_ferret_ext/test_search.c +60 -62
- data/ext/isomorfeus_ferret_ext/test_segments.c +5 -4
- data/ext/isomorfeus_ferret_ext/test_sort.c +17 -14
- data/ext/isomorfeus_ferret_ext/test_store.c +2 -0
- data/ext/isomorfeus_ferret_ext/test_term.c +3 -1
- data/ext/isomorfeus_ferret_ext/test_term_vectors.c +9 -10
- data/ext/isomorfeus_ferret_ext/test_test.c +1 -2
- data/ext/isomorfeus_ferret_ext/test_threading.c +9 -10
- data/ext/isomorfeus_ferret_ext/testhelper.c +1 -2
- data/lib/isomorfeus/ferret/version.rb +1 -1
- metadata +27 -57
- data/ext/isomorfeus_ferret_ext/email.rl +0 -21
- data/ext/isomorfeus_ferret_ext/frt_scanner.c +0 -900
- data/ext/isomorfeus_ferret_ext/frt_scanner.h +0 -28
- data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +0 -6706
- data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +0 -4420
- data/ext/isomorfeus_ferret_ext/scanner.h +0 -28
- data/ext/isomorfeus_ferret_ext/scanner.in +0 -43
- data/ext/isomorfeus_ferret_ext/scanner.rl +0 -84
- data/ext/isomorfeus_ferret_ext/scanner_mb.rl +0 -200
- data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +0 -85
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +0 -1167
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +0 -1433
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +0 -301
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +0 -590
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +0 -1049
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +0 -705
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +0 -1239
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +0 -477
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +0 -1217
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +0 -7
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +0 -394
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +0 -457
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +0 -1009
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +0 -259
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +0 -704
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +0 -948
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +0 -1028
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +0 -275
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +0 -849
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +0 -952
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +0 -669
- data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_modules.txt +0 -63
- data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +0 -1854
- data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +0 -1999
- data/ext/isomorfeus_ferret_ext/url.rl +0 -27
@@ -8,6 +8,11 @@
|
|
8
8
|
#include <ctype.h>
|
9
9
|
#include "brotli_decode.h"
|
10
10
|
#include "brotli_encode.h"
|
11
|
+
#include "bzlib.h"
|
12
|
+
#include "lz4frame.h"
|
13
|
+
|
14
|
+
#undef close
|
15
|
+
#undef read
|
11
16
|
|
12
17
|
extern void frt_micro_sleep(const int micro_seconds);
|
13
18
|
|
@@ -41,8 +46,9 @@ static char *ste_next(FrtTermEnum *te);
|
|
41
46
|
#define FORMAT 0
|
42
47
|
#define SEGMENTS_GEN_FILE_NAME "segments"
|
43
48
|
#define MAX_EXT_LEN 10
|
44
|
-
#define
|
45
|
-
#define
|
49
|
+
#define FRT_COMPRESSION_BUFFER_SIZE 16348
|
50
|
+
#define FRT_BROTLI_COMPRESSION_LEVEL 4
|
51
|
+
#define FRT_BZIP_COMPRESSION_LEVEL 9
|
46
52
|
|
47
53
|
/* *** Must be three characters *** */
|
48
54
|
static const char *INDEX_EXTENSIONS[] = {
|
@@ -103,29 +109,22 @@ static frt_u64 str36_to_u64(char *p)
|
|
103
109
|
* @param ext extension of the filename (including .)
|
104
110
|
* @param gen generation
|
105
111
|
*/
|
106
|
-
char *frt_fn_for_generation(char *buf,
|
107
|
-
const char *base,
|
108
|
-
const char *ext,
|
109
|
-
frt_i64 gen)
|
110
|
-
{
|
112
|
+
char *frt_fn_for_generation(char *buf, const char *base, const char *ext, frt_i64 gen) {
|
111
113
|
if (-1 == gen) {
|
112
114
|
return NULL;
|
113
|
-
}
|
114
|
-
else {
|
115
|
+
} else {
|
115
116
|
char b[FRT_SEGMENT_NAME_MAX_LENGTH];
|
116
117
|
char *u = u64_to_str36(b, FRT_SEGMENT_NAME_MAX_LENGTH, (frt_u64)gen);
|
117
118
|
if (ext == NULL) {
|
118
119
|
sprintf(buf, "%s_%s", base, u);
|
119
|
-
}
|
120
|
-
else {
|
120
|
+
} else {
|
121
121
|
sprintf(buf, "%s_%s.%s", base, u, ext);
|
122
122
|
}
|
123
123
|
return buf;
|
124
124
|
}
|
125
125
|
}
|
126
126
|
|
127
|
-
static char *segfn_for_generation(char *buf, frt_u64 generation)
|
128
|
-
{
|
127
|
+
static char *segfn_for_generation(char *buf, frt_u64 generation) {
|
129
128
|
char b[FRT_SEGMENT_NAME_MAX_LENGTH];
|
130
129
|
char *u = u64_to_str36(b, FRT_SEGMENT_NAME_MAX_LENGTH, generation);
|
131
130
|
sprintf(buf, FRT_SEGMENTS_FILE_NAME"_%s", u);
|
@@ -203,8 +202,7 @@ FrtCacheObject *frt_co_create(FrtHash *ref_tab1, FrtHash *ref_tab2,
|
|
203
202
|
return self;
|
204
203
|
}
|
205
204
|
|
206
|
-
FrtHash *frt_co_hash_create()
|
207
|
-
{
|
205
|
+
FrtHash *frt_co_hash_create(void) {
|
208
206
|
return frt_h_new(&co_hash, &co_eq, (frt_free_ft)NULL, (frt_free_ft)&co_destroy);
|
209
207
|
}
|
210
208
|
|
@@ -214,22 +212,33 @@ FrtHash *frt_co_hash_create()
|
|
214
212
|
*
|
215
213
|
****************************************************************************/
|
216
214
|
|
217
|
-
static void fi_set_store(FrtFieldInfo *fi,
|
218
|
-
{
|
215
|
+
static void fi_set_store(FrtFieldInfo *fi, FrtStoreValue store) {
|
219
216
|
switch (store) {
|
220
217
|
case FRT_STORE_NO:
|
221
218
|
break;
|
222
219
|
case FRT_STORE_YES:
|
223
220
|
fi->bits |= FRT_FI_IS_STORED_BM;
|
224
221
|
break;
|
225
|
-
|
226
|
-
|
222
|
+
}
|
223
|
+
}
|
224
|
+
|
225
|
+
static void fi_set_compression(FrtFieldInfo *fi, FrtCompressionType compression) {
|
226
|
+
switch (compression) {
|
227
|
+
case FRT_COMPRESSION_NONE:
|
228
|
+
break;
|
229
|
+
case FRT_COMPRESSION_BROTLI:
|
230
|
+
fi->bits |= FRT_FI_IS_COMPRESSED_BM | FRT_FI_COMPRESSION_BROTLI_BM;
|
231
|
+
break;
|
232
|
+
case FRT_COMPRESSION_BZ2:
|
233
|
+
fi->bits |= FRT_FI_IS_COMPRESSED_BM | FRT_FI_COMPRESSION_BZ2_BM;
|
234
|
+
break;
|
235
|
+
case FRT_COMPRESSION_LZ4:
|
236
|
+
fi->bits |= FRT_FI_IS_COMPRESSED_BM | FRT_FI_COMPRESSION_LZ4_BM;
|
227
237
|
break;
|
228
238
|
}
|
229
239
|
}
|
230
240
|
|
231
|
-
static void fi_set_index(FrtFieldInfo *fi, int index)
|
232
|
-
{
|
241
|
+
static void fi_set_index(FrtFieldInfo *fi, FrtIndexValue index) {
|
233
242
|
switch (index) {
|
234
243
|
case FRT_INDEX_NO:
|
235
244
|
break;
|
@@ -249,8 +258,7 @@ static void fi_set_index(FrtFieldInfo *fi, int index)
|
|
249
258
|
}
|
250
259
|
}
|
251
260
|
|
252
|
-
static void fi_set_term_vector(FrtFieldInfo *fi, int term_vector)
|
253
|
-
{
|
261
|
+
static void fi_set_term_vector(FrtFieldInfo *fi, FrtTermVectorValue term_vector) {
|
254
262
|
switch (term_vector) {
|
255
263
|
case FRT_TERM_VECTOR_NO:
|
256
264
|
break;
|
@@ -270,33 +278,40 @@ static void fi_set_term_vector(FrtFieldInfo *fi, int term_vector)
|
|
270
278
|
}
|
271
279
|
}
|
272
280
|
|
273
|
-
static void fi_check_params(
|
274
|
-
{
|
281
|
+
static void fi_check_params(FrtStoreValue store, FrtCompressionType compression, FrtIndexValue index, FrtTermVectorValue term_vector) {
|
275
282
|
(void)store;
|
276
283
|
if ((index == FRT_INDEX_NO) && (term_vector != FRT_TERM_VECTOR_NO)) {
|
277
|
-
FRT_RAISE(FRT_ARG_ERROR,
|
278
|
-
|
284
|
+
FRT_RAISE(FRT_ARG_ERROR, "You can't store the term vectors of an unindexed field.");
|
285
|
+
}
|
286
|
+
if ((compression != FRT_COMPRESSION_NONE) && (store == FRT_STORE_NO)) {
|
287
|
+
FRT_RAISE(FRT_ARG_ERROR, "Field must be stored for compression to be useful.");
|
279
288
|
}
|
280
289
|
}
|
281
290
|
|
282
|
-
FrtFieldInfo *
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
{
|
287
|
-
FrtFieldInfo *fi = FRT_ALLOC(FrtFieldInfo);
|
291
|
+
FrtFieldInfo *frt_fi_alloc(void) {
|
292
|
+
return FRT_ALLOC(FrtFieldInfo);
|
293
|
+
}
|
294
|
+
|
295
|
+
FrtFieldInfo *frt_fi_init(FrtFieldInfo *fi, ID name, FrtStoreValue store, FrtCompressionType compression, FrtIndexValue index, FrtTermVectorValue term_vector) {
|
288
296
|
assert(NULL != name);
|
289
|
-
fi_check_params(store, index, term_vector);
|
297
|
+
fi_check_params(store, compression, index, term_vector);
|
290
298
|
fi->name = name;
|
291
299
|
fi->boost = 1.0f;
|
292
300
|
fi->bits = 0;
|
293
301
|
fi_set_store(fi, store);
|
302
|
+
fi_set_compression(fi, compression);
|
294
303
|
fi_set_index(fi, index);
|
295
304
|
fi_set_term_vector(fi, term_vector);
|
296
305
|
fi->ref_cnt = 1;
|
306
|
+
fi->rfi = Qnil;
|
297
307
|
return fi;
|
298
308
|
}
|
299
309
|
|
310
|
+
FrtFieldInfo *frt_fi_new(ID name, FrtStoreValue store, FrtCompressionType compression, FrtIndexValue index, FrtTermVectorValue term_vector) {
|
311
|
+
FrtFieldInfo *fi = frt_fi_alloc();
|
312
|
+
return frt_fi_init(fi, name, store, compression, index, term_vector);
|
313
|
+
}
|
314
|
+
|
300
315
|
void frt_fi_deref(FrtFieldInfo *fi)
|
301
316
|
{
|
302
317
|
if (0 == --(fi->ref_cnt)) {
|
@@ -304,6 +319,22 @@ void frt_fi_deref(FrtFieldInfo *fi)
|
|
304
319
|
}
|
305
320
|
}
|
306
321
|
|
322
|
+
FrtCompressionType frt_fi_get_compression(FrtFieldInfo *fi) {
|
323
|
+
if (fi_is_compressed(fi)) {
|
324
|
+
if (fi_is_compressed_brotli(fi)) {
|
325
|
+
return FRT_COMPRESSION_BROTLI;
|
326
|
+
} else if (fi_is_compressed_bz2(fi)) {
|
327
|
+
return FRT_COMPRESSION_BZ2;
|
328
|
+
} else if (fi_is_compressed_lz4(fi)) {
|
329
|
+
return FRT_COMPRESSION_LZ4;
|
330
|
+
} else {
|
331
|
+
return FRT_COMPRESSION_BROTLI;
|
332
|
+
}
|
333
|
+
} else {
|
334
|
+
return FRT_COMPRESSION_NONE;
|
335
|
+
}
|
336
|
+
}
|
337
|
+
|
307
338
|
char *frt_fi_to_s(FrtFieldInfo *fi)
|
308
339
|
{
|
309
340
|
const char *fi_name = rb_id2name(fi->name);
|
@@ -333,24 +364,31 @@ char *frt_fi_to_s(FrtFieldInfo *fi)
|
|
333
364
|
*
|
334
365
|
****************************************************************************/
|
335
366
|
|
336
|
-
FrtFieldInfos *
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
367
|
+
FrtFieldInfos *frt_fis_alloc(void) {
|
368
|
+
return FRT_ALLOC(FrtFieldInfos);
|
369
|
+
}
|
370
|
+
|
371
|
+
FrtFieldInfos *frt_fis_init(FrtFieldInfos *fis, FrtStoreValue store, FrtCompressionType compression, FrtIndexValue index, FrtTermVectorValue term_vector) {
|
372
|
+
fi_check_params(store, compression, index, term_vector);
|
341
373
|
fis->field_dict = frt_h_new_ptr((frt_free_ft)&frt_fi_deref);
|
342
374
|
fis->size = 0;
|
343
375
|
fis->capa = FIELD_INFOS_INIT_CAPA;
|
344
376
|
fis->fields = FRT_ALLOC_N(FrtFieldInfo *, fis->capa);
|
345
377
|
fis->store = store;
|
378
|
+
fis->compression = compression;
|
346
379
|
fis->index = index;
|
347
380
|
fis->term_vector = term_vector;
|
348
381
|
fis->ref_cnt = 1;
|
382
|
+
fis->rfis = Qnil;
|
349
383
|
return fis;
|
350
384
|
}
|
351
385
|
|
352
|
-
|
353
|
-
|
386
|
+
FrtFieldInfos *frt_fis_new(FrtStoreValue store, FrtCompressionType compression, FrtIndexValue index, FrtTermVectorValue term_vector) {
|
387
|
+
FrtFieldInfos *fis = frt_fis_alloc();
|
388
|
+
return frt_fis_init(fis, store, compression, index, term_vector);
|
389
|
+
}
|
390
|
+
|
391
|
+
FrtFieldInfo *frt_fis_add_field(FrtFieldInfos *fis, FrtFieldInfo *fi) {
|
354
392
|
if (fis->size == fis->capa) {
|
355
393
|
fis->capa <<= 1;
|
356
394
|
FRT_REALLOC_N(fis->fields, FrtFieldInfo *, fis->capa);
|
@@ -364,23 +402,20 @@ FrtFieldInfo *frt_fis_add_field(FrtFieldInfos *fis, FrtFieldInfo *fi)
|
|
364
402
|
return fi;
|
365
403
|
}
|
366
404
|
|
367
|
-
FrtFieldInfo *frt_fis_get_field(FrtFieldInfos *fis,
|
368
|
-
{
|
405
|
+
FrtFieldInfo *frt_fis_get_field(FrtFieldInfos *fis, ID name) {
|
369
406
|
return (FrtFieldInfo *)frt_h_get(fis->field_dict, (void *)name);
|
370
407
|
}
|
371
408
|
|
372
|
-
int frt_fis_get_field_num(FrtFieldInfos *fis,
|
373
|
-
{
|
409
|
+
int frt_fis_get_field_num(FrtFieldInfos *fis, ID name) {
|
374
410
|
FrtFieldInfo *fi = (FrtFieldInfo *)frt_h_get(fis->field_dict, (void *)name);
|
375
411
|
if (fi) { return fi->number; }
|
376
412
|
else { return -1; }
|
377
413
|
}
|
378
414
|
|
379
|
-
FrtFieldInfo *frt_fis_get_or_add_field(FrtFieldInfos *fis,
|
380
|
-
{
|
415
|
+
FrtFieldInfo *frt_fis_get_or_add_field(FrtFieldInfos *fis, ID name) {
|
381
416
|
FrtFieldInfo *fi = (FrtFieldInfo *)frt_h_get(fis->field_dict, (void *)name);
|
382
417
|
if (!fi) {
|
383
|
-
fi = (FrtFieldInfo*)frt_fi_new(name, fis->store, fis->index, fis->term_vector);
|
418
|
+
fi = (FrtFieldInfo*)frt_fi_new(name, fis->store, fis->compression, fis->index, fis->term_vector);
|
384
419
|
frt_fis_add_field(fis, fi);
|
385
420
|
}
|
386
421
|
return fi;
|
@@ -392,16 +427,14 @@ FrtFieldInfos *frt_fis_read(FrtInStream *is)
|
|
392
427
|
char *field_name;
|
393
428
|
FRT_TRY
|
394
429
|
do {
|
395
|
-
FrtStoreValue store_val;
|
396
|
-
FrtIndexValue index_val;
|
397
430
|
FrtTermVectorValue term_vector_val;
|
398
431
|
volatile int i;
|
399
432
|
union { frt_u32 i; float f; } tmp;
|
400
433
|
FrtFieldInfo *volatile fi;
|
401
|
-
store_val = (FrtStoreValue)frt_is_read_vint(is);
|
402
|
-
index_val = (FrtIndexValue)frt_is_read_vint(is);
|
434
|
+
FrtStoreValue store_val = (FrtStoreValue)frt_is_read_vint(is);
|
435
|
+
FrtIndexValue index_val = (FrtIndexValue)frt_is_read_vint(is);
|
403
436
|
term_vector_val = (FrtTermVectorValue)frt_is_read_vint(is);
|
404
|
-
fis = frt_fis_new(store_val, index_val, term_vector_val);
|
437
|
+
fis = frt_fis_new(store_val, FRT_COMPRESSION_NONE, index_val, term_vector_val); // TODO compression, read from store?
|
405
438
|
for (i = frt_is_read_vint(is); i > 0; i--) {
|
406
439
|
fi = FRT_ALLOC_AND_ZERO(FrtFieldInfo);
|
407
440
|
FRT_TRY
|
@@ -803,8 +836,7 @@ static char *sis_next_seg_file_name(char *buf, FrtStore *store)
|
|
803
836
|
|
804
837
|
#define GEN_FILE_RETRY_COUNT 10
|
805
838
|
#define GEN_LOOK_AHEAD_COUNT 10
|
806
|
-
static void sis_find_segments_file(FrtStore *store, FindSegmentsFile *fsf,
|
807
|
-
void (*run)(FrtStore *store, FindSegmentsFile *fsf))
|
839
|
+
static void sis_find_segments_file(FrtStore *store, FindSegmentsFile *fsf, void (*run)(FrtStore *store, FindSegmentsFile *fsf, FrtIndexReader *ir), FrtIndexReader *ir)
|
808
840
|
{
|
809
841
|
volatile int i;
|
810
842
|
volatile int gen_look_ahead_count = 0;
|
@@ -911,7 +943,7 @@ static void sis_find_segments_file(FrtStore *store, FindSegmentsFile *fsf,
|
|
911
943
|
last_gen = gen;
|
912
944
|
FRT_TRY
|
913
945
|
fsf->generation = gen;
|
914
|
-
run(store, fsf);
|
946
|
+
run(store, fsf, ir);
|
915
947
|
FRT_RETURN_EARLY();
|
916
948
|
return;
|
917
949
|
case FRT_IO_ERROR: case FRT_FILE_NOT_FOUND_ERROR: case FRT_EOF_ERROR:
|
@@ -957,7 +989,7 @@ static void sis_find_segments_file(FrtStore *store, FindSegmentsFile *fsf,
|
|
957
989
|
* prevSegmentFileName + "'" */
|
958
990
|
FRT_TRY
|
959
991
|
fsf->generation = gen - 1;
|
960
|
-
run(store, fsf);
|
992
|
+
run(store, fsf, ir);
|
961
993
|
/* TODO:LOG "success on fallback " +
|
962
994
|
* prev_seg_file_name */
|
963
995
|
|
@@ -1040,7 +1072,7 @@ void frt_sis_del_from_to(FrtSegmentInfos *sis, int from, int to)
|
|
1040
1072
|
}
|
1041
1073
|
}
|
1042
1074
|
|
1043
|
-
static void frt_sis_read_i(FrtStore *store, FindSegmentsFile *fsf)
|
1075
|
+
static void frt_sis_read_i(FrtStore *store, FindSegmentsFile *fsf, FrtIndexReader *ir_)
|
1044
1076
|
{
|
1045
1077
|
int seg_cnt;
|
1046
1078
|
int i;
|
@@ -1079,7 +1111,7 @@ static void frt_sis_read_i(FrtStore *store, FindSegmentsFile *fsf)
|
|
1079
1111
|
FrtSegmentInfos *frt_sis_read(FrtStore *store)
|
1080
1112
|
{
|
1081
1113
|
FindSegmentsFile fsf;
|
1082
|
-
sis_find_segments_file(store, &fsf, &frt_sis_read_i);
|
1114
|
+
sis_find_segments_file(store, &fsf, &frt_sis_read_i, NULL);
|
1083
1115
|
return fsf.ret.sis;
|
1084
1116
|
}
|
1085
1117
|
|
@@ -1119,7 +1151,7 @@ void frt_sis_write(FrtSegmentInfos *sis, FrtStore *store, FrtDeleter *deleter)
|
|
1119
1151
|
}
|
1120
1152
|
}
|
1121
1153
|
|
1122
|
-
static void frt_sis_read_ver_i(FrtStore *store, FindSegmentsFile *fsf)
|
1154
|
+
static void frt_sis_read_ver_i(FrtStore *store, FindSegmentsFile *fsf, FrtIndexReader *ir_)
|
1123
1155
|
{
|
1124
1156
|
FrtInStream *is;
|
1125
1157
|
frt_u64 version;
|
@@ -1142,7 +1174,7 @@ static void frt_sis_read_ver_i(FrtStore *store, FindSegmentsFile *fsf)
|
|
1142
1174
|
frt_u64 frt_sis_read_current_version(FrtStore *store)
|
1143
1175
|
{
|
1144
1176
|
FindSegmentsFile fsf;
|
1145
|
-
sis_find_segments_file(store, &fsf, &frt_sis_read_ver_i);
|
1177
|
+
sis_find_segments_file(store, &fsf, &frt_sis_read_ver_i, NULL);
|
1146
1178
|
return fsf.ret.uint64;
|
1147
1179
|
}
|
1148
1180
|
|
@@ -1152,18 +1184,17 @@ frt_u64 frt_sis_read_current_version(FrtStore *store)
|
|
1152
1184
|
*
|
1153
1185
|
****************************************************************************/
|
1154
1186
|
|
1155
|
-
static FrtLazyDocField *lazy_df_new(
|
1156
|
-
{
|
1187
|
+
static FrtLazyDocField *lazy_df_new(ID name, const int size, FrtCompressionType compression) {
|
1157
1188
|
FrtLazyDocField *self = FRT_ALLOC(FrtLazyDocField);
|
1158
1189
|
self->name = name;
|
1159
1190
|
self->size = size;
|
1160
1191
|
self->data = FRT_ALLOC_AND_ZERO_N(FrtLazyDocFieldData, size);
|
1161
|
-
self->
|
1192
|
+
self->compression = compression;
|
1193
|
+
self->decompressed = false;
|
1162
1194
|
return self;
|
1163
1195
|
}
|
1164
1196
|
|
1165
|
-
static void lazy_df_destroy(FrtLazyDocField *self)
|
1166
|
-
{
|
1197
|
+
static void lazy_df_destroy(FrtLazyDocField *self) {
|
1167
1198
|
int i;
|
1168
1199
|
for (i = self->size - 1; i >= 0; i--) {
|
1169
1200
|
if (self->data[i].text) {
|
@@ -1174,16 +1205,14 @@ static void lazy_df_destroy(FrtLazyDocField *self)
|
|
1174
1205
|
free(self);
|
1175
1206
|
}
|
1176
1207
|
|
1177
|
-
static void comp_raise()
|
1178
|
-
{
|
1208
|
+
static void comp_raise(void) {
|
1179
1209
|
FRT_RAISE(EXCEPTION, "Compression error");
|
1180
1210
|
}
|
1181
1211
|
|
1182
|
-
static char *is_read_compressed_bytes(FrtInStream *is, int compressed_len, int *len)
|
1183
|
-
{
|
1212
|
+
static char *is_read_brotli_compressed_bytes(FrtInStream *is, int compressed_len, int *len) {
|
1184
1213
|
int buf_out_idx = 0;
|
1185
1214
|
int read_len;
|
1186
|
-
frt_uchar buf_in[
|
1215
|
+
frt_uchar buf_in[FRT_COMPRESSION_BUFFER_SIZE];
|
1187
1216
|
const frt_uchar *next_in;
|
1188
1217
|
size_t available_in;
|
1189
1218
|
frt_uchar *buf_out = NULL;
|
@@ -1195,20 +1224,20 @@ static char *is_read_compressed_bytes(FrtInStream *is, int compressed_len, int *
|
|
1195
1224
|
if (!b_state) { comp_raise(); return NULL; }
|
1196
1225
|
|
1197
1226
|
do {
|
1198
|
-
read_len = compressed_len >
|
1227
|
+
read_len = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
|
1199
1228
|
frt_is_read_bytes(is, buf_in, read_len);
|
1200
1229
|
compressed_len -= read_len;
|
1201
1230
|
available_in = read_len;
|
1202
1231
|
next_in = buf_in;
|
1203
|
-
available_out =
|
1232
|
+
available_out = FRT_COMPRESSION_BUFFER_SIZE;
|
1204
1233
|
do {
|
1205
|
-
FRT_REALLOC_N(buf_out, frt_uchar, buf_out_idx +
|
1234
|
+
FRT_REALLOC_N(buf_out, frt_uchar, buf_out_idx + FRT_COMPRESSION_BUFFER_SIZE);
|
1206
1235
|
next_out = buf_out + buf_out_idx;
|
1207
1236
|
b_result = BrotliDecoderDecompressStream(b_state,
|
1208
1237
|
&available_in, &next_in,
|
1209
1238
|
&available_out, &next_out, NULL);
|
1210
1239
|
if (b_result == BROTLI_DECODER_RESULT_ERROR) { comp_raise(); return NULL; }
|
1211
|
-
buf_out_idx +=
|
1240
|
+
buf_out_idx += FRT_COMPRESSION_BUFFER_SIZE - available_out;
|
1212
1241
|
} while (b_result == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT);
|
1213
1242
|
} while (b_result != BROTLI_DECODER_RESULT_SUCCESS && compressed_len > 0);
|
1214
1243
|
|
@@ -1220,16 +1249,180 @@ static char *is_read_compressed_bytes(FrtInStream *is, int compressed_len, int *
|
|
1220
1249
|
return (char *)buf_out;
|
1221
1250
|
}
|
1222
1251
|
|
1223
|
-
|
1224
|
-
{
|
1252
|
+
static void zraise(int ret) {
|
1253
|
+
switch (ret) {
|
1254
|
+
case BZ_IO_ERROR:
|
1255
|
+
if (ferror(stdin))
|
1256
|
+
FRT_RAISE(FRT_IO_ERROR, "bzlib: error reading stdin");
|
1257
|
+
if (ferror(stdout))
|
1258
|
+
FRT_RAISE(FRT_IO_ERROR, "bzlib: error writing stdout");
|
1259
|
+
break;
|
1260
|
+
case BZ_CONFIG_ERROR:
|
1261
|
+
FRT_RAISE(FRT_IO_ERROR, "bzlib: system configuration error");
|
1262
|
+
break;
|
1263
|
+
case BZ_SEQUENCE_ERROR: /* shouldn't occur if code is correct */
|
1264
|
+
FRT_RAISE(FRT_IO_ERROR, "bzlib: !!BUG!! sequence error");
|
1265
|
+
break;
|
1266
|
+
case BZ_PARAM_ERROR: /* shouldn't occur if code is correct */
|
1267
|
+
FRT_RAISE(FRT_IO_ERROR, "bzlib: !!BUG!! parameter error");
|
1268
|
+
break;
|
1269
|
+
case BZ_MEM_ERROR:
|
1270
|
+
FRT_RAISE(FRT_IO_ERROR, "bzlib: memory error");
|
1271
|
+
break;
|
1272
|
+
case BZ_DATA_ERROR:
|
1273
|
+
FRT_RAISE(FRT_IO_ERROR, "bzlib: data integrity check error");
|
1274
|
+
break;
|
1275
|
+
case BZ_DATA_ERROR_MAGIC:
|
1276
|
+
FRT_RAISE(FRT_IO_ERROR, "bzlib: data integrity check - non-matching magic");
|
1277
|
+
break;
|
1278
|
+
case BZ_UNEXPECTED_EOF:
|
1279
|
+
FRT_RAISE(FRT_IO_ERROR, "bzlib: unexpected end-of-file");
|
1280
|
+
break;
|
1281
|
+
case BZ_OUTBUFF_FULL:
|
1282
|
+
FRT_RAISE(FRT_IO_ERROR, "bzlib: output buffer full");
|
1283
|
+
break;
|
1284
|
+
default:
|
1285
|
+
FRT_RAISE(FRT_EXCEPTION, "bzlib: unknown error");
|
1286
|
+
}
|
1287
|
+
}
|
1288
|
+
|
1289
|
+
static char *is_read_bz2_compressed_bytes(FrtInStream *is, int compressed_len, int *len) {
|
1290
|
+
int buf_out_idx = 0, ret, read_len;
|
1291
|
+
char *buf_out = NULL;
|
1292
|
+
char buf_in[FRT_COMPRESSION_BUFFER_SIZE];
|
1293
|
+
bz_stream zstrm;
|
1294
|
+
zstrm.bzalloc = NULL;
|
1295
|
+
zstrm.bzfree = NULL;
|
1296
|
+
zstrm.opaque = NULL;
|
1297
|
+
zstrm.next_in = NULL;
|
1298
|
+
zstrm.avail_in = 0;
|
1299
|
+
if ((ret = BZ2_bzDecompressInit(&zstrm, 0, 0)) != BZ_OK) zraise(ret);
|
1300
|
+
|
1301
|
+
do {
|
1302
|
+
read_len = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
|
1303
|
+
frt_is_read_bytes(is, (frt_uchar *)buf_in, read_len);
|
1304
|
+
compressed_len -= read_len;
|
1305
|
+
zstrm.avail_in = read_len;
|
1306
|
+
zstrm.next_in = buf_in;
|
1307
|
+
zstrm.avail_out = FRT_COMPRESSION_BUFFER_SIZE;
|
1308
|
+
|
1309
|
+
do {
|
1310
|
+
REALLOC_N(buf_out, char, buf_out_idx + FRT_COMPRESSION_BUFFER_SIZE);
|
1311
|
+
zstrm.next_out = buf_out + buf_out_idx;
|
1312
|
+
ret = BZ2_bzDecompress(&zstrm);
|
1313
|
+
assert(ret != BZ_SEQUENCE_ERROR); /* state not clobbered */
|
1314
|
+
if (ret != BZ_OK && ret != BZ_STREAM_END) {
|
1315
|
+
(void)BZ2_bzDecompressEnd(&zstrm);
|
1316
|
+
zraise(ret);
|
1317
|
+
}
|
1318
|
+
buf_out_idx += FRT_COMPRESSION_BUFFER_SIZE - zstrm.avail_out;
|
1319
|
+
} while (zstrm.avail_out == 0);
|
1320
|
+
} while (ret != BZ_STREAM_END && compressed_len != 0);
|
1321
|
+
|
1322
|
+
(void)BZ2_bzDecompressEnd(&zstrm);
|
1323
|
+
|
1324
|
+
FRT_REALLOC_N(buf_out, char, buf_out_idx + 1);
|
1325
|
+
buf_out[buf_out_idx] = '\0';
|
1326
|
+
|
1327
|
+
*len = buf_out_idx;
|
1328
|
+
return (char *)buf_out;
|
1329
|
+
}
|
1330
|
+
|
1331
|
+
/*
 * Read one LZ4 frame from `is` and return its decompressed content.
 *
 * is             - input stream positioned at the first byte of the frame
 * compressed_len - number of compressed bytes that may be read from `is`
 * length         - out: decompressed length in bytes, or -1 on failure
 *
 * Returns a heap buffer holding the decompressed bytes followed by a '\0'
 * terminator, or NULL on failure. On failure the decompression context and
 * any partially filled output buffer are released before returning.
 */
static char *is_read_lz4_compressed_bytes(FrtInStream *is, int compressed_len, int *length) {
    frt_uchar buf_in[FRT_COMPRESSION_BUFFER_SIZE];
    char *buf_out = NULL;
    int dc_length = 0;
    int read_length;
    size_t consumed_size;
    size_t buf_out_length;
    size_t res;
    const frt_uchar *src;
    const frt_uchar *src_end;
    LZ4F_dctx *dctx = NULL;
    LZ4F_frameInfo_t frame_info;

    if (compressed_len <= 0) {
        goto failure;
    }
    if (LZ4F_isError(LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION))) {
        dctx = NULL; /* nothing to free */
        goto failure;
    }

    /* header and buffer */
    read_length = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
    frt_is_read_bytes(is, buf_in, read_length);
    compressed_len -= read_length;

    /* in: bytes available in buf_in, out: bytes used by the frame header */
    consumed_size = (size_t)read_length;
    res = LZ4F_getFrameInfo(dctx, &frame_info, buf_in, &consumed_size);
    if (LZ4F_isError(res)) {
        goto failure;
    }

    /* the output window grows by one maximum block size per decompress call */
    switch (frame_info.blockSizeID) {
        case LZ4F_max256KB:
            buf_out_length = (size_t)1 << 18;
            break;
        case LZ4F_max1MB:
            buf_out_length = (size_t)1 << 20;
            break;
        case LZ4F_max4MB:
            buf_out_length = (size_t)1 << 22;
            break;
        case LZ4F_default:
        case LZ4F_max64KB:
        default:
            /* an unknown ID must not yield a zero-sized output window */
            buf_out_length = (size_t)1 << 16;
            break;
    }

    /* decompress data; the first pass continues right after the header */
    src = buf_in + consumed_size;
    src_end = buf_in + read_length;
    res = 1;
    for (;;) {
        while (src < src_end && res != 0) {
            size_t dest_length = buf_out_length;
            /* only the bytes between src and src_end are valid input */
            size_t src_length = (size_t)(src_end - src);
            FRT_REALLOC_N(buf_out, char, dc_length + buf_out_length);
            res = LZ4F_decompress(dctx, buf_out + dc_length, &dest_length, src, &src_length, NULL);
            if (LZ4F_isError(res)) {
                goto failure;
            }
            if (src_length == 0 && dest_length == 0) {
                goto failure; /* no progress: refuse to spin forever */
            }
            dc_length += (int)dest_length;
            src += src_length;
        }
        if (res == 0) {
            break; /* frame completely decoded */
        }
        if (compressed_len <= 0) {
            goto failure; /* input exhausted before the frame ended */
        }
        read_length = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
        frt_is_read_bytes(is, buf_in, read_length);
        compressed_len -= read_length;
        src = buf_in;
        src_end = buf_in + read_length;
    }

    /* finish up */
    LZ4F_freeDecompressionContext(dctx);

    FRT_REALLOC_N(buf_out, char, dc_length + 1);
    buf_out[dc_length] = '\0';

    *length = dc_length;
    return buf_out;

failure:
    if (dctx != NULL) {
        LZ4F_freeDecompressionContext(dctx);
    }
    free(buf_out);
    *length = -1;
    return NULL;
}
|
1403
|
+
|
1404
|
+
/*
 * Dispatch to the decompressing reader that matches `compression`.
 *
 * The stream, compressed length and output-length pointer are passed through
 * unchanged to the Brotli, BZ2 or LZ4 reader. Any other compression value
 * yields NULL and leaves *len untouched.
 */
static char *is_read_compressed_bytes(FrtInStream *is, int compressed_len, int *len, FrtCompressionType compression) {
    if (compression == FRT_COMPRESSION_BROTLI) {
        return is_read_brotli_compressed_bytes(is, compressed_len, len);
    }
    if (compression == FRT_COMPRESSION_BZ2) {
        return is_read_bz2_compressed_bytes(is, compressed_len, len);
    }
    if (compression == FRT_COMPRESSION_LZ4) {
        return is_read_lz4_compressed_bytes(is, compressed_len, len);
    }
    return NULL;
}
|
1416
|
+
|
1417
|
+
char *frt_lazy_df_get_data(FrtLazyDocField *self, int i) {
|
1225
1418
|
char *text = NULL;
|
1226
1419
|
if (i < self->size && i >= 0) {
|
1227
1420
|
text = self->data[i].text;
|
1228
1421
|
if (NULL == text) {
|
1229
1422
|
const int read_len = self->data[i].length + 1;
|
1230
1423
|
frt_is_seek(self->doc->fields_in, self->data[i].start);
|
1231
|
-
if (self->
|
1232
|
-
self->data[i].text = text = is_read_compressed_bytes(self->doc->fields_in, read_len, &(self->data[i].length));
|
1424
|
+
if (self->data[i].compression != FRT_COMPRESSION_NONE) {
|
1425
|
+
self->data[i].text = text = is_read_compressed_bytes(self->doc->fields_in, read_len, &(self->data[i].length), self->data[i].compression);
|
1233
1426
|
} else {
|
1234
1427
|
self->data[i].text = text = FRT_ALLOC_N(char, read_len);
|
1235
1428
|
frt_is_read_bytes(self->doc->fields_in, (frt_uchar *)text, read_len);
|
@@ -1241,9 +1434,8 @@ char *frt_lazy_df_get_data(FrtLazyDocField *self, int i)
|
|
1241
1434
|
return text;
|
1242
1435
|
}
|
1243
1436
|
|
1244
|
-
void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len)
|
1245
|
-
{
|
1246
|
-
if (self->is_compressed == 1) {
|
1437
|
+
void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len) {
|
1438
|
+
if (self->compression != FRT_COMPRESSION_NONE && !self->decompressed) {
|
1247
1439
|
int i;
|
1248
1440
|
self->len = 0;
|
1249
1441
|
for (i = self->size-1; i >= 0; i--) {
|
@@ -1251,7 +1443,7 @@ void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len)
|
|
1251
1443
|
self->len += self->data[i].length + 1;
|
1252
1444
|
}
|
1253
1445
|
self->len--; /* each field separated by ' ' but no need to add to end */
|
1254
|
-
self->
|
1446
|
+
self->decompressed = true;
|
1255
1447
|
}
|
1256
1448
|
if (start < 0 || start >= self->len) {
|
1257
1449
|
FRT_RAISE(FRT_IO_ERROR, "start out of range in LazyDocField#get_bytes. %d "
|
@@ -1264,7 +1456,7 @@ void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len)
|
|
1264
1456
|
FRT_RAISE(FRT_IO_ERROR, "Tried to read past end of field. Field is only %d "
|
1265
1457
|
"bytes long but tried to read to %d", self->len, start + len);
|
1266
1458
|
}
|
1267
|
-
if (self->
|
1459
|
+
if (self->compression != FRT_COMPRESSION_NONE) {
|
1268
1460
|
int cur_start = 0, buf_start = 0, cur_end, i, copy_start, copy_len;
|
1269
1461
|
for (i = 0; i < self->size; i++) {
|
1270
1462
|
cur_end = cur_start + self->data[i].length;
|
@@ -1328,21 +1520,17 @@ static void lazy_doc_add_field(FrtLazyDoc *self, FrtLazyDocField *lazy_df, int i
|
|
1328
1520
|
lazy_df->doc = self;
|
1329
1521
|
}
|
1330
1522
|
|
1331
|
-
FrtLazyDocField *frt_lazy_doc_get(FrtLazyDoc *self,
|
1332
|
-
{
|
1523
|
+
/*
 * Look up the lazy field registered under `field` in the document's
 * field dictionary and return whatever frt_h_get() yields for that key.
 */
FrtLazyDocField *frt_lazy_doc_get(FrtLazyDoc *self, ID field) {
    void *entry = frt_h_get(self->field_dictionary, (void *)field);
    return (FrtLazyDocField *)entry;
}
|
1335
1526
|
|
1336
1527
|
/****************************************************************************
|
1337
|
-
*
|
1338
1528
|
* FrtFieldsReader
|
1339
|
-
*
|
1340
1529
|
****************************************************************************/
|
1341
1530
|
|
1342
1531
|
#define FIELDS_IDX_PTR_SIZE 12
|
1343
1532
|
|
1344
|
-
FrtFieldsReader *frt_fr_open(FrtStore *store, const char *segment, FrtFieldInfos *fis)
|
1345
|
-
{
|
1533
|
+
FrtFieldsReader *frt_fr_open(FrtStore *store, const char *segment, FrtFieldInfos *fis) {
|
1346
1534
|
FrtFieldsReader *fr = FRT_ALLOC(FrtFieldsReader);
|
1347
1535
|
FrtInStream *fdx_in;
|
1348
1536
|
char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
|
@@ -1362,8 +1550,7 @@ FrtFieldsReader *frt_fr_open(FrtStore *store, const char *segment, FrtFieldInfos
|
|
1362
1550
|
return fr;
|
1363
1551
|
}
|
1364
1552
|
|
1365
|
-
FrtFieldsReader *frt_fr_clone(FrtFieldsReader *orig)
|
1366
|
-
{
|
1553
|
+
FrtFieldsReader *frt_fr_clone(FrtFieldsReader *orig) {
|
1367
1554
|
FrtFieldsReader *fr = FRT_ALLOC(FrtFieldsReader);
|
1368
1555
|
|
1369
1556
|
memcpy(fr, orig, sizeof(FrtFieldsReader));
|
@@ -1373,35 +1560,33 @@ FrtFieldsReader *frt_fr_clone(FrtFieldsReader *orig)
|
|
1373
1560
|
return fr;
|
1374
1561
|
}
|
1375
1562
|
|
1376
|
-
void frt_fr_close(FrtFieldsReader *fr)
|
1377
|
-
{
|
1563
|
+
/*
 * Close both input streams held by the reader (field data first, then the
 * field index) and release the reader structure itself.
 */
void frt_fr_close(FrtFieldsReader *fr) {
    FrtInStream *data_in = fr->fdt_in;
    FrtInStream *index_in = fr->fdx_in;

    frt_is_close(data_in);
    frt_is_close(index_in);
    free(fr);
}
|
1382
1568
|
|
1383
|
-
static FrtDocField *frt_fr_df_new(
|
1384
|
-
{
|
1569
|
+
/*
 * Allocate a document field with room for `size` values.
 *
 * The data, lengths and encodings arrays are allocated with `size` slots
 * each but are not filled in here. The field is marked as owning its data,
 * gets a boost of 1.0 and records the given compression type.
 */
static FrtDocField *frt_fr_df_new(ID name, int size, FrtCompressionType compression) {
    FrtDocField *field = FRT_ALLOC(FrtDocField);

    field->name = name;
    field->size = size;
    field->capa = field->size;

    /* one slot per stored value in each parallel array */
    field->data = FRT_ALLOC_N(char *, field->capa);
    field->lengths = FRT_ALLOC_N(int, field->capa);
    field->encodings = FRT_ALLOC_N(rb_encoding *, field->capa);

    field->destroy_data = true;
    field->boost = 1.0f;
    field->compression = compression;

    return field;
}
|
1395
1581
|
|
1396
|
-
static void frt_fr_read_compressed_fields(FrtFieldsReader *fr, FrtDocField *df)
|
1397
|
-
{
|
1582
|
+
/*
 * Read and decompress every value of `df` from the reader's field data
 * stream.
 *
 * On entry df->lengths[i] holds the stored length minus one; the reader is
 * given that value plus one as the compressed size and overwrites
 * df->lengths[i] through the pointer it receives.
 */
static void frt_fr_read_compressed_fields(FrtFieldsReader *fr, FrtDocField *df, FrtCompressionType compression) {
    FrtInStream *in = fr->fdt_in;
    const int count = df->size;
    int idx = 0;

    while (idx < count) {
        int *len_slot = &(df->lengths[idx]);
        const int compressed_len = *len_slot + 1;
        df->data[idx] = is_read_compressed_bytes(in, compressed_len, len_slot, compression);
        idx++;
    }
}
|
1407
1592
|
|
@@ -1423,18 +1608,20 @@ FrtDocument *frt_fr_get_doc(FrtFieldsReader *fr, int doc_num)
|
|
1423
1608
|
const int field_num = frt_is_read_vint(fdt_in);
|
1424
1609
|
FrtFieldInfo *fi = fr->fis->fields[field_num];
|
1425
1610
|
const int df_size = frt_is_read_vint(fdt_in);
|
1426
|
-
FrtDocField *df = frt_fr_df_new(fi->name, df_size,
|
1611
|
+
FrtDocField *df = frt_fr_df_new(fi->name, df_size, frt_fi_get_compression(fi));
|
1427
1612
|
|
1428
1613
|
for (j = 0; j < df_size; j++) {
|
1429
1614
|
df->lengths[j] = frt_is_read_vint(fdt_in);
|
1615
|
+
df->encodings[j] = rb_enc_from_index(frt_is_read_vint(fdt_in));
|
1616
|
+
df->compression = frt_is_read_vint(fdt_in);
|
1430
1617
|
}
|
1431
1618
|
|
1432
1619
|
frt_doc_add_field(doc, df);
|
1433
1620
|
}
|
1434
1621
|
for (i = 0; i < stored_cnt; i++) {
|
1435
1622
|
FrtDocField *df = doc->fields[i];
|
1436
|
-
if (df->
|
1437
|
-
frt_fr_read_compressed_fields(fr, df);
|
1623
|
+
if (df->compression != FRT_COMPRESSION_NONE) {
|
1624
|
+
frt_fr_read_compressed_fields(fr, df, df->compression);
|
1438
1625
|
} else {
|
1439
1626
|
const int df_size = df->size;
|
1440
1627
|
for (j = 0; j < df_size; j++) {
|
@@ -1458,31 +1645,37 @@ FrtLazyDoc *frt_fr_get_lazy_doc(FrtFieldsReader *fr, int doc_num)
|
|
1458
1645
|
FrtLazyDoc *lazy_doc;
|
1459
1646
|
FrtInStream *fdx_in = fr->fdx_in;
|
1460
1647
|
FrtInStream *fdt_in = fr->fdt_in;
|
1648
|
+
|
1461
1649
|
frt_is_seek(fdx_in, doc_num * FIELDS_IDX_PTR_SIZE);
|
1462
1650
|
pos = (off_t)frt_is_read_u64(fdx_in);
|
1463
1651
|
frt_is_seek(fdt_in, pos);
|
1464
1652
|
stored_cnt = frt_is_read_vint(fdt_in);
|
1653
|
+
|
1465
1654
|
lazy_doc = lazy_doc_new(stored_cnt, fdt_in);
|
1466
1655
|
for (i = 0; i < stored_cnt; i++) {
|
1467
1656
|
FrtFieldInfo *fi = fr->fis->fields[frt_is_read_vint(fdt_in)];
|
1468
|
-
const int
|
1469
|
-
FrtLazyDocField *lazy_df = lazy_df_new(fi->name,
|
1657
|
+
const int df_size = frt_is_read_vint(fdt_in);
|
1658
|
+
FrtLazyDocField *lazy_df = lazy_df_new(fi->name, df_size, frt_fi_get_compression(fi));
|
1470
1659
|
const int field_start = start;
|
1471
1660
|
/* get the starts relative positions this time around */
|
1472
|
-
|
1661
|
+
|
1662
|
+
for (j = 0; j < df_size; j++) {
|
1473
1663
|
lazy_df->data[j].start = start;
|
1474
1664
|
start += 1 + (lazy_df->data[j].length = frt_is_read_vint(fdt_in));
|
1665
|
+
lazy_df->data[j].encoding = rb_enc_from_index(frt_is_read_vint(fdt_in));
|
1666
|
+
lazy_df->data[j].compression = frt_is_read_vint(fdt_in);
|
1475
1667
|
}
|
1668
|
+
|
1476
1669
|
lazy_df->len = start - field_start - 1;
|
1477
1670
|
lazy_doc_add_field(lazy_doc, lazy_df, i);
|
1478
1671
|
}
|
1479
1672
|
/* correct the starts to their correct absolute positions */
|
1673
|
+
const off_t abs_start = frt_is_pos(fdt_in);
|
1480
1674
|
for (i = 0; i < stored_cnt; i++) {
|
1481
1675
|
FrtLazyDocField *lazy_df = lazy_doc->fields[i];
|
1482
|
-
const int
|
1483
|
-
|
1484
|
-
|
1485
|
-
lazy_df->data[j].start += start;
|
1676
|
+
const int df_size = lazy_df->size;
|
1677
|
+
for (j = 0; j < df_size; j++) {
|
1678
|
+
lazy_df->data[j].start += abs_start;
|
1486
1679
|
}
|
1487
1680
|
}
|
1488
1681
|
|
@@ -1660,42 +1853,150 @@ void frt_fw_close(FrtFieldsWriter *fw)
|
|
1660
1853
|
free(fw);
|
1661
1854
|
}
|
1662
1855
|
|
1663
|
-
static int
|
1664
|
-
|
1665
|
-
size_t compressed_len = 0;
|
1856
|
+
/*
 * Compress `length` bytes of `data` with Brotli and write the result to
 * `out_stream` in chunks of at most FRT_COMPRESSION_BUFFER_SIZE bytes.
 *
 * Returns the total output size reported by the encoder, or -1 after calling
 * comp_raise() when the encoder cannot be created or reports a failure.
 */
static int frt_os_write_brotli_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length) {
    frt_uchar chunk[FRT_COMPRESSION_BUFFER_SIZE];
    const frt_uchar *next_in = data;
    frt_uchar *next_out;
    size_t available_in = length;
    size_t available_out;
    size_t total_out = 0;
    BrotliEncoderState *encoder = BrotliEncoderCreateInstance(NULL, NULL, NULL);

    if (!encoder) {
        comp_raise();
        return -1;
    }

    BrotliEncoderSetParameter(encoder, BROTLI_PARAM_QUALITY, FRT_BROTLI_COMPRESSION_LEVEL);

    for (;;) {
        /* hand the encoder a fresh, empty chunk on every round */
        available_out = FRT_COMPRESSION_BUFFER_SIZE;
        next_out = chunk;
        if (!BrotliEncoderCompressStream(encoder, BROTLI_OPERATION_FINISH,
                                         &available_in, &next_in,
                                         &available_out, &next_out, &total_out)) {
            BrotliEncoderDestroyInstance(encoder);
            comp_raise();
            return -1;
        }
        frt_os_write_bytes(out_stream, chunk, FRT_COMPRESSION_BUFFER_SIZE - available_out);
        if (BrotliEncoderIsFinished(encoder)) {
            break;
        }
    }

    BrotliEncoderDestroyInstance(encoder);

    return (int)total_out;
}
|
1693
1885
|
|
1694
|
-
|
1695
|
-
|
1886
|
+
/*
 * Compress `length` bytes of `data` with bzip2 and write the result to
 * `out_stream` in chunks of at most FRT_COMPRESSION_BUFFER_SIZE bytes.
 *
 * Returns the number of compressed bytes written. If the compressor cannot
 * be initialised or reports an error, zraise() is called with the bzip2
 * status code and -1 is returned should zraise() come back.
 */
static int frt_os_write_bz2_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length) {
    int ret, buf_size, compressed_len = 0;
    char out_buffer[FRT_COMPRESSION_BUFFER_SIZE];
    bz_stream zstrm;
    zstrm.bzalloc = NULL;
    zstrm.bzfree = NULL;
    zstrm.opaque = NULL;
    if ((ret = BZ2_bzCompressInit(&zstrm, FRT_BZIP_COMPRESSION_LEVEL, 0, 0)) != BZ_OK) {
        zraise(ret);
        return -1;
    }

    zstrm.avail_in = length;
    zstrm.next_in = (char *)data;

    do {
        /* the output window must be reset every round, otherwise the
         * compressor has no room left once the first chunk is full */
        zstrm.avail_out = FRT_COMPRESSION_BUFFER_SIZE;
        zstrm.next_out = out_buffer;

        ret = BZ2_bzCompress(&zstrm, BZ_FINISH);
        if (ret != BZ_FINISH_OK && ret != BZ_STREAM_END) {
            (void)BZ2_bzCompressEnd(&zstrm);
            zraise(ret);
            return -1;
        }

        buf_size = FRT_COMPRESSION_BUFFER_SIZE - zstrm.avail_out;
        frt_os_write_bytes(out_stream, (frt_uchar *)out_buffer, buf_size);
        compressed_len += buf_size;
    } while (ret != BZ_STREAM_END); /* BZ_FINISH_OK: more output pending */

    (void)BZ2_bzCompressEnd(&zstrm);
    return compressed_len;
}
|
1911
|
+
|
1912
|
+
static const LZ4F_preferences_t lz4_prefs = {
|
1913
|
+
{
|
1914
|
+
LZ4F_default,
|
1915
|
+
LZ4F_blockLinked,
|
1916
|
+
LZ4F_noContentChecksum,
|
1917
|
+
LZ4F_frame,
|
1918
|
+
0, /* unknown content size */
|
1919
|
+
0, /* no dictID */
|
1920
|
+
LZ4F_noBlockChecksum
|
1921
|
+
},
|
1922
|
+
0,
|
1923
|
+
1,
|
1924
|
+
1,
|
1925
|
+
{0,0,0}
|
1926
|
+
};
|
1927
|
+
|
1928
|
+
/*
 * Compress `length` bytes of `data` into one LZ4 frame and write it to
 * `out_stream`.
 *
 * The input is fed to the compressor in slices of at most
 * FRT_COMPRESSION_BUFFER_SIZE bytes; the scratch buffer is sized with
 * LZ4F_compressBound() for one such slice.
 *
 * Returns the number of compressed bytes written (header, blocks and frame
 * end), or -1 if any LZ4 call fails. Bytes already written to `out_stream`
 * before a failure are not rolled back.
 */
static int frt_os_write_lz4_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length) {
    int compressed_length = 0;
    int remaining_length = length;
    size_t ccmp_length = 0;
    /* NULL until creation succeeds so the cleanup path never frees garbage */
    LZ4F_compressionContext_t ctx = NULL;
    size_t out_buf_length = LZ4F_compressBound(FRT_COMPRESSION_BUFFER_SIZE, &lz4_prefs);
    frt_uchar *out_buf = frt_ecalloc(out_buf_length);

    if (LZ4F_isError(LZ4F_createCompressionContext(&ctx, LZ4F_VERSION))) {
        ctx = NULL;
        compressed_length = -1;
        goto finish;
    }

    /* create header */
    ccmp_length = LZ4F_compressBegin(ctx, out_buf, out_buf_length, &lz4_prefs);
    if (LZ4F_isError(ccmp_length)) {
        compressed_length = -1;
        goto finish;
    }
    compressed_length = (int)ccmp_length;
    frt_os_write_bytes(out_stream, out_buf, ccmp_length);

    /* compress data; a non-positive length simply yields an empty frame */
    while (remaining_length > 0) {
        int read_length = (FRT_COMPRESSION_BUFFER_SIZE > remaining_length) ? remaining_length : FRT_COMPRESSION_BUFFER_SIZE;
        ccmp_length = LZ4F_compressUpdate(ctx, out_buf, out_buf_length, data + (length - remaining_length), read_length, NULL);
        if (LZ4F_isError(ccmp_length)) {
            compressed_length = -1;
            goto finish;
        }
        frt_os_write_bytes(out_stream, out_buf, ccmp_length);
        compressed_length += (int)ccmp_length;
        remaining_length -= read_length;
    }

    /* finish up */
    ccmp_length = LZ4F_compressEnd(ctx, out_buf, out_buf_length, NULL);
    if (LZ4F_isError(ccmp_length)) {
        compressed_length = -1;
        goto finish;
    }

    frt_os_write_bytes(out_stream, out_buf, ccmp_length);
    compressed_length += (int)ccmp_length;

finish:
    if (ctx != NULL) {
        LZ4F_freeCompressionContext(ctx);
    }
    free(out_buf);

    return compressed_length;
}
|
1980
|
+
|
1981
|
+
/*
 * Dispatch to the compressing writer that matches `compression`.
 *
 * Returns whatever the selected Brotli, BZ2 or LZ4 writer returns, or -1
 * for any other compression value (nothing is written in that case).
 */
static int frt_os_write_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length, FrtCompressionType compression) {
    if (compression == FRT_COMPRESSION_BROTLI) {
        return frt_os_write_brotli_compressed_bytes(out_stream, data, length);
    }
    if (compression == FRT_COMPRESSION_BZ2) {
        return frt_os_write_bz2_compressed_bytes(out_stream, data, length);
    }
    if (compression == FRT_COMPRESSION_LZ4) {
        return frt_os_write_lz4_compressed_bytes(out_stream, data, length);
    }
    return -1;
}
|
1994
|
+
|
1995
|
+
void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc) {
|
1696
1996
|
int i, j, stored_cnt = 0;
|
1697
1997
|
FrtDocField *df;
|
1698
1998
|
FrtFieldInfo *fi;
|
1999
|
+
FrtCompressionType compression;
|
1699
2000
|
FrtOutStream *fdt_out = fw->fdt_out, *fdx_out = fw->fdx_out;
|
1700
2001
|
const int doc_size = doc->size;
|
1701
2002
|
|
@@ -1719,16 +2020,22 @@ void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc)
|
|
1719
2020
|
const int df_size = df->size;
|
1720
2021
|
frt_os_write_vint(fdt_out, fi->number);
|
1721
2022
|
frt_os_write_vint(fdt_out, df_size);
|
2023
|
+
|
1722
2024
|
if (fi_is_compressed(fi)) {
|
2025
|
+
compression = frt_fi_get_compression(fi);
|
1723
2026
|
for (j = 0; j < df_size; j++) {
|
1724
2027
|
const int length = df->lengths[j];
|
1725
|
-
int compressed_len = frt_os_write_compressed_bytes(fw->buffer, (frt_uchar*)df->data[j], length);
|
2028
|
+
int compressed_len = frt_os_write_compressed_bytes(fw->buffer, (frt_uchar*)df->data[j], length, compression);
|
1726
2029
|
frt_os_write_vint(fdt_out, compressed_len - 1);
|
2030
|
+
frt_os_write_vint(fdt_out, rb_enc_to_index(df->encodings[j]));
|
2031
|
+
frt_os_write_vint(fdt_out, compression);
|
1727
2032
|
}
|
1728
2033
|
} else {
|
1729
2034
|
for (j = 0; j < df_size; j++) {
|
1730
2035
|
const int length = df->lengths[j];
|
1731
2036
|
frt_os_write_vint(fdt_out, length);
|
2037
|
+
frt_os_write_vint(fdt_out, rb_enc_to_index(df->encodings[j]));
|
2038
|
+
frt_os_write_vint(fdt_out, FRT_COMPRESSION_NONE);
|
1732
2039
|
frt_os_write_bytes(fw->buffer, (frt_uchar*)df->data[j], length);
|
1733
2040
|
/* leave a space between fields as that is how they are analyzed */
|
1734
2041
|
frt_os_write_byte(fw->buffer, ' ');
|
@@ -2087,8 +2394,7 @@ static char *ste_scan_to(FrtTermEnum *te, const char *term)
|
|
2087
2394
|
}
|
2088
2395
|
}
|
2089
2396
|
|
2090
|
-
static FrtSegmentTermEnum *ste_allocate()
|
2091
|
-
{
|
2397
|
+
static FrtSegmentTermEnum *ste_allocate(void) {
|
2092
2398
|
FrtSegmentTermEnum *ste = FRT_ALLOC_AND_ZERO(FrtSegmentTermEnum);
|
2093
2399
|
|
2094
2400
|
TE(ste)->next = &ste_next;
|
@@ -2113,7 +2419,6 @@ void frt_ste_close(FrtTermEnum *te)
|
|
2113
2419
|
free(te);
|
2114
2420
|
}
|
2115
2421
|
|
2116
|
-
|
2117
2422
|
static char *frt_ste_get_term(FrtTermEnum *te, int pos)
|
2118
2423
|
{
|
2119
2424
|
FrtSegmentTermEnum *ste = STE(te);
|
@@ -2228,9 +2533,7 @@ static void tew_destroy(TermEnumWrapper *tew)
|
|
2228
2533
|
tew->te->close(tew->te);
|
2229
2534
|
}
|
2230
2535
|
|
2231
|
-
static TermEnumWrapper *tew_setup(TermEnumWrapper *tew, int index, FrtTermEnum *te,
|
2232
|
-
FrtIndexReader *ir)
|
2233
|
-
{
|
2536
|
+
static TermEnumWrapper *tew_setup(TermEnumWrapper *tew, int index, FrtTermEnum *te, FrtIndexReader *ir) {
|
2234
2537
|
tew->index = index;
|
2235
2538
|
tew->ir = ir;
|
2236
2539
|
tew->te = te;
|
@@ -2239,9 +2542,7 @@ static TermEnumWrapper *tew_setup(TermEnumWrapper *tew, int index, FrtTermEnum *
|
|
2239
2542
|
return tew;
|
2240
2543
|
}
|
2241
2544
|
|
2242
|
-
|
2243
|
-
static char *mte_next(FrtTermEnum *te)
|
2244
|
-
{
|
2545
|
+
static char *mte_next(FrtTermEnum *te) {
|
2245
2546
|
TermEnumWrapper *top =
|
2246
2547
|
(TermEnumWrapper *)frt_pq_top(MTE(te)->tew_queue);
|
2247
2548
|
|
@@ -2271,8 +2572,7 @@ static char *mte_next(FrtTermEnum *te)
|
|
2271
2572
|
return te->curr_term;
|
2272
2573
|
}
|
2273
2574
|
|
2274
|
-
static FrtTermEnum *mte_set_field(FrtTermEnum *te, int field_num)
|
2275
|
-
{
|
2575
|
+
static FrtTermEnum *mte_set_field(FrtTermEnum *te, int field_num) {
|
2276
2576
|
MultiTermEnum *mte = MTE(te);
|
2277
2577
|
int i;
|
2278
2578
|
const int size = mte->size;
|
@@ -2300,8 +2600,7 @@ static FrtTermEnum *mte_set_field(FrtTermEnum *te, int field_num)
|
|
2300
2600
|
return te;
|
2301
2601
|
}
|
2302
2602
|
|
2303
|
-
static char *mte_skip_to(FrtTermEnum *te, const char *term)
|
2304
|
-
{
|
2603
|
+
static char *mte_skip_to(FrtTermEnum *te, const char *term) {
|
2305
2604
|
MultiTermEnum *mte = MTE(te);
|
2306
2605
|
int i;
|
2307
2606
|
const int size = mte->size;
|
@@ -2317,8 +2616,7 @@ static char *mte_skip_to(FrtTermEnum *te, const char *term)
|
|
2317
2616
|
return mte_next(te);
|
2318
2617
|
}
|
2319
2618
|
|
2320
|
-
static void mte_close(FrtTermEnum *te)
|
2321
|
-
{
|
2619
|
+
static void mte_close(FrtTermEnum *te) {
|
2322
2620
|
int i;
|
2323
2621
|
const int size = MTE(te)->size;
|
2324
2622
|
for (i = 0; i < size; i++) {
|
@@ -2331,10 +2629,9 @@ static void mte_close(FrtTermEnum *te)
|
|
2331
2629
|
free(te);
|
2332
2630
|
}
|
2333
2631
|
|
2334
|
-
FrtTermEnum *frt_mte_new(FrtMultiReader *mr, int field_num, const char *term)
|
2335
|
-
|
2336
|
-
|
2337
|
-
int r_cnt = mr->r_cnt;
|
2632
|
+
FrtTermEnum *frt_mte_new(FrtMultiReader *mr, int field_num, const char *term) {
|
2633
|
+
FrtIndexReader **readers = mr->sub_readers;
|
2634
|
+
int r_cnt = mr->r_cnt;
|
2338
2635
|
int i;
|
2339
2636
|
FrtIndexReader *reader;
|
2340
2637
|
MultiTermEnum *mte = FRT_ALLOC_AND_ZERO(MultiTermEnum);
|
@@ -2362,8 +2659,7 @@ FrtTermEnum *frt_mte_new(FrtMultiReader *mr, int field_num, const char *term)
|
|
2362
2659
|
|
2363
2660
|
if (NULL != term) {
|
2364
2661
|
sub_te = reader->terms_from(reader, fnum, term);
|
2365
|
-
}
|
2366
|
-
else {
|
2662
|
+
} else {
|
2367
2663
|
sub_te = reader->terms(reader, fnum);
|
2368
2664
|
}
|
2369
2665
|
|
@@ -2372,8 +2668,7 @@ FrtTermEnum *frt_mte_new(FrtMultiReader *mr, int field_num, const char *term)
|
|
2372
2668
|
|| (tew->term && (tew->term[0] != '\0'))) {
|
2373
2669
|
frt_pq_push(mte->tew_queue, tew); /* initialize queue */
|
2374
2670
|
}
|
2375
|
-
}
|
2376
|
-
else {
|
2671
|
+
} else {
|
2377
2672
|
/* add the term_enum_wrapper just in case */
|
2378
2673
|
sub_te = reader->terms(reader, 0);
|
2379
2674
|
sub_te->field_num = -1;
|
@@ -2395,9 +2690,7 @@ FrtTermEnum *frt_mte_new(FrtMultiReader *mr, int field_num, const char *term)
|
|
2395
2690
|
*
|
2396
2691
|
****************************************************************************/
|
2397
2692
|
|
2398
|
-
FrtTermInfosReader *frt_tir_open(FrtStore *store,
|
2399
|
-
FrtSegmentFieldIndex *sfi, const char *segment)
|
2400
|
-
{
|
2693
|
+
FrtTermInfosReader *frt_tir_open(FrtStore *store, FrtSegmentFieldIndex *sfi, const char *segment) {
|
2401
2694
|
FrtTermInfosReader *tir = FRT_ALLOC(FrtTermInfosReader);
|
2402
2695
|
char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
|
2403
2696
|
|
@@ -2410,8 +2703,7 @@ FrtTermInfosReader *frt_tir_open(FrtStore *store,
|
|
2410
2703
|
return tir;
|
2411
2704
|
}
|
2412
2705
|
|
2413
|
-
static FrtTermEnum *tir_enum(FrtTermInfosReader *tir)
|
2414
|
-
{
|
2706
|
+
static FrtTermEnum *tir_enum(FrtTermInfosReader *tir) {
|
2415
2707
|
FrtTermEnum *te;
|
2416
2708
|
if (NULL == (te = (FrtTermEnum *)frt_thread_getspecific(tir->thread_te))) {
|
2417
2709
|
te = frt_ste_clone(tir->orig_te);
|
@@ -2422,8 +2714,7 @@ static FrtTermEnum *tir_enum(FrtTermInfosReader *tir)
|
|
2422
2714
|
return te;
|
2423
2715
|
}
|
2424
2716
|
|
2425
|
-
FrtTermInfosReader *frt_tir_set_field(FrtTermInfosReader *tir, int field_num)
|
2426
|
-
{
|
2717
|
+
FrtTermInfosReader *frt_tir_set_field(FrtTermInfosReader *tir, int field_num) {
|
2427
2718
|
if (field_num != tir->field_num) {
|
2428
2719
|
ste_set_field(tir_enum(tir), field_num);
|
2429
2720
|
tir->field_num = field_num;
|
@@ -2431,8 +2722,7 @@ FrtTermInfosReader *frt_tir_set_field(FrtTermInfosReader *tir, int field_num)
|
|
2431
2722
|
return tir;
|
2432
2723
|
}
|
2433
2724
|
|
2434
|
-
FrtTermInfo *frt_tir_get_ti(FrtTermInfosReader *tir, const char *term)
|
2435
|
-
{
|
2725
|
+
FrtTermInfo *frt_tir_get_ti(FrtTermInfosReader *tir, const char *term) {
|
2436
2726
|
FrtTermEnum *te = tir_enum(tir);
|
2437
2727
|
char *match;
|
2438
2728
|
|
@@ -2443,9 +2733,7 @@ FrtTermInfo *frt_tir_get_ti(FrtTermInfosReader *tir, const char *term)
|
|
2443
2733
|
return NULL;
|
2444
2734
|
}
|
2445
2735
|
|
2446
|
-
static FrtTermInfo *tir_get_ti_field(FrtTermInfosReader *tir, int field_num,
|
2447
|
-
const char *term)
|
2448
|
-
{
|
2736
|
+
static FrtTermInfo *tir_get_ti_field(FrtTermInfosReader *tir, int field_num, const char *term) {
|
2449
2737
|
FrtTermEnum *te = tir_enum(tir);
|
2450
2738
|
char *match;
|
2451
2739
|
|
@@ -2461,19 +2749,16 @@ static FrtTermInfo *tir_get_ti_field(FrtTermInfosReader *tir, int field_num,
|
|
2461
2749
|
return NULL;
|
2462
2750
|
}
|
2463
2751
|
|
2464
|
-
char *frt_tir_get_term(FrtTermInfosReader *tir, int pos)
|
2465
|
-
{
|
2752
|
+
/*
 * Return the term at position `pos` as reported by frt_ste_get_term() on
 * the enum obtained from tir_enum(), or NULL when `pos` is negative.
 */
char *frt_tir_get_term(FrtTermInfosReader *tir, int pos) {
    return (pos < 0) ? NULL : frt_ste_get_term(tir_enum(tir), pos);
}
|
2473
2759
|
|
2474
2760
|
|
2475
|
-
void frt_tir_close(FrtTermInfosReader *tir)
|
2476
|
-
{
|
2761
|
+
void frt_tir_close(FrtTermInfosReader *tir) {
|
2477
2762
|
frt_ary_destroy(tir->te_bucket, (frt_free_ft)&frt_ste_close);
|
2478
2763
|
frt_ste_close(tir->orig_te);
|
2479
2764
|
|
@@ -2490,25 +2775,19 @@ void frt_tir_close(FrtTermInfosReader *tir)
|
|
2490
2775
|
*
|
2491
2776
|
****************************************************************************/
|
2492
2777
|
|
2493
|
-
static FrtTermWriter *tw_new(FrtStore *store, char *file_name)
|
2494
|
-
{
|
2778
|
+
/*
 * Allocate a zeroed term writer, point its last term at the shared empty
 * string and open `file_name` as its output stream through the store.
 */
static FrtTermWriter *tw_new(FrtStore *store, char *file_name) {
    FrtTermWriter *writer = FRT_ALLOC_AND_ZERO(FrtTermWriter);

    writer->last_term = FRT_EMPTY_STRING;
    writer->os = store->new_output(store, file_name);

    return writer;
}
|
2500
2784
|
|
2501
|
-
static void tw_close(FrtTermWriter *tw)
|
2502
|
-
{
|
2785
|
+
/*
 * Close the writer's output stream, then free the writer itself.
 */
static void tw_close(FrtTermWriter *tw) {
    FrtOutStream *out = tw->os;

    frt_os_close(out);
    free(tw);
}
|
2506
2789
|
|
2507
|
-
FrtTermInfosWriter *frt_tiw_open(FrtStore *store,
|
2508
|
-
const char *segment,
|
2509
|
-
int index_interval,
|
2510
|
-
int skip_interval)
|
2511
|
-
{
|
2790
|
+
FrtTermInfosWriter *frt_tiw_open(FrtStore *store, const char *segment, int index_interval, int skip_interval) {
|
2512
2791
|
char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
|
2513
2792
|
FrtTermInfosWriter *tiw = FRT_ALLOC(FrtTermInfosWriter);
|
2514
2793
|
size_t segment_len = strlen(segment);
|
@@ -2537,11 +2816,7 @@ FrtTermInfosWriter *frt_tiw_open(FrtStore *store,
|
|
2537
2816
|
return tiw;
|
2538
2817
|
}
|
2539
2818
|
|
2540
|
-
static void tw_write_term(FrtTermWriter *tw,
|
2541
|
-
FrtOutStream *os,
|
2542
|
-
const char *term,
|
2543
|
-
int term_len)
|
2544
|
-
{
|
2819
|
+
static void tw_write_term(FrtTermWriter *tw, FrtOutStream *os, const char *term, int term_len) {
|
2545
2820
|
int start = frt_hlp_string_diff(tw->last_term, term);
|
2546
2821
|
int length = term_len - start;
|
2547
2822
|
|
@@ -2552,12 +2827,7 @@ static void tw_write_term(FrtTermWriter *tw,
|
|
2552
2827
|
tw->last_term = term;
|
2553
2828
|
}
|
2554
2829
|
|
2555
|
-
static void tw_add(FrtTermWriter *tw,
|
2556
|
-
const char *term,
|
2557
|
-
int term_len,
|
2558
|
-
FrtTermInfo *ti,
|
2559
|
-
int skip_interval)
|
2560
|
-
{
|
2830
|
+
static void tw_add(FrtTermWriter *tw, const char *term, int term_len, FrtTermInfo *ti, int skip_interval) {
|
2561
2831
|
FrtOutStream *os = tw->os;
|
2562
2832
|
|
2563
2833
|
#ifdef DEBUG
|
@@ -2587,11 +2857,7 @@ static void tw_add(FrtTermWriter *tw,
|
|
2587
2857
|
tw->counter++;
|
2588
2858
|
}
|
2589
2859
|
|
2590
|
-
void frt_tiw_add(FrtTermInfosWriter *tiw,
|
2591
|
-
const char *term,
|
2592
|
-
int term_len,
|
2593
|
-
FrtTermInfo *ti)
|
2594
|
-
{
|
2860
|
+
void frt_tiw_add(FrtTermInfosWriter *tiw, const char *term, int term_len, FrtTermInfo *ti) {
|
2595
2861
|
off_t tis_pos;
|
2596
2862
|
|
2597
2863
|
if (0 == (tiw->tis_writer->counter % tiw->index_interval)) {
|
@@ -2609,15 +2875,13 @@ void frt_tiw_add(FrtTermInfosWriter *tiw,
|
|
2609
2875
|
tw_add(tiw->tis_writer, term, term_len, ti, tiw->skip_interval);
|
2610
2876
|
}
|
2611
2877
|
|
2612
|
-
static void tw_reset(FrtTermWriter *tw)
|
2613
|
-
{
|
2878
|
+
/*
 * Put the writer back into its initial per-field state: zeroed last term
 * info, empty last term and a term counter of zero.
 */
static void tw_reset(FrtTermWriter *tw) {
    FRT_ZEROSET(&(tw->last_term_info), FrtTermInfo);
    tw->last_term = FRT_EMPTY_STRING;
    tw->counter = 0;
}
|
2618
2883
|
|
2619
|
-
void frt_tiw_start_field(FrtTermInfosWriter *tiw, int field_num)
|
2620
|
-
{
|
2884
|
+
void frt_tiw_start_field(FrtTermInfosWriter *tiw, int field_num) {
|
2621
2885
|
FrtOutStream *tfx_out = tiw->tfx_out;
|
2622
2886
|
frt_os_write_vint(tfx_out, tiw->tix_writer->counter); /* write tix size */
|
2623
2887
|
frt_os_write_vint(tfx_out, tiw->tis_writer->counter); /* write tis size */
|
@@ -2630,8 +2894,7 @@ void frt_tiw_start_field(FrtTermInfosWriter *tiw, int field_num)
|
|
2630
2894
|
tiw->field_count++;
|
2631
2895
|
}
|
2632
2896
|
|
2633
|
-
void frt_tiw_close(FrtTermInfosWriter *tiw)
|
2634
|
-
{
|
2897
|
+
void frt_tiw_close(FrtTermInfosWriter *tiw) {
|
2635
2898
|
FrtOutStream *tfx_out = tiw->tfx_out;
|
2636
2899
|
frt_os_write_vint(tfx_out, tiw->tix_writer->counter);
|
2637
2900
|
frt_os_write_vint(tfx_out, tiw->tis_writer->counter);
|
@@ -2665,8 +2928,7 @@ void frt_tiw_close(FrtTermInfosWriter *tiw)
|
|
2665
2928
|
}\
|
2666
2929
|
} while (0)
|
2667
2930
|
|
2668
|
-
static void stde_seek_ti(FrtSegmentTermDocEnum *stde, FrtTermInfo *ti)
|
2669
|
-
{
|
2931
|
+
static void stde_seek_ti(FrtSegmentTermDocEnum *stde, FrtTermInfo *ti) {
|
2670
2932
|
if (NULL == ti) {
|
2671
2933
|
stde->doc_freq = 0;
|
2672
2934
|
} else {
|
@@ -2684,14 +2946,12 @@ static void stde_seek_ti(FrtSegmentTermDocEnum *stde, FrtTermInfo *ti)
|
|
2684
2946
|
}
|
2685
2947
|
}
|
2686
2948
|
|
2687
|
-
static void stde_seek(FrtTermDocEnum *tde, int field_num, const char *term)
|
2688
|
-
{
|
2949
|
+
/*
 * Look up the term info for (`field_num`, `term`) through the enum's term
 * infos reader and position the enum on it via stde_seek_ti().
 */
static void stde_seek(FrtTermDocEnum *tde, int field_num, const char *term) {
    FrtSegmentTermDocEnum *stde = STDE(tde);

    stde_seek_ti(stde, tir_get_ti_field(stde->tir, field_num, term));
}
|
2692
2953
|
|
2693
|
-
static void stde_seek_te(FrtTermDocEnum *tde, FrtTermEnum *te)
|
2694
|
-
{
|
2954
|
+
static void stde_seek_te(FrtTermDocEnum *tde, FrtTermEnum *te) {
|
2695
2955
|
#ifdef DEBUG
|
2696
2956
|
if (te->set_field != &ste_set_field) {
|
2697
2957
|
FRT_RAISE(FRT_ARG_ERROR, "Passed an incorrect TermEnum type");
|
@@ -2700,20 +2960,17 @@ static void stde_seek_te(FrtTermDocEnum *tde, FrtTermEnum *te)
|
|
2700
2960
|
stde_seek_ti(STDE(tde), &(te->curr_ti));
|
2701
2961
|
}
|
2702
2962
|
|
2703
|
-
static int stde_doc_num(FrtTermDocEnum *tde)
|
2704
|
-
{
|
2963
|
+
/*
 * Return the enum's current document number after running the
 * CHECK_STATE("doc_num") check.
 */
static int stde_doc_num(FrtTermDocEnum *tde) {
    CHECK_STATE("doc_num");
    {
        FrtSegmentTermDocEnum *stde = STDE(tde);
        return stde->doc_num;
    }
}
|
2708
2967
|
|
2709
|
-
static int stde_freq(FrtTermDocEnum *tde)
|
2710
|
-
{
|
2968
|
+
/*
 * Return the enum's current frequency value after running the
 * CHECK_STATE("freq") check.
 */
static int stde_freq(FrtTermDocEnum *tde) {
    CHECK_STATE("freq");
    {
        FrtSegmentTermDocEnum *stde = STDE(tde);
        return stde->freq;
    }
}
|
2714
2972
|
|
2715
|
-
static bool stde_next(FrtTermDocEnum *tde)
|
2716
|
-
{
|
2973
|
+
static bool stde_next(FrtTermDocEnum *tde) {
|
2717
2974
|
int doc_code;
|
2718
2975
|
FrtSegmentTermDocEnum *stde = STDE(tde);
|
2719
2976
|
|
@@ -2741,8 +2998,7 @@ static bool stde_next(FrtTermDocEnum *tde)
|
|
2741
2998
|
return true;
|
2742
2999
|
}
|
2743
3000
|
|
2744
|
-
static int stde_read(FrtTermDocEnum *tde, int *docs, int *freqs, int req_num)
|
2745
|
-
{
|
3001
|
+
static int stde_read(FrtTermDocEnum *tde, int *docs, int *freqs, int req_num) {
|
2746
3002
|
FrtSegmentTermDocEnum *stde = STDE(tde);
|
2747
3003
|
int i = 0;
|
2748
3004
|
int doc_code;
|
@@ -2769,8 +3025,7 @@ static int stde_read(FrtTermDocEnum *tde, int *docs, int *freqs, int req_num)
|
|
2769
3025
|
return i;
|
2770
3026
|
}
|
2771
3027
|
|
2772
|
-
static bool stde_skip_to(FrtTermDocEnum *tde, int target_doc_num)
|
2773
|
-
{
|
3028
|
+
static bool stde_skip_to(FrtTermDocEnum *tde, int target_doc_num) {
|
2774
3029
|
FrtSegmentTermDocEnum *stde = STDE(tde);
|
2775
3030
|
|
2776
3031
|
if (stde->doc_freq >= stde->skip_interval
|
@@ -2834,8 +3089,7 @@ static bool stde_skip_to(FrtTermDocEnum *tde, int target_doc_num)
|
|
2834
3089
|
return true;
|
2835
3090
|
}
|
2836
3091
|
|
2837
|
-
static void stde_close(FrtTermDocEnum *tde)
|
2838
|
-
{
|
3092
|
+
static void stde_close(FrtTermDocEnum *tde) {
|
2839
3093
|
frt_is_close(STDE(tde)->frq_in);
|
2840
3094
|
|
2841
3095
|
if (NULL != STDE(tde)->skip_in) {
|
@@ -2845,23 +3099,17 @@ static void stde_close(FrtTermDocEnum *tde)
|
|
2845
3099
|
free(tde);
|
2846
3100
|
}
|
2847
3101
|
|
2848
|
-
static void stde_skip_prox(FrtSegmentTermDocEnum *stde)
|
2849
|
-
{
|
3102
|
+
static void stde_skip_prox(FrtSegmentTermDocEnum *stde) {
|
2850
3103
|
(void)stde;
|
2851
3104
|
}
|
2852
3105
|
|
2853
|
-
static void stde_seek_prox(FrtSegmentTermDocEnum *stde, off_t prx_ptr)
|
2854
|
-
{
|
3106
|
+
static void stde_seek_prox(FrtSegmentTermDocEnum *stde, off_t prx_ptr) {
|
2855
3107
|
(void)stde;
|
2856
3108
|
(void)prx_ptr;
|
2857
3109
|
}
|
2858
3110
|
|
2859
3111
|
|
2860
|
-
FrtTermDocEnum *frt_stde_new(FrtTermInfosReader *tir,
|
2861
|
-
FrtInStream *frq_in,
|
2862
|
-
FrtBitVector *deleted_docs,
|
2863
|
-
int skip_interval)
|
2864
|
-
{
|
3112
|
+
FrtTermDocEnum *frt_stde_new(FrtTermInfosReader *tir, FrtInStream *frq_in, FrtBitVector *deleted_docs, int skip_interval) {
|
2865
3113
|
FrtSegmentTermDocEnum *stde = FRT_ALLOC_AND_ZERO(FrtSegmentTermDocEnum);
|
2866
3114
|
FrtTermDocEnum *tde = (FrtTermDocEnum *)stde;
|
2867
3115
|
|
@@ -2893,27 +3141,23 @@ FrtTermDocEnum *frt_stde_new(FrtTermInfosReader *tir,
|
|
2893
3141
|
* SegmentTermPosEnum
|
2894
3142
|
****************************************************************************/
|
2895
3143
|
|
2896
|
-
static void stpe_seek_ti(FrtSegmentTermDocEnum *stde, FrtTermInfo *ti)
|
2897
|
-
{
|
3144
|
+
static void stpe_seek_ti(FrtSegmentTermDocEnum *stde, FrtTermInfo *ti) {
|
2898
3145
|
if (NULL == ti) {
|
2899
3146
|
stde->doc_freq = 0;
|
2900
|
-
}
|
2901
|
-
else {
|
3147
|
+
} else {
|
2902
3148
|
stde_seek_ti(stde, ti);
|
2903
3149
|
frt_is_seek(stde->prx_in, ti->prx_ptr);
|
2904
3150
|
}
|
2905
3151
|
}
|
2906
3152
|
|
2907
|
-
static void stpe_seek(FrtTermDocEnum *tde, int field_num, const char *term)
|
2908
|
-
{
|
3153
|
+
static void stpe_seek(FrtTermDocEnum *tde, int field_num, const char *term) {
|
2909
3154
|
FrtSegmentTermDocEnum *stde = STDE(tde);
|
2910
3155
|
FrtTermInfo *ti = tir_get_ti_field(stde->tir, field_num, term);
|
2911
3156
|
stpe_seek_ti(stde, ti);
|
2912
3157
|
stde->prx_cnt = 0;
|
2913
3158
|
}
|
2914
3159
|
|
2915
|
-
static bool stpe_next(FrtTermDocEnum *tde)
|
2916
|
-
{
|
3160
|
+
static bool stpe_next(FrtTermDocEnum *tde) {
|
2917
3161
|
FrtSegmentTermDocEnum *stde = STDE(tde);
|
2918
3162
|
frt_is_skip_vints(stde->prx_in, stde->prx_cnt);
|
2919
3163
|
|
@@ -3387,8 +3631,8 @@ FrtTermDocEnum *frt_mtdpe_new(FrtIndexReader *ir, int field_num, char **terms, i
|
|
3387
3631
|
****************************************************************************/
|
3388
3632
|
|
3389
3633
|
static FrtHash *fn_extensions = NULL;
|
3390
|
-
|
3391
|
-
{
|
3634
|
+
|
3635
|
+
static void file_name_filter_init(void) {
|
3392
3636
|
int i;
|
3393
3637
|
fn_extensions = frt_h_new_str((frt_free_ft)NULL, (frt_free_ft)NULL);
|
3394
3638
|
for (i = 0; i < FRT_NELEMS(INDEX_EXTENSIONS); i++) {
|
@@ -3687,9 +3931,8 @@ static void ir_acquire_write_lock(FrtIndexReader *ir)
|
|
3687
3931
|
}
|
3688
3932
|
}
|
3689
3933
|
|
3690
|
-
static FrtIndexReader *ir_setup(FrtIndexReader *ir, FrtStore *store, FrtSegmentInfos *sis,
|
3691
|
-
|
3692
|
-
{
|
3934
|
+
static FrtIndexReader *ir_setup(FrtIndexReader *ir, FrtStore *store, FrtSegmentInfos *sis, FrtFieldInfos *fis, int is_owner) {
|
3935
|
+
ir->type = FRT_INDEX_READER;
|
3693
3936
|
frt_mutex_init(&ir->mutex, NULL);
|
3694
3937
|
frt_mutex_init(&ir->field_index_mutex, NULL);
|
3695
3938
|
|
@@ -3712,8 +3955,7 @@ static FrtIndexReader *ir_setup(FrtIndexReader *ir, FrtStore *store, FrtSegmentI
|
|
3712
3955
|
return ir;
|
3713
3956
|
}
|
3714
3957
|
|
3715
|
-
int frt_ir_doc_freq(FrtIndexReader *ir,
|
3716
|
-
{
|
3958
|
+
int frt_ir_doc_freq(FrtIndexReader *ir, ID field, const char *term) {
|
3717
3959
|
int field_num = frt_fis_get_field_num(ir->fis, field);
|
3718
3960
|
if (field_num >= 0) {
|
3719
3961
|
return ir->doc_freq(ir, field_num, term);
|
@@ -3723,8 +3965,7 @@ int frt_ir_doc_freq(FrtIndexReader *ir, FrtSymbol field, const char *term)
|
|
3723
3965
|
}
|
3724
3966
|
}
|
3725
3967
|
|
3726
|
-
static void ir_set_norm_i(FrtIndexReader *ir, int doc_num, int field_num, frt_uchar val)
|
3727
|
-
{
|
3968
|
+
static void ir_set_norm_i(FrtIndexReader *ir, int doc_num, int field_num, frt_uchar val) {
|
3728
3969
|
frt_mutex_lock(&ir->mutex);
|
3729
3970
|
ir->acquire_write_lock(ir);
|
3730
3971
|
ir->set_norm_i(ir, doc_num, field_num, val);
|
@@ -3732,8 +3973,7 @@ static void ir_set_norm_i(FrtIndexReader *ir, int doc_num, int field_num, frt_uc
|
|
3732
3973
|
frt_mutex_unlock(&ir->mutex);
|
3733
3974
|
}
|
3734
3975
|
|
3735
|
-
void frt_ir_set_norm(FrtIndexReader *ir, int doc_num,
|
3736
|
-
{
|
3976
|
+
void frt_ir_set_norm(FrtIndexReader *ir, int doc_num, ID field, frt_uchar val) {
|
3737
3977
|
int field_num = frt_fis_get_field_num(ir->fis, field);
|
3738
3978
|
if (field_num >= 0) {
|
3739
3979
|
ir_set_norm_i(ir, doc_num, field_num, val);
|
@@ -3755,14 +3995,12 @@ frt_uchar *frt_ir_get_norms_i(FrtIndexReader *ir, int field_num)
|
|
3755
3995
|
return norms;
|
3756
3996
|
}
|
3757
3997
|
|
3758
|
-
frt_uchar *frt_ir_get_norms(FrtIndexReader *ir,
|
3759
|
-
{
|
3998
|
+
frt_uchar *frt_ir_get_norms(FrtIndexReader *ir, ID field) {
|
3760
3999
|
int field_num = frt_fis_get_field_num(ir->fis, field);
|
3761
4000
|
return frt_ir_get_norms_i(ir, field_num);
|
3762
4001
|
}
|
3763
4002
|
|
3764
|
-
frt_uchar *frt_ir_get_norms_into(FrtIndexReader *ir,
|
3765
|
-
{
|
4003
|
+
frt_uchar *frt_ir_get_norms_into(FrtIndexReader *ir, ID field, frt_uchar *buf) {
|
3766
4004
|
int field_num = frt_fis_get_field_num(ir->fis, field);
|
3767
4005
|
if (field_num >= 0) {
|
3768
4006
|
ir->get_norms_into(ir, field_num, buf);
|
@@ -3793,7 +4031,7 @@ void frt_ir_delete_doc(FrtIndexReader *ir, int doc_num)
|
|
3793
4031
|
}
|
3794
4032
|
}
|
3795
4033
|
|
3796
|
-
FrtDocument *frt_ir_get_doc_with_term(FrtIndexReader *ir,
|
4034
|
+
FrtDocument *frt_ir_get_doc_with_term(FrtIndexReader *ir, ID field, const char *term) {
|
3797
4035
|
FrtTermDocEnum *tde = ir_term_docs_for(ir, field, term);
|
3798
4036
|
FrtDocument *doc = NULL;
|
3799
4037
|
|
@@ -3806,8 +4044,7 @@ FrtDocument *frt_ir_get_doc_with_term(FrtIndexReader *ir, FrtSymbol field, const
|
|
3806
4044
|
return doc;
|
3807
4045
|
}
|
3808
4046
|
|
3809
|
-
FrtTermEnum *frt_ir_terms(FrtIndexReader *ir,
|
3810
|
-
{
|
4047
|
+
FrtTermEnum *frt_ir_terms(FrtIndexReader *ir, ID field) {
|
3811
4048
|
FrtTermEnum *te = NULL;
|
3812
4049
|
int field_num = frt_fis_get_field_num(ir->fis, field);
|
3813
4050
|
if (field_num >= 0) {
|
@@ -3816,9 +4053,7 @@ FrtTermEnum *frt_ir_terms(FrtIndexReader *ir, FrtSymbol field)
|
|
3816
4053
|
return te;
|
3817
4054
|
}
|
3818
4055
|
|
3819
|
-
FrtTermEnum *frt_ir_terms_from(FrtIndexReader *ir,
|
3820
|
-
const char *term)
|
3821
|
-
{
|
4056
|
+
FrtTermEnum *frt_ir_terms_from(FrtIndexReader *ir, ID field, const char *term) {
|
3822
4057
|
FrtTermEnum *te = NULL;
|
3823
4058
|
int field_num = frt_fis_get_field_num(ir->fis, field);
|
3824
4059
|
if (field_num >= 0) {
|
@@ -3827,9 +4062,7 @@ FrtTermEnum *frt_ir_terms_from(FrtIndexReader *ir, FrtSymbol field,
|
|
3827
4062
|
return te;
|
3828
4063
|
}
|
3829
4064
|
|
3830
|
-
FrtTermDocEnum *ir_term_docs_for(FrtIndexReader *ir,
|
3831
|
-
const char *term)
|
3832
|
-
{
|
4065
|
+
FrtTermDocEnum *ir_term_docs_for(FrtIndexReader *ir, ID field, const char *term) {
|
3833
4066
|
int field_num = frt_fis_get_field_num(ir->fis, field);
|
3834
4067
|
FrtTermDocEnum *tde = ir->term_docs(ir);
|
3835
4068
|
if (field_num >= 0) {
|
@@ -3838,9 +4071,7 @@ FrtTermDocEnum *ir_term_docs_for(FrtIndexReader *ir, FrtSymbol field,
|
|
3838
4071
|
return tde;
|
3839
4072
|
}
|
3840
4073
|
|
3841
|
-
FrtTermDocEnum *frt_ir_term_positions_for(FrtIndexReader *ir,
|
3842
|
-
const char *term)
|
3843
|
-
{
|
4074
|
+
FrtTermDocEnum *frt_ir_term_positions_for(FrtIndexReader *ir, ID field, const char *term) {
|
3844
4075
|
int field_num = frt_fis_get_field_num(ir->fis, field);
|
3845
4076
|
FrtTermDocEnum *tde = ir->term_positions(ir);
|
3846
4077
|
if (field_num >= 0) {
|
@@ -3854,7 +4085,7 @@ static void ir_commit_i(FrtIndexReader *ir)
|
|
3854
4085
|
if (ir->has_changes) {
|
3855
4086
|
if (NULL == ir->deleter && NULL != ir->store) {
|
3856
4087
|
/* In the MultiReader case, we share this deleter across all
|
3857
|
-
*
|
4088
|
+
* FrtSegmentReaders: */
|
3858
4089
|
ir->set_deleter_i(ir, frt_deleter_new(ir->sis, ir->store));
|
3859
4090
|
}
|
3860
4091
|
if (ir->is_owner) {
|
@@ -3990,34 +4221,14 @@ static void norm_rewrite(Norm *norm, FrtStore *store, FrtDeleter *dlr,
|
|
3990
4221
|
}
|
3991
4222
|
|
3992
4223
|
/****************************************************************************
|
3993
|
-
*
|
4224
|
+
* FrtSegmentReader
|
3994
4225
|
****************************************************************************/
|
3995
4226
|
|
3996
|
-
typedef struct SegmentReader {
|
3997
|
-
FrtIndexReader ir;
|
3998
|
-
FrtSegmentInfo *si;
|
3999
|
-
char *segment;
|
4000
|
-
FrtFieldsReader *fr;
|
4001
|
-
FrtBitVector *deleted_docs;
|
4002
|
-
FrtInStream *frq_in;
|
4003
|
-
FrtInStream *prx_in;
|
4004
|
-
FrtSegmentFieldIndex *sfi;
|
4005
|
-
FrtTermInfosReader *tir;
|
4006
|
-
frt_thread_key_t thread_fr;
|
4007
|
-
void **fr_bucket;
|
4008
|
-
FrtHash *norms;
|
4009
|
-
FrtStore *cfs_store;
|
4010
|
-
bool deleted_docs_dirty : 1;
|
4011
|
-
bool undelete_all : 1;
|
4012
|
-
bool norms_dirty : 1;
|
4013
|
-
} SegmentReader;
|
4014
|
-
|
4015
4227
|
#define IR(ir) ((FrtIndexReader *)(ir))
|
4016
|
-
|
4017
|
-
#define SR(ir) ((SegmentReader *)(ir))
|
4228
|
+
#define SR(ir) ((FrtSegmentReader *)(ir))
|
4018
4229
|
#define SR_SIZE(ir) (SR(ir)->fr->size)
|
4019
4230
|
|
4020
|
-
static FrtFieldsReader *sr_fr(
|
4231
|
+
static FrtFieldsReader *sr_fr(FrtSegmentReader *sr)
|
4021
4232
|
{
|
4022
4233
|
FrtFieldsReader *fr;
|
4023
4234
|
|
@@ -4029,12 +4240,12 @@ static FrtFieldsReader *sr_fr(SegmentReader *sr)
|
|
4029
4240
|
return fr;
|
4030
4241
|
}
|
4031
4242
|
|
4032
|
-
static bool sr_is_deleted_i(
|
4243
|
+
static bool sr_is_deleted_i(FrtSegmentReader *sr, int doc_num)
|
4033
4244
|
{
|
4034
4245
|
return (NULL != sr->deleted_docs && frt_bv_get(sr->deleted_docs, doc_num));
|
4035
4246
|
}
|
4036
4247
|
|
4037
|
-
static void sr_get_norms_into_i(
|
4248
|
+
static void sr_get_norms_into_i(FrtSegmentReader *sr, int field_num,
|
4038
4249
|
frt_uchar *buf)
|
4039
4250
|
{
|
4040
4251
|
Norm *norm = (Norm *)frt_h_get_int(sr->norms, field_num);
|
@@ -4053,7 +4264,7 @@ static void sr_get_norms_into_i(SegmentReader *sr, int field_num,
|
|
4053
4264
|
}
|
4054
4265
|
}
|
4055
4266
|
|
4056
|
-
static frt_uchar *sr_get_norms_i(
|
4267
|
+
static frt_uchar *sr_get_norms_i(FrtSegmentReader *sr, int field_num)
|
4057
4268
|
{
|
4058
4269
|
Norm *norm = (Norm *)frt_h_get_int(sr->norms, field_num);
|
4059
4270
|
if (NULL == norm) { /* not an indexed field */
|
@@ -4189,7 +4400,7 @@ static void sr_commit_i(FrtIndexReader *ir)
|
|
4189
4400
|
|
4190
4401
|
static void sr_close_i(FrtIndexReader *ir)
|
4191
4402
|
{
|
4192
|
-
|
4403
|
+
FrtSegmentReader *sr = SR(ir);
|
4193
4404
|
|
4194
4405
|
if (sr->fr) frt_fr_close(sr->fr);
|
4195
4406
|
if (sr->tir) frt_tir_close(sr->tir);
|
@@ -4298,14 +4509,12 @@ static FrtTermDocEnum *sr_term_docs(FrtIndexReader *ir)
|
|
4298
4509
|
|
4299
4510
|
static FrtTermDocEnum *sr_term_positions(FrtIndexReader *ir)
|
4300
4511
|
{
|
4301
|
-
|
4512
|
+
FrtSegmentReader *sr = SR(ir);
|
4302
4513
|
return frt_stpe_new(sr->tir, sr->frq_in, sr->prx_in, sr->deleted_docs,
|
4303
4514
|
STE(sr->tir->orig_te)->skip_interval);
|
4304
4515
|
}
|
4305
4516
|
|
4306
|
-
static FrtTermVector *sr_term_vector(FrtIndexReader *ir, int doc_num,
|
4307
|
-
FrtSymbol field)
|
4308
|
-
{
|
4517
|
+
static FrtTermVector *sr_term_vector(FrtIndexReader *ir, int doc_num, ID field) {
|
4309
4518
|
FrtFieldInfo *fi = (FrtFieldInfo *)frt_h_get(ir->fis->field_dict, (void *)field);
|
4310
4519
|
FrtFieldsReader *fr;
|
4311
4520
|
|
@@ -4360,7 +4569,7 @@ static void sr_open_norms(FrtIndexReader *ir, FrtStore *cfs_store)
|
|
4360
4569
|
SR(ir)->norms_dirty = false;
|
4361
4570
|
}
|
4362
4571
|
|
4363
|
-
static FrtIndexReader *sr_setup_i(
|
4572
|
+
static FrtIndexReader *sr_setup_i(FrtSegmentReader *sr)
|
4364
4573
|
{
|
4365
4574
|
FrtStore *volatile store = sr->si->store;
|
4366
4575
|
FrtIndexReader *ir = IR(sr);
|
@@ -4391,6 +4600,8 @@ static FrtIndexReader *sr_setup_i(SegmentReader *sr)
|
|
4391
4600
|
ir->commit_i = &sr_commit_i;
|
4392
4601
|
ir->close_i = &sr_close_i;
|
4393
4602
|
|
4603
|
+
ir->type = FRT_SEGMENT_READER;
|
4604
|
+
|
4394
4605
|
sr->cfs_store = NULL;
|
4395
4606
|
|
4396
4607
|
FRT_TRY
|
@@ -4430,10 +4641,13 @@ static FrtIndexReader *sr_setup_i(SegmentReader *sr)
|
|
4430
4641
|
return ir;
|
4431
4642
|
}
|
4432
4643
|
|
4433
|
-
|
4434
|
-
|
4435
|
-
|
4436
|
-
|
4644
|
+
FrtSegmentReader *frt_sr_alloc(void) {
|
4645
|
+
return FRT_ALLOC_AND_ZERO(FrtSegmentReader);
|
4646
|
+
}
|
4647
|
+
|
4648
|
+
static FrtIndexReader *sr_open(FrtSegmentInfos *sis, FrtFieldInfos *fis, int si_num, bool is_owner, FrtSegmentReader *sr) {
|
4649
|
+
if (sr == NULL)
|
4650
|
+
sr = frt_sr_alloc();
|
4437
4651
|
sr->si = sis->segs[si_num];
|
4438
4652
|
ir_setup(IR(sr), sr->si->store, sis, fis, is_owner);
|
4439
4653
|
return sr_setup_i(sr);
|
@@ -4604,9 +4818,7 @@ static FrtTermDocEnum *mr_term_positions(FrtIndexReader *ir)
|
|
4604
4818
|
return mtpe_new(MR(ir));
|
4605
4819
|
}
|
4606
4820
|
|
4607
|
-
static FrtTermVector *mr_term_vector(FrtIndexReader *ir, int doc_num,
|
4608
|
-
FrtSymbol field)
|
4609
|
-
{
|
4821
|
+
static FrtTermVector *mr_term_vector(FrtIndexReader *ir, int doc_num, ID field) {
|
4610
4822
|
GET_READER();
|
4611
4823
|
return reader->term_vector(reader, doc_num - MR(ir)->starts[i], field);
|
4612
4824
|
}
|
@@ -4710,10 +4922,12 @@ static void mr_close_i(FrtIndexReader *ir)
|
|
4710
4922
|
free(MR(ir)->starts);
|
4711
4923
|
}
|
4712
4924
|
|
4713
|
-
|
4714
|
-
|
4925
|
+
FrtMultiReader *frt_mr_alloc(void) {
|
4926
|
+
return FRT_ALLOC_AND_ZERO(FrtMultiReader);
|
4927
|
+
}
|
4928
|
+
|
4929
|
+
FrtMultiReader *frt_mr_init(FrtMultiReader *mr, FrtIndexReader **sub_readers, const int r_cnt) {
|
4715
4930
|
int i;
|
4716
|
-
FrtMultiReader *mr = FRT_ALLOC_AND_ZERO(FrtMultiReader);
|
4717
4931
|
FrtIndexReader *ir = IR(mr);
|
4718
4932
|
|
4719
4933
|
mr->sub_readers = sub_readers;
|
@@ -4760,21 +4974,19 @@ static FrtIndexReader *mr_new(FrtIndexReader **sub_readers, const int r_cnt)
|
|
4760
4974
|
ir->commit_i = &mr_commit_i;
|
4761
4975
|
ir->close_i = &mr_close_i;
|
4762
4976
|
|
4763
|
-
|
4977
|
+
ir->type = FRT_MULTI_READER;
|
4978
|
+
|
4979
|
+
return mr;
|
4764
4980
|
}
|
4765
4981
|
|
4766
|
-
static FrtIndexReader *frt_mr_open_i(FrtStore *store,
|
4767
|
-
|
4768
|
-
|
4769
|
-
|
4770
|
-
const int r_cnt)
|
4771
|
-
{
|
4772
|
-
FrtIndexReader *ir = mr_new(sub_readers, r_cnt);
|
4982
|
+
static FrtIndexReader *frt_mr_open_i(FrtStore *store, FrtSegmentInfos *sis, FrtFieldInfos *fis, FrtIndexReader **sub_readers, const int r_cnt, FrtIndexReader *ir) {
|
4983
|
+
if (ir == NULL)
|
4984
|
+
ir = (FrtIndexReader *)frt_mr_alloc();
|
4985
|
+
ir = (FrtIndexReader *)frt_mr_init((FrtMultiReader *)ir, sub_readers, r_cnt);
|
4773
4986
|
return ir_setup(ir, store, sis, fis, true);
|
4774
4987
|
}
|
4775
4988
|
|
4776
|
-
static void mr_close_ext_i(FrtIndexReader *ir)
|
4777
|
-
{
|
4989
|
+
static void mr_close_ext_i(FrtIndexReader *ir) {
|
4778
4990
|
int **field_num_map = MR(ir)->field_num_map;
|
4779
4991
|
if (field_num_map) {
|
4780
4992
|
int i;
|
@@ -4787,12 +4999,13 @@ static void mr_close_ext_i(FrtIndexReader *ir)
|
|
4787
4999
|
mr_close_i(ir);
|
4788
5000
|
}
|
4789
5001
|
|
4790
|
-
FrtIndexReader *frt_mr_open(FrtIndexReader **sub_readers, const int r_cnt)
|
4791
|
-
|
4792
|
-
|
5002
|
+
FrtIndexReader *frt_mr_open(FrtIndexReader *ir, FrtIndexReader **sub_readers, const int r_cnt) {
|
5003
|
+
if (ir == NULL)
|
5004
|
+
ir = (FrtIndexReader *)frt_mr_alloc();
|
5005
|
+
ir = (FrtIndexReader *)frt_mr_init((FrtMultiReader *)ir, sub_readers, r_cnt);
|
4793
5006
|
FrtMultiReader *mr = MR(ir);
|
4794
5007
|
/* defaults don't matter, this is just for reading fields, not adding */
|
4795
|
-
FrtFieldInfos *fis = frt_fis_new(FRT_STORE_NO, FRT_INDEX_NO, FRT_TERM_VECTOR_NO);
|
5008
|
+
FrtFieldInfos *fis = frt_fis_new(FRT_STORE_NO, FRT_COMPRESSION_NONE, FRT_INDEX_NO, FRT_TERM_VECTOR_NO);
|
4796
5009
|
int i, j;
|
4797
5010
|
bool need_field_map = false;
|
4798
5011
|
|
@@ -4827,12 +5040,10 @@ FrtIndexReader *frt_mr_open(FrtIndexReader **sub_readers, const int r_cnt)
|
|
4827
5040
|
mr->field_num_map[i][j] = fi_sub ? fi_sub->number : -1;
|
4828
5041
|
}
|
4829
5042
|
}
|
4830
|
-
}
|
4831
|
-
else {
|
5043
|
+
} else {
|
4832
5044
|
mr->field_num_map = NULL;
|
4833
5045
|
}
|
4834
5046
|
|
4835
|
-
|
4836
5047
|
ir->close_i = &mr_close_ext_i;
|
4837
5048
|
|
4838
5049
|
return ir_setup(ir, NULL, NULL, fis, false);
|
@@ -4842,21 +5053,19 @@ FrtIndexReader *frt_mr_open(FrtIndexReader **sub_readers, const int r_cnt)
|
|
4842
5053
|
* IndexReader
|
4843
5054
|
****************************************************************************/
|
4844
5055
|
|
4845
|
-
|
4846
|
-
static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf)
|
4847
|
-
{
|
5056
|
+
static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf, FrtIndexReader *ir) {
|
4848
5057
|
volatile bool success = false;
|
4849
|
-
FrtIndexReader *volatile ir = NULL;
|
5058
|
+
// FrtIndexReader *volatile ir = NULL;
|
4850
5059
|
FrtSegmentInfos *volatile sis = NULL;
|
4851
5060
|
FRT_TRY
|
4852
5061
|
do {
|
4853
5062
|
FrtFieldInfos *fis;
|
4854
5063
|
frt_mutex_lock(&store->mutex);
|
4855
|
-
frt_sis_read_i(store, fsf);
|
5064
|
+
frt_sis_read_i(store, fsf, NULL);
|
4856
5065
|
sis = fsf->ret.sis;
|
4857
5066
|
fis = sis->fis;
|
4858
5067
|
if (sis->size == 1) {
|
4859
|
-
ir = sr_open(sis, fis, 0, true);
|
5068
|
+
ir = sr_open(sis, fis, 0, true, (FrtSegmentReader *)ir);
|
4860
5069
|
}
|
4861
5070
|
else {
|
4862
5071
|
volatile int i;
|
@@ -4864,7 +5073,7 @@ static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf)
|
|
4864
5073
|
int num_segments = sis->size;
|
4865
5074
|
for (i = num_segments - 1; i >= 0; i--) {
|
4866
5075
|
FRT_TRY
|
4867
|
-
readers[i] = sr_open(sis, fis, i, false);
|
5076
|
+
readers[i] = sr_open(sis, fis, i, false, NULL);
|
4868
5077
|
FRT_XCATCHALL
|
4869
5078
|
for (i++; i < num_segments; i++) {
|
4870
5079
|
frt_ir_close(readers[i]);
|
@@ -4872,7 +5081,7 @@ static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf)
|
|
4872
5081
|
free(readers);
|
4873
5082
|
FRT_XENDTRY
|
4874
5083
|
}
|
4875
|
-
ir = frt_mr_open_i(store, sis, fis, readers, sis->size);
|
5084
|
+
ir = frt_mr_open_i(store, sis, fis, readers, sis->size, ir);
|
4876
5085
|
}
|
4877
5086
|
fsf->ret.ir = ir;
|
4878
5087
|
success = true;
|
@@ -4881,8 +5090,7 @@ static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf)
|
|
4881
5090
|
if (!success) {
|
4882
5091
|
if (ir) {
|
4883
5092
|
frt_ir_close(ir);
|
4884
|
-
}
|
4885
|
-
else if (sis) {
|
5093
|
+
} else if (sis) {
|
4886
5094
|
frt_sis_destroy(sis);
|
4887
5095
|
}
|
4888
5096
|
}
|
@@ -4894,15 +5102,12 @@ static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf)
|
|
4894
5102
|
* Will keep a reference to the store. To let this method delete the store
|
4895
5103
|
* make sure you deref the store that you pass to it
|
4896
5104
|
*/
|
4897
|
-
FrtIndexReader *frt_ir_open(FrtStore *store)
|
4898
|
-
{
|
5105
|
+
FrtIndexReader *frt_ir_open(FrtIndexReader *ir, FrtStore *store) {
|
4899
5106
|
FindSegmentsFile fsf;
|
4900
|
-
sis_find_segments_file(store, &fsf, &ir_open_i);
|
5107
|
+
sis_find_segments_file(store, &fsf, &ir_open_i, ir);
|
4901
5108
|
return fsf.ret.ir;
|
4902
5109
|
}
|
4903
5110
|
|
4904
|
-
|
4905
|
-
|
4906
5111
|
/****************************************************************************
|
4907
5112
|
*
|
4908
5113
|
* Occurence
|
@@ -5292,10 +5497,7 @@ static void dw_add_offsets(FrtDocWriter *dw, int pos, off_t start, off_t end)
|
|
5292
5497
|
dw->offsets_size = pos + 1;
|
5293
5498
|
}
|
5294
5499
|
|
5295
|
-
FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
|
5296
|
-
FrtFieldInverter *fld_inv,
|
5297
|
-
FrtDocField *df)
|
5298
|
-
{
|
5500
|
+
FrtHash *frt_dw_invert_field(FrtDocWriter *dw, FrtFieldInverter *fld_inv, FrtDocField *df) {
|
5299
5501
|
FrtMemoryPool *mp = dw->mp;
|
5300
5502
|
FrtAnalyzer *a = dw->analyzer;
|
5301
5503
|
FrtHash *curr_plists = dw->curr_plists;
|
@@ -5311,7 +5513,7 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
|
|
5311
5513
|
int pos = -1, num_terms = 0;
|
5312
5514
|
|
5313
5515
|
for (i = 0; i < df_size; i++) {
|
5314
|
-
FrtTokenStream *ts = frt_a_get_ts(a, df->name, df->data[i]);
|
5516
|
+
FrtTokenStream *ts = frt_a_get_ts(a, df->name, df->data[i], df->encodings[i]);
|
5315
5517
|
/* ts->reset(ts, df->data[i]); no longer being called */
|
5316
5518
|
if (store_offsets) {
|
5317
5519
|
while (NULL != (tk = ts->next(ts))) {
|
@@ -5321,21 +5523,16 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
|
|
5321
5523
|
if (pos < 0) {
|
5322
5524
|
pos = 0;
|
5323
5525
|
}
|
5324
|
-
dw_add_posting(mp, curr_plists, fld_plists, doc_num,
|
5325
|
-
|
5326
|
-
dw_add_offsets(dw, pos,
|
5327
|
-
start_offset + tk->start,
|
5328
|
-
start_offset + tk->end);
|
5526
|
+
dw_add_posting(mp, curr_plists, fld_plists, doc_num, tk->text, tk->len, pos);
|
5527
|
+
dw_add_offsets(dw, pos, start_offset + tk->start, start_offset + tk->end);
|
5329
5528
|
if (num_terms++ >= dw->max_field_length) {
|
5330
5529
|
break;
|
5331
5530
|
}
|
5332
5531
|
}
|
5333
|
-
}
|
5334
|
-
else {
|
5532
|
+
} else {
|
5335
5533
|
while (NULL != (tk = ts->next(ts))) {
|
5336
5534
|
pos += tk->pos_inc;
|
5337
|
-
dw_add_posting(mp, curr_plists, fld_plists, doc_num,
|
5338
|
-
tk->text, tk->len, pos);
|
5535
|
+
dw_add_posting(mp, curr_plists, fld_plists, doc_num, tk->text, tk->len, pos);
|
5339
5536
|
if (num_terms++ >= dw->max_field_length) {
|
5340
5537
|
break;
|
5341
5538
|
}
|
@@ -5345,8 +5542,7 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
|
|
5345
5542
|
start_offset += df->lengths[i] + 1;
|
5346
5543
|
}
|
5347
5544
|
fld_inv->length = num_terms;
|
5348
|
-
}
|
5349
|
-
else {
|
5545
|
+
} else {
|
5350
5546
|
char buf[FRT_MAX_WORD_SIZE];
|
5351
5547
|
buf[FRT_MAX_WORD_SIZE - 1] = '\0';
|
5352
5548
|
for (i = 0; i < df_size; i++) {
|
@@ -5356,11 +5552,9 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
|
|
5356
5552
|
len = FRT_MAX_WORD_SIZE - 1;
|
5357
5553
|
data_ptr = (char *)memcpy(buf, df->data[i], len);
|
5358
5554
|
}
|
5359
|
-
dw_add_posting(mp, curr_plists, fld_plists, doc_num, data_ptr,
|
5360
|
-
len, i);
|
5555
|
+
dw_add_posting(mp, curr_plists, fld_plists, doc_num, data_ptr, len, i);
|
5361
5556
|
if (store_offsets) {
|
5362
|
-
dw_add_offsets(dw, i, start_offset,
|
5363
|
-
start_offset + df->lengths[i]);
|
5557
|
+
dw_add_offsets(dw, i, start_offset, start_offset + df->lengths[i]);
|
5364
5558
|
}
|
5365
5559
|
start_offset += df->lengths[i] + 1;
|
5366
5560
|
}
|
@@ -5369,14 +5563,12 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
|
|
5369
5563
|
return curr_plists;
|
5370
5564
|
}
|
5371
5565
|
|
5372
|
-
void frt_dw_reset_postings(FrtHash *postings)
|
5373
|
-
{
|
5566
|
+
void frt_dw_reset_postings(FrtHash *postings) {
|
5374
5567
|
FRT_ZEROSET_N(postings->table, FrtHashEntry, postings->mask + 1);
|
5375
5568
|
postings->fill = postings->size = 0;
|
5376
5569
|
}
|
5377
5570
|
|
5378
|
-
void frt_dw_add_doc(FrtDocWriter *dw, FrtDocument *doc)
|
5379
|
-
{
|
5571
|
+
void frt_dw_add_doc(FrtDocWriter *dw, FrtDocument *doc) {
|
5380
5572
|
int i;
|
5381
5573
|
float boost;
|
5382
5574
|
FrtDocField *df;
|
@@ -5398,16 +5590,12 @@ void frt_dw_add_doc(FrtDocWriter *dw, FrtDocument *doc)
|
|
5398
5590
|
|
5399
5591
|
postings = frt_dw_invert_field(dw, fld_inv, df);
|
5400
5592
|
if (fld_inv->store_term_vector) {
|
5401
|
-
frt_fw_add_postings(dw->fw, fld_inv->fi->number,
|
5402
|
-
dw_sort_postings(postings), postings->size,
|
5403
|
-
dw->offsets, dw->offsets_size);
|
5593
|
+
frt_fw_add_postings(dw->fw, fld_inv->fi->number, dw_sort_postings(postings), postings->size, dw->offsets, dw->offsets_size);
|
5404
5594
|
}
|
5405
5595
|
|
5406
5596
|
if (fld_inv->has_norms) {
|
5407
|
-
boost = fld_inv->fi->boost * doc->boost * df->boost *
|
5408
|
-
|
5409
|
-
fld_inv->norms[dw->doc_num] =
|
5410
|
-
frt_sim_encode_norm(dw->similarity, boost);
|
5597
|
+
boost = fld_inv->fi->boost * doc->boost * df->boost * frt_sim_length_norm(dw->similarity, fi->name, fld_inv->length);
|
5598
|
+
fld_inv->norms[dw->doc_num] = frt_sim_encode_norm(dw->similarity, boost);
|
5411
5599
|
}
|
5412
5600
|
frt_dw_reset_postings(postings);
|
5413
5601
|
if (dw->offsets_size > 0) {
|
@@ -5960,15 +6148,12 @@ static void iw_commit_compound_file(FrtIndexWriter *iw, FrtSegmentInfo *si)
|
|
5960
6148
|
iw_create_compound_file(iw->store, iw->fis, si, cfs_name, iw->deleter);
|
5961
6149
|
}
|
5962
6150
|
|
5963
|
-
static void iw_merge_segments(FrtIndexWriter *iw, const int min_seg,
|
5964
|
-
const int max_seg)
|
5965
|
-
{
|
6151
|
+
static void iw_merge_segments(FrtIndexWriter *iw, const int min_seg, const int max_seg) {
|
5966
6152
|
int i;
|
5967
6153
|
FrtSegmentInfos *sis = iw->sis;
|
5968
6154
|
FrtSegmentInfo *si = frt_sis_new_segment(sis, 0, iw->store);
|
5969
6155
|
|
5970
|
-
SegmentMerger *merger = sm_create(iw, si, &sis->segs[min_seg],
|
5971
|
-
max_seg - min_seg);
|
6156
|
+
SegmentMerger *merger = sm_create(iw, si, &sis->segs[min_seg], max_seg - min_seg);
|
5972
6157
|
|
5973
6158
|
/* This is where all the action happens. */
|
5974
6159
|
si->doc_cnt = sm_merge(merger);
|
@@ -6080,8 +6265,7 @@ void frt_iw_commit(FrtIndexWriter *iw)
|
|
6080
6265
|
frt_mutex_unlock(&iw->mutex);
|
6081
6266
|
}
|
6082
6267
|
|
6083
|
-
void frt_iw_delete_term(FrtIndexWriter *iw,
|
6084
|
-
{
|
6268
|
+
void frt_iw_delete_term(FrtIndexWriter *iw, ID field, const char *term) {
|
6085
6269
|
int field_num = frt_fis_get_field_num(iw->fis, field);
|
6086
6270
|
if (field_num >= 0) {
|
6087
6271
|
int i;
|
@@ -6092,7 +6276,7 @@ void frt_iw_delete_term(FrtIndexWriter *iw, FrtSymbol field, const char *term)
|
|
6092
6276
|
const int seg_cnt = sis->size;
|
6093
6277
|
bool did_delete = false;
|
6094
6278
|
for (i = 0; i < seg_cnt; i++) {
|
6095
|
-
FrtIndexReader *ir = sr_open(sis, iw->fis, i, false);
|
6279
|
+
FrtIndexReader *ir = sr_open(sis, iw->fis, i, false, NULL);
|
6096
6280
|
FrtTermDocEnum *tde = ir->term_docs(ir);
|
6097
6281
|
ir->deleter = iw->deleter;
|
6098
6282
|
stde_seek(tde, field_num, term);
|
@@ -6114,9 +6298,7 @@ void frt_iw_delete_term(FrtIndexWriter *iw, FrtSymbol field, const char *term)
|
|
6114
6298
|
}
|
6115
6299
|
}
|
6116
6300
|
|
6117
|
-
void frt_iw_delete_terms(FrtIndexWriter *iw,
|
6118
|
-
char **terms, const int term_cnt)
|
6119
|
-
{
|
6301
|
+
void frt_iw_delete_terms(FrtIndexWriter *iw, ID field, char **terms, const int term_cnt) {
|
6120
6302
|
int field_num = frt_fis_get_field_num(iw->fis, field);
|
6121
6303
|
if (field_num >= 0) {
|
6122
6304
|
int i;
|
@@ -6127,7 +6309,7 @@ void frt_iw_delete_terms(FrtIndexWriter *iw, FrtSymbol field,
|
|
6127
6309
|
const int seg_cnt = sis->size;
|
6128
6310
|
bool did_delete = false;
|
6129
6311
|
for (i = 0; i < seg_cnt; i++) {
|
6130
|
-
FrtIndexReader *ir = sr_open(sis, iw->fis, i, false);
|
6312
|
+
FrtIndexReader *ir = sr_open(sis, iw->fis, i, false, NULL);
|
6131
6313
|
FrtTermDocEnum *tde = ir->term_docs(ir);
|
6132
6314
|
int j;
|
6133
6315
|
for (j = 0 ; j < term_cnt; j++) {
|
@@ -6196,10 +6378,13 @@ void frt_iw_close(FrtIndexWriter *iw)
|
|
6196
6378
|
free(iw);
|
6197
6379
|
}
|
6198
6380
|
|
6199
|
-
FrtIndexWriter *
|
6200
|
-
|
6201
|
-
|
6202
|
-
|
6381
|
+
FrtIndexWriter *frt_iw_alloc(void) {
|
6382
|
+
return FRT_ALLOC_AND_ZERO(FrtIndexWriter);
|
6383
|
+
}
|
6384
|
+
|
6385
|
+
FrtIndexWriter *frt_iw_open(FrtIndexWriter *iw, FrtStore *store, FrtAnalyzer *volatile analyzer, const FrtConfig *config) {
|
6386
|
+
if (iw == NULL)
|
6387
|
+
iw = frt_iw_alloc();
|
6203
6388
|
frt_mutex_init(&iw->mutex, NULL);
|
6204
6389
|
iw->store = store;
|
6205
6390
|
if (!config) {
|
@@ -6230,7 +6415,7 @@ FrtIndexWriter *frt_iw_open(FrtStore *store, FrtAnalyzer *volatile analyzer,
|
|
6230
6415
|
|
6231
6416
|
iw->similarity = frt_sim_create_default();
|
6232
6417
|
iw->analyzer = analyzer ? (FrtAnalyzer *)analyzer
|
6233
|
-
:
|
6418
|
+
: frt_standard_analyzer_new(true);
|
6234
6419
|
|
6235
6420
|
iw->deleter = frt_deleter_new(iw->sis, store);
|
6236
6421
|
deleter_delete_deletable_files(iw->deleter);
|
@@ -6242,9 +6427,7 @@ FrtIndexWriter *frt_iw_open(FrtStore *store, FrtAnalyzer *volatile analyzer,
|
|
6242
6427
|
/*******************/
|
6243
6428
|
/*** Add Indexes ***/
|
6244
6429
|
/*******************/
|
6245
|
-
static void iw_cp_fields(FrtIndexWriter *iw,
|
6246
|
-
const char *segment, int *map)
|
6247
|
-
{
|
6430
|
+
static void iw_cp_fields(FrtIndexWriter *iw, FrtSegmentReader *sr, const char *segment, int *map) {
|
6248
6431
|
char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
|
6249
6432
|
FrtOutStream *fdt_out, *fdx_out;
|
6250
6433
|
FrtInStream *fdt_in, *fdx_in;
|
@@ -6271,7 +6454,6 @@ static void iw_cp_fields(FrtIndexWriter *iw, SegmentReader *sr,
|
|
6271
6454
|
frt_is2os_copy_bytes(del_in, del_out, frt_is_length(del_in));
|
6272
6455
|
}
|
6273
6456
|
|
6274
|
-
|
6275
6457
|
if (map) {
|
6276
6458
|
int i;
|
6277
6459
|
const int max_doc = sr_max_doc(IR(sr));
|
@@ -6292,10 +6474,14 @@ static void iw_cp_fields(FrtIndexWriter *iw, SegmentReader *sr,
|
|
6292
6474
|
frt_os_write_vint(fdt_out, df_size);
|
6293
6475
|
/* sum total lengths of FrtDocField */
|
6294
6476
|
for (k = 0; k < df_size; k++) {
|
6295
|
-
|
6296
|
-
const int
|
6477
|
+
const int flen = frt_is_read_vint(fdt_in); /* length */
|
6478
|
+
const int fenc = frt_is_read_vint(fdt_in); /* encoding */
|
6479
|
+
const int fcmp = frt_is_read_vint(fdt_in); /* compression */
|
6297
6480
|
frt_os_write_vint(fdt_out, flen);
|
6298
|
-
|
6481
|
+
frt_os_write_vint(fdt_out, fenc);
|
6482
|
+
frt_os_write_vint(fdt_out, fcmp);
|
6483
|
+
/* Each field has one ' ' byte so add 1 */
|
6484
|
+
data_len += flen + 1;
|
6299
6485
|
}
|
6300
6486
|
}
|
6301
6487
|
frt_is2os_copy_bytes(fdt_in, fdt_out, data_len);
|
@@ -6318,8 +6504,7 @@ static void iw_cp_fields(FrtIndexWriter *iw, SegmentReader *sr,
|
|
6318
6504
|
frt_os_write_vint(fdt_out, tv_size);
|
6319
6505
|
}
|
6320
6506
|
}
|
6321
|
-
}
|
6322
|
-
else {
|
6507
|
+
} else {
|
6323
6508
|
frt_is2os_copy_bytes(fdt_in, fdt_out, frt_is_length(fdt_in));
|
6324
6509
|
frt_is2os_copy_bytes(fdx_in, fdx_out, frt_is_length(fdx_in));
|
6325
6510
|
}
|
@@ -6329,7 +6514,7 @@ static void iw_cp_fields(FrtIndexWriter *iw, SegmentReader *sr,
|
|
6329
6514
|
frt_os_close(fdx_out);
|
6330
6515
|
}
|
6331
6516
|
|
6332
|
-
static void iw_cp_terms(FrtIndexWriter *iw, SegmentReader *sr,
|
6517
|
+
static void iw_cp_terms(FrtIndexWriter *iw, FrtSegmentReader *sr,
|
6333
6518
|
const char *segment, int *map)
|
6334
6519
|
{
|
6335
6520
|
char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
|
@@ -6398,7 +6583,7 @@ static void iw_cp_terms(FrtIndexWriter *iw, SegmentReader *sr,
|
|
6398
6583
|
frt_os_close(prx_out);
|
6399
6584
|
}
|
6400
6585
|
|
6401
|
-
static void iw_cp_norms(FrtIndexWriter *iw, SegmentReader *sr,
|
6586
|
+
static void iw_cp_norms(FrtIndexWriter *iw, FrtSegmentReader *sr,
|
6402
6587
|
FrtSegmentInfo *si, int *map)
|
6403
6588
|
{
|
6404
6589
|
int i;
|
@@ -6429,9 +6614,7 @@ static void iw_cp_norms(FrtIndexWriter *iw, SegmentReader *sr,
|
|
6429
6614
|
}
|
6430
6615
|
}
|
6431
6616
|
|
6432
|
-
static void iw_cp_map_files(FrtIndexWriter *iw, SegmentReader *sr,
|
6433
|
-
FrtSegmentInfo *si)
|
6434
|
-
{
|
6617
|
+
static void iw_cp_map_files(FrtIndexWriter *iw, FrtSegmentReader *sr, FrtSegmentInfo *si) {
|
6435
6618
|
int i;
|
6436
6619
|
FrtFieldInfos *from_fis = IR(sr)->fis;
|
6437
6620
|
FrtFieldInfos *to_fis = iw->fis;
|
@@ -6449,15 +6632,13 @@ static void iw_cp_map_files(FrtIndexWriter *iw, SegmentReader *sr,
|
|
6449
6632
|
free(field_map);
|
6450
6633
|
}
|
6451
6634
|
|
6452
|
-
static void iw_cp_files(FrtIndexWriter *iw, SegmentReader *sr,
|
6453
|
-
FrtSegmentInfo *si)
|
6454
|
-
{
|
6635
|
+
static void iw_cp_files(FrtIndexWriter *iw, FrtSegmentReader *sr, FrtSegmentInfo *si) {
|
6455
6636
|
iw_cp_fields(iw, sr, si->name, NULL);
|
6456
6637
|
iw_cp_terms( iw, sr, si->name, NULL);
|
6457
6638
|
iw_cp_norms( iw, sr, si, NULL);
|
6458
6639
|
}
|
6459
6640
|
|
6460
|
-
static void iw_add_segment(FrtIndexWriter *iw, SegmentReader *sr)
|
6641
|
+
static void iw_add_segment(FrtIndexWriter *iw, FrtSegmentReader *sr)
|
6461
6642
|
{
|
6462
6643
|
FrtSegmentInfo *si = frt_sis_new_segment(iw->sis, 0, iw->store);
|
6463
6644
|
FrtFieldInfos *fis = iw->fis;
|
@@ -6472,7 +6653,7 @@ static void iw_add_segment(FrtIndexWriter *iw, SegmentReader *sr)
|
|
6472
6653
|
FrtFieldInfo *fi = sub_fis->fields[j];
|
6473
6654
|
FrtFieldInfo *new_fi = frt_fis_get_field(fis, fi->name);
|
6474
6655
|
if (NULL == new_fi) {
|
6475
|
-
new_fi = frt_fi_new(fi->name, FRT_STORE_NO, FRT_INDEX_NO, FRT_TERM_VECTOR_NO);
|
6656
|
+
new_fi = frt_fi_new(fi->name, FRT_STORE_NO, FRT_COMPRESSION_NONE, FRT_INDEX_NO, FRT_TERM_VECTOR_NO);
|
6476
6657
|
new_fi->bits = fi->bits;
|
6477
6658
|
frt_fis_add_field(fis, new_fi);
|
6478
6659
|
}
|