ferret 0.9.1 → 0.9.2
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- data/README +6 -5
- data/Rakefile +34 -13
- data/TODO +1 -0
- data/TUTORIAL +1 -1
- data/ext/analysis.c +87 -70
- data/ext/analysis.h +18 -6
- data/ext/array.c +1 -2
- data/ext/array.h +1 -1
- data/ext/bitvector.c +10 -6
- data/ext/bitvector.h +2 -2
- data/ext/compound_io.c +30 -27
- data/ext/document.c +15 -15
- data/ext/document.h +5 -5
- data/ext/except.c +2 -0
- data/ext/except.h +25 -23
- data/ext/extconf.rb +1 -0
- data/ext/ferret.c +10 -8
- data/ext/ferret.h +9 -8
- data/ext/field.c +29 -25
- data/ext/filter.c +52 -14
- data/ext/frtio.h +13 -0
- data/ext/fs_store.c +115 -170
- data/ext/global.c +9 -8
- data/ext/global.h +17 -13
- data/ext/hash.c +13 -19
- data/ext/hash.h +11 -11
- data/ext/hashset.c +5 -7
- data/ext/hashset.h +9 -8
- data/ext/helper.c +1 -1
- data/ext/helper.h +2 -1
- data/ext/inc/except.h +25 -23
- data/ext/inc/lang.h +11 -1
- data/ext/ind.c +33 -21
- data/ext/index.h +44 -39
- data/ext/index_io.c +61 -57
- data/ext/index_rw.c +418 -361
- data/ext/lang.c +10 -0
- data/ext/lang.h +11 -1
- data/ext/nix_io.c +135 -0
- data/ext/priorityqueue.c +16 -16
- data/ext/priorityqueue.h +9 -6
- data/ext/q_boolean.c +128 -76
- data/ext/q_const_score.c +20 -20
- data/ext/q_filtered_query.c +20 -20
- data/ext/q_fuzzy.c +37 -23
- data/ext/q_match_all.c +15 -19
- data/ext/q_multi_phrase.c +87 -46
- data/ext/q_parser.c +247 -119
- data/ext/q_phrase.c +86 -52
- data/ext/q_prefix.c +25 -14
- data/ext/q_range.c +59 -14
- data/ext/q_span.c +263 -172
- data/ext/q_term.c +62 -51
- data/ext/q_wildcard.c +24 -13
- data/ext/r_analysis.c +328 -80
- data/ext/r_doc.c +11 -6
- data/ext/r_index_io.c +40 -32
- data/ext/r_qparser.c +15 -14
- data/ext/r_search.c +270 -152
- data/ext/r_store.c +32 -17
- data/ext/ram_store.c +38 -22
- data/ext/search.c +617 -87
- data/ext/search.h +227 -163
- data/ext/similarity.c +54 -45
- data/ext/similarity.h +3 -3
- data/ext/sort.c +132 -53
- data/ext/store.c +21 -2
- data/ext/store.h +14 -14
- data/ext/tags +4322 -232
- data/ext/term.c +140 -109
- data/ext/termdocs.c +74 -60
- data/ext/vector.c +181 -152
- data/ext/w32_io.c +150 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
- data/lib/ferret/document/field.rb +1 -1
- data/lib/ferret/index/field_infos.rb +1 -1
- data/lib/ferret/index/term.rb +1 -1
- data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
- data/lib/ferret/search.rb +1 -0
- data/lib/ferret/search/boolean_query.rb +0 -4
- data/lib/ferret/search/index_searcher.rb +21 -8
- data/lib/ferret/search/multi_phrase_query.rb +7 -0
- data/lib/ferret/search/multi_searcher.rb +261 -0
- data/lib/ferret/search/phrase_query.rb +1 -1
- data/lib/ferret/search/query.rb +34 -5
- data/lib/ferret/search/sort.rb +7 -3
- data/lib/ferret/search/sort_field.rb +8 -4
- data/lib/ferret/store/fs_store.rb +13 -6
- data/lib/ferret/store/index_io.rb +0 -14
- data/lib/ferret/store/ram_store.rb +3 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/analysis/ctc_analyzer.rb +131 -0
- data/test/unit/analysis/ctc_tokenstream.rb +98 -9
- data/test/unit/index/tc_index.rb +40 -1
- data/test/unit/index/tc_term.rb +7 -0
- data/test/unit/index/th_doc.rb +8 -0
- data/test/unit/query_parser/tc_query_parser.rb +6 -4
- data/test/unit/search/rtc_sort_field.rb +6 -6
- data/test/unit/search/tc_index_searcher.rb +8 -0
- data/test/unit/search/tc_multi_searcher.rb +275 -0
- data/test/unit/search/tc_multi_searcher2.rb +126 -0
- data/test/unit/search/tc_search_and_sort.rb +66 -0
- metadata +31 -26
- data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/term.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
#include
|
1
|
+
#include "index.h"
|
2
|
+
#include "helper.h"
|
3
|
+
#include "hash.h"
|
2
4
|
#include <string.h>
|
3
|
-
#include <helper.h>
|
4
|
-
#include <hash.h>
|
5
5
|
|
6
6
|
static char * const FORMAT_VERSION_ERROR_MSG = "Unknown format version";
|
7
7
|
static char * const TERM_ORDER_ERROR_MSG = "term out of order";
|
@@ -32,11 +32,10 @@ Term *term_create(const char *field, char *text)
|
|
32
32
|
return t;
|
33
33
|
}
|
34
34
|
|
35
|
-
void term_destroy(
|
35
|
+
void term_destroy(Term *self)
|
36
36
|
{
|
37
|
-
|
38
|
-
free(
|
39
|
-
free(t);
|
37
|
+
free(self->text);
|
38
|
+
free(self);
|
40
39
|
}
|
41
40
|
|
42
41
|
int term_cmp(void *t1, void *t2)
|
@@ -87,9 +86,9 @@ TermBuffer *tb_create()
|
|
87
86
|
return tb;
|
88
87
|
}
|
89
88
|
|
90
|
-
void tb_destroy(
|
89
|
+
void tb_destroy(TermBuffer *tb)
|
91
90
|
{
|
92
|
-
free(
|
91
|
+
free(tb);
|
93
92
|
}
|
94
93
|
|
95
94
|
TermBuffer *tb_set_term(TermBuffer *tb, Term *t)
|
@@ -106,8 +105,9 @@ Term *tb_get_term(TermBuffer *tb)
|
|
106
105
|
|
107
106
|
int tb_cmp(TermBuffer *tb1, TermBuffer *tb2)
|
108
107
|
{
|
109
|
-
int res
|
110
|
-
if (
|
108
|
+
int res;
|
109
|
+
if ((tb1->field != tb2->field) &&
|
110
|
+
(0 != (res = strcmp(tb1->field, tb2->field)))) {
|
111
111
|
return res;
|
112
112
|
} else {
|
113
113
|
return strcmp(tb1->text, tb2->text);
|
@@ -133,12 +133,13 @@ TermBuffer *tb_cpy(TermBuffer *tb1, TermBuffer *tb2)
|
|
133
133
|
|
134
134
|
TermBuffer *tb_read(TermBuffer *tb, InStream *is, FieldInfos *fis)
|
135
135
|
{
|
136
|
-
int
|
137
|
-
int
|
136
|
+
signed int fnum;
|
137
|
+
int start = (int)is_read_vint(is);
|
138
|
+
int length = (int)is_read_vint(is);
|
138
139
|
int total_length = start + length;
|
139
140
|
is_read_bytes(is, (uchar *)tb->text, start, length);
|
140
141
|
tb->text[total_length] = '\0';
|
141
|
-
|
142
|
+
fnum = (signed int)is_read_vint(is);
|
142
143
|
if (fnum < 0)
|
143
144
|
tb->field = (char *)EMPTY_STRING;
|
144
145
|
else
|
@@ -171,9 +172,9 @@ TermInfo *ti_set(TermInfo *ti, int doc_freq, int freq_pointer, int prox_pointer,
|
|
171
172
|
return ti;
|
172
173
|
}
|
173
174
|
|
174
|
-
void ti_destroy(
|
175
|
+
void ti_destroy(TermInfo *ti)
|
175
176
|
{
|
176
|
-
free(
|
177
|
+
free(ti);
|
177
178
|
}
|
178
179
|
|
179
180
|
TermInfo *ti_cpy(TermInfo *ti, TermInfo *other)
|
@@ -208,13 +209,12 @@ TermEnum *te_create()
|
|
208
209
|
return te;
|
209
210
|
}
|
210
211
|
|
211
|
-
void te_destroy(
|
212
|
+
void te_destroy(TermEnum *te)
|
212
213
|
{
|
213
|
-
TermEnum *te = (TermEnum *)p;
|
214
214
|
tb_destroy(te->tb_curr);
|
215
215
|
tb_destroy(te->tb_prev);
|
216
216
|
ti_destroy(te->ti_curr);
|
217
|
-
free(
|
217
|
+
free(te);
|
218
218
|
}
|
219
219
|
|
220
220
|
Term *te_get_term(TermEnum *te)
|
@@ -246,11 +246,12 @@ TermBuffer *te_skip_to(TermEnum *te, Term *t)
|
|
246
246
|
*
|
247
247
|
****************************************************************************/
|
248
248
|
|
249
|
-
#define GET_STE SegmentTermEnum *ste = (SegmentTermEnum *)te->data
|
249
|
+
#define GET_STE SegmentTermEnum *ste = (SegmentTermEnum *)te->data
|
250
250
|
|
251
251
|
TermBuffer *ste_next(TermEnum *te)
|
252
252
|
{
|
253
253
|
GET_STE;
|
254
|
+
TermInfo *ti;
|
254
255
|
InStream *is = ste->is;
|
255
256
|
ste->pos++;
|
256
257
|
if (ste->pos > ste->size - 1) {
|
@@ -261,38 +262,43 @@ TermBuffer *ste_next(TermEnum *te)
|
|
261
262
|
tb_cpy(te->tb_prev, te->tb_curr);
|
262
263
|
tb_read(te->tb_curr, is, ste->fis);
|
263
264
|
|
264
|
-
|
265
|
-
ti->doc_freq = is_read_vint(is);
|
266
|
-
ti->freq_pointer += is_read_vint(is)
|
267
|
-
ti->prox_pointer += is_read_vint(is)
|
265
|
+
ti = te->ti_curr;
|
266
|
+
ti->doc_freq = (int)is_read_vint(is); /* read doc freq */
|
267
|
+
ti->freq_pointer += (int)is_read_vint(is);/* read freq pointer */
|
268
|
+
ti->prox_pointer += (int)is_read_vint(is);/* read prox pointer */
|
268
269
|
|
269
270
|
if (ste->format == -1) {
|
270
|
-
|
271
|
-
|
271
|
+
/* just read skip_offset in order to increment file pointer
|
272
|
+
* value is never used since skip_to is switched off */
|
272
273
|
if (!ste->is_index) {
|
273
|
-
if (ti->doc_freq > ste->format_m1skip_interval)
|
274
|
-
ti->skip_offset = is_read_vint(is);
|
274
|
+
if (ti->doc_freq > ste->format_m1skip_interval) {
|
275
|
+
ti->skip_offset = (int)is_read_vint(is);
|
276
|
+
}
|
275
277
|
}
|
276
278
|
} else {
|
277
|
-
if (ti->doc_freq >= ste->skip_interval)
|
278
|
-
ti->skip_offset = is_read_vint(is);
|
279
|
+
if (ti->doc_freq >= ste->skip_interval) {
|
280
|
+
ti->skip_offset = (int)is_read_vint(is);
|
281
|
+
}
|
279
282
|
}
|
280
283
|
|
281
|
-
if (ste->is_index)
|
282
|
-
ste->index_pointer += is_read_vint(is);
|
284
|
+
if (ste->is_index) {
|
285
|
+
ste->index_pointer += (int)is_read_vint(is); /* read index pointer */
|
286
|
+
}
|
283
287
|
|
284
288
|
return te->tb_curr;
|
285
289
|
}
|
286
290
|
|
287
291
|
TermEnum *ste_clone(TermEnum *other_te);
|
292
|
+
|
288
293
|
TermEnum *ste_allocate()
|
289
294
|
{
|
290
295
|
TermEnum *te = te_create();
|
296
|
+
SegmentTermEnum *ste;
|
297
|
+
|
291
298
|
te->next = &ste_next;
|
292
299
|
te->close = &ste_close;
|
293
300
|
te->clone = &ste_clone;
|
294
|
-
|
295
|
-
ALLOC(SegmentTermEnum);
|
301
|
+
ste = ALLOC(SegmentTermEnum);
|
296
302
|
te->data = ste;
|
297
303
|
return te;
|
298
304
|
}
|
@@ -302,6 +308,7 @@ TermEnum *ste_clone(TermEnum *other_te)
|
|
302
308
|
SegmentTermEnum *other_ste = (SegmentTermEnum *)other_te->data;
|
303
309
|
TermEnum *te = ste_allocate();
|
304
310
|
SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
|
311
|
+
|
305
312
|
memcpy(ste, other_ste, sizeof(SegmentTermEnum));
|
306
313
|
ste->is = is_clone(other_ste->is);
|
307
314
|
tb_cpy(te->tb_curr, other_te->tb_curr);
|
@@ -323,6 +330,8 @@ TermEnum *ste_create(InStream *is, FieldInfos *fis, int is_index)
|
|
323
330
|
{
|
324
331
|
TermEnum *te = ste_allocate();
|
325
332
|
GET_STE;
|
333
|
+
int first_int;
|
334
|
+
|
326
335
|
ste->fis = fis;
|
327
336
|
ste->is_index = is_index;
|
328
337
|
ste->is = is;
|
@@ -330,36 +339,36 @@ TermEnum *ste_create(InStream *is, FieldInfos *fis, int is_index)
|
|
330
339
|
ste->index_pointer = 0;
|
331
340
|
ste->format_m1skip_interval = -1;
|
332
341
|
|
333
|
-
|
342
|
+
first_int = (int)is_read_int(is);
|
334
343
|
|
335
344
|
if (first_int >= 0) {
|
336
|
-
|
345
|
+
/* original-format file, without explicit format version number */
|
337
346
|
ste->format = 0;
|
338
347
|
ste->size = first_int;
|
339
348
|
|
340
|
-
|
349
|
+
/* back-compatible settings */
|
341
350
|
ste->index_interval = 128;
|
342
|
-
ste->skip_interval = INT_MAX;
|
351
|
+
ste->skip_interval = INT_MAX; /* switch off skip_to optimization */
|
343
352
|
|
344
353
|
} else {
|
345
|
-
|
354
|
+
/* check that it is a format we can understand */
|
346
355
|
if (first_int < TERM_INFO_FORMAT)
|
347
|
-
RAISE(
|
356
|
+
RAISE(EXCEPTION, FORMAT_VERSION_ERROR_MSG);
|
348
357
|
|
349
|
-
|
358
|
+
/* we have a format version number */
|
350
359
|
ste->format = first_int;
|
351
360
|
|
352
361
|
|
353
|
-
ste->size = is_read_long(is);
|
362
|
+
ste->size = (int)is_read_long(is); /* read the size */
|
354
363
|
|
355
364
|
if (ste->format == -1) {
|
356
365
|
if (!ste->is_index) {
|
357
366
|
ste->index_interval = is_read_int(is);
|
358
367
|
ste->format_m1skip_interval = is_read_int(is);
|
359
368
|
}
|
360
|
-
|
361
|
-
|
362
|
-
|
369
|
+
/* switch off skip_to optimization for file format prior to
|
370
|
+
* 1.4rc2 in order to avoid a bug in skip_to implementation
|
371
|
+
* of these versions */
|
363
372
|
ste->skip_interval = INT_MAX;
|
364
373
|
} else {
|
365
374
|
ste->index_interval = is_read_int(is);
|
@@ -407,7 +416,7 @@ Term *ste_scan_for_term(TermEnum *te, int pos)
|
|
407
416
|
*
|
408
417
|
****************************************************************************/
|
409
418
|
|
410
|
-
#define GET_MTE MultiTermEnum *mte = (MultiTermEnum *)te->data
|
419
|
+
#define GET_MTE MultiTermEnum *mte = (MultiTermEnum *)te->data
|
411
420
|
|
412
421
|
TermBuffer *mte_next(TermEnum *te)
|
413
422
|
{
|
@@ -426,11 +435,11 @@ TermBuffer *mte_next(TermEnum *te)
|
|
426
435
|
|
427
436
|
while ((top != NULL) && (tb_cmp(te->tb_curr, top->tb) == 0)) {
|
428
437
|
pq_pop(mte->smi_queue);
|
429
|
-
te->ti_curr->doc_freq += top->te->ti_curr->doc_freq
|
438
|
+
te->ti_curr->doc_freq += top->te->ti_curr->doc_freq;/* increment freq */
|
430
439
|
if (smi_next(top)) {
|
431
|
-
pq_push(mte->smi_queue, top);
|
440
|
+
pq_push(mte->smi_queue, top); /* restore queue */
|
432
441
|
} else {
|
433
|
-
smi_destroy(top);
|
442
|
+
smi_destroy(top); /* done with a segment */
|
434
443
|
}
|
435
444
|
top = (SegmentMergeInfo *)pq_top(mte->smi_queue);
|
436
445
|
}
|
@@ -455,21 +464,21 @@ TermEnum *mte_clone(TermEnum *te)
|
|
455
464
|
TermEnum *mte_create(IndexReader **readers, int *starts, int rcnt, Term *t)
|
456
465
|
{
|
457
466
|
int i;
|
467
|
+
IndexReader *reader;
|
468
|
+
TermEnum *sub_te;
|
469
|
+
MultiTermEnum *mte = ALLOC(MultiTermEnum);
|
458
470
|
TermEnum *te = te_create();
|
459
471
|
te->next = &mte_next;
|
460
472
|
te->clone = &mte_clone;
|
461
473
|
te->close = &mte_close;
|
462
474
|
|
463
|
-
MultiTermEnum *mte = ALLOC(MultiTermEnum);
|
464
475
|
te->data = mte;
|
465
476
|
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
mte->smi_queue = pq_create(rcnt, &smi_lt);
|
470
|
-
mte->smi_queue->free_elem = &smi_destroy;
|
477
|
+
mte->smi_queue = pq_create(rcnt, (lt_ft)&smi_lt);
|
478
|
+
mte->smi_queue->free_elem = (free_ft)&smi_destroy;
|
471
479
|
|
472
480
|
for (i = 0; i < rcnt; i++) {
|
481
|
+
SegmentMergeInfo *smi;
|
473
482
|
reader = readers[i];
|
474
483
|
|
475
484
|
if (t != NULL) {
|
@@ -478,10 +487,10 @@ TermEnum *mte_create(IndexReader **readers, int *starts, int rcnt, Term *t)
|
|
478
487
|
sub_te = reader->terms(reader);
|
479
488
|
}
|
480
489
|
|
481
|
-
|
490
|
+
smi = smi_create(starts[i], sub_te, reader);
|
482
491
|
if (((t == NULL) && smi_next(smi)) ||
|
483
492
|
(sub_te->tb_curr->field != (char *)EMPTY_STRING)) {
|
484
|
-
pq_push(mte->smi_queue, smi);
|
493
|
+
pq_push(mte->smi_queue, smi); /* initialize queue */
|
485
494
|
} else {
|
486
495
|
smi_destroy(smi);
|
487
496
|
}
|
@@ -508,7 +517,10 @@ TermInfosWriter *tiw_open_internal(Store *store,
|
|
508
517
|
int interval,
|
509
518
|
int is_index)
|
510
519
|
{
|
520
|
+
char fname[SEGMENT_NAME_MAX_LENGTH];
|
511
521
|
TermInfosWriter *tiw = ALLOC(TermInfosWriter);
|
522
|
+
OutStream *os;
|
523
|
+
|
512
524
|
tiw->index_interval = interval;
|
513
525
|
tiw->skip_interval = 16;
|
514
526
|
tiw->last_index_pointer = 0;
|
@@ -520,14 +532,13 @@ TermInfosWriter *tiw_open_internal(Store *store,
|
|
520
532
|
tiw->curr_field = NULL;
|
521
533
|
tiw->curr_field_num = -1;
|
522
534
|
|
523
|
-
char fname[SEGMENT_NAME_MAX_LENGTH];
|
524
535
|
strcpy(fname, segment);
|
525
536
|
strcat(fname, (is_index ? ".tii" : ".tis"));
|
526
|
-
|
527
|
-
os_write_int(os, TERM_INFO_FORMAT);
|
528
|
-
os_write_long(os, 0);
|
529
|
-
os_write_int(os, tiw->index_interval);
|
530
|
-
os_write_int(os, tiw->skip_interval);
|
537
|
+
os = tiw->os = store->create_output(store, fname);
|
538
|
+
os_write_int(os, TERM_INFO_FORMAT); /* write format */
|
539
|
+
os_write_long(os, 0); /* leave space for size */
|
540
|
+
os_write_int(os, tiw->index_interval); /* write index_interval */
|
541
|
+
os_write_int(os, tiw->skip_interval); /* write skip_interval */
|
531
542
|
if (!is_index) {
|
532
543
|
tiw->other = tiw_open_internal(store, segment, fis, interval, true);
|
533
544
|
tiw->other->other = tiw;
|
@@ -544,11 +555,11 @@ void tiw_write_term(TermInfosWriter *tiw, OutStream *os, Term *t)
|
|
544
555
|
{
|
545
556
|
//printf("%s, %s\n", tiw->last_term->text, t->text);
|
546
557
|
int start = hlp_string_diff(tiw->last_term->text, t->text);
|
547
|
-
int length = strlen(t->text) - start;
|
558
|
+
int length = (int)strlen(t->text) - start;
|
548
559
|
|
549
|
-
os_write_vint(os, start);
|
550
|
-
os_write_vint(os, length);
|
551
|
-
os_write_chars(os, t->text, start, length);
|
560
|
+
os_write_vint(os, start); /* write shared prefix length */
|
561
|
+
os_write_vint(os, length); /* write delta length */
|
562
|
+
os_write_chars(os, t->text, start, length); /* write delta chars */
|
552
563
|
if (tiw->curr_field != t->field) {
|
553
564
|
tiw->curr_field = t->field;
|
554
565
|
tiw->curr_field_num = fis_get_number(tiw->fis, t->field);
|
@@ -559,6 +570,7 @@ void tiw_write_term(TermInfosWriter *tiw, OutStream *os, Term *t)
|
|
559
570
|
|
560
571
|
void tiw_add(TermInfosWriter *tiw, Term *t, TermInfo *ti)
|
561
572
|
{
|
573
|
+
#ifdef DEBUG
|
562
574
|
if (tiw->is_index && term_cmp(tiw->last_term, t) > 0) {
|
563
575
|
RAISE(STATE_ERROR, TERM_ORDER_ERROR_MSG);
|
564
576
|
}
|
@@ -568,22 +580,27 @@ void tiw_add(TermInfosWriter *tiw, Term *t, TermInfo *ti)
|
|
568
580
|
if (ti->prox_pointer < tiw->last_term_info->prox_pointer) {
|
569
581
|
RAISE(STATE_ERROR, PP_ORDER_ERROR_MSG);
|
570
582
|
}
|
583
|
+
#endif
|
571
584
|
|
572
|
-
if (!tiw->is_index && (tiw->size % tiw->index_interval) == 0)
|
573
|
-
|
585
|
+
if (!tiw->is_index && (tiw->size % tiw->index_interval) == 0) {
|
586
|
+
/* add an index term */
|
587
|
+
tiw_add(tiw->other, tiw->last_term, tiw->last_term_info);
|
588
|
+
}
|
574
589
|
|
575
|
-
tiw_write_term(tiw, tiw->os, t);
|
576
|
-
os_write_vint(tiw->os, ti->doc_freq);
|
590
|
+
tiw_write_term(tiw, tiw->os, t); /* write term */
|
591
|
+
os_write_vint(tiw->os, ti->doc_freq); /* write doc freq */
|
577
592
|
os_write_vint(tiw->os, ti->freq_pointer - tiw->last_term_info->freq_pointer);
|
578
593
|
os_write_vint(tiw->os, ti->prox_pointer - tiw->last_term_info->prox_pointer);
|
579
|
-
|
594
|
+
|
595
|
+
if (ti->doc_freq >= tiw->skip_interval) {
|
580
596
|
os_write_vint(tiw->os, ti->skip_offset);
|
597
|
+
}
|
581
598
|
|
582
599
|
if (tiw->is_index) {
|
583
600
|
OutStream *other_os = tiw->other->os;
|
584
601
|
int other_pos = os_pos(other_os);
|
585
602
|
os_write_vint(tiw->os, other_pos - tiw->last_index_pointer);
|
586
|
-
tiw->last_index_pointer = other_pos;
|
603
|
+
tiw->last_index_pointer = other_pos; /* write pointer */
|
587
604
|
}
|
588
605
|
|
589
606
|
ti_cpy(tiw->last_term_info, ti);
|
@@ -593,7 +610,7 @@ void tiw_add(TermInfosWriter *tiw, Term *t, TermInfo *ti)
|
|
593
610
|
void tiw_close(TermInfosWriter *tiw)
|
594
611
|
{
|
595
612
|
OutStream *os = tiw->os;
|
596
|
-
os_seek(os, 4);
|
613
|
+
os_seek(os, 4); /* write @size after format */
|
597
614
|
os_write_long(os, tiw->size);
|
598
615
|
os_close(os);
|
599
616
|
|
@@ -632,17 +649,20 @@ void tir_close(TermInfosReader *tir)
|
|
632
649
|
|
633
650
|
TermInfosReader *tir_open(Store *store, char *segment, FieldInfos *fis)
|
634
651
|
{
|
652
|
+
SegmentTermEnum *ste;
|
635
653
|
TermInfosReader *tir = ALLOC(TermInfosReader);
|
636
654
|
char fname[SEGMENT_NAME_MAX_LENGTH];
|
655
|
+
InStream *is;
|
656
|
+
|
637
657
|
mutex_init(&tir->mutex, NULL);
|
638
658
|
strcpy(fname, segment);
|
639
659
|
strcpy(fname + strlen(segment), ".tis");
|
640
|
-
|
660
|
+
is = store->open_input(store, fname);
|
641
661
|
tir->orig_te = ste_create(is, fis, false);
|
642
662
|
thread_key_create(&tir->thread_te, NULL);
|
643
|
-
tir->te_bucket = ary_create(1, (
|
663
|
+
tir->te_bucket = ary_create(1, (free_ft)tir->orig_te->close);
|
644
664
|
|
645
|
-
|
665
|
+
ste = tir->orig_te->data;
|
646
666
|
tir->size = ste->size;
|
647
667
|
tir->skip_interval = ste->skip_interval;
|
648
668
|
|
@@ -659,6 +679,9 @@ void tir_ensure_index_is_read(TermInfosReader *tir)
|
|
659
679
|
{
|
660
680
|
mutex_lock(&tir->mutex);
|
661
681
|
if (tir->index_terms == NULL) {
|
682
|
+
TermEnum *index_te;
|
683
|
+
SegmentTermEnum *ste;
|
684
|
+
int i = 0;
|
662
685
|
int index_size = ((SegmentTermEnum *)tir->index_te->data)->size;
|
663
686
|
tir->index_size = index_size;
|
664
687
|
|
@@ -666,9 +689,8 @@ void tir_ensure_index_is_read(TermInfosReader *tir)
|
|
666
689
|
tir->index_term_infos = ALLOC_N(TermInfo *, index_size);
|
667
690
|
tir->index_pointers = ALLOC_N(int, index_size);
|
668
691
|
|
669
|
-
|
670
|
-
|
671
|
-
SegmentTermEnum *ste = index_te->data;
|
692
|
+
index_te = tir->index_te;
|
693
|
+
ste = index_te->data;
|
672
694
|
|
673
695
|
TRY
|
674
696
|
while (ste_next(index_te) != NULL) {
|
@@ -708,7 +730,7 @@ void tir_seek_enum(TermInfosReader *tir, int ind_offset)
|
|
708
730
|
|
709
731
|
int tir_get_index_offset(TermInfosReader *tir, Term *t)
|
710
732
|
{
|
711
|
-
int lo = 0;
|
733
|
+
int lo = 0; /* binary search tir->index_terms[] */
|
712
734
|
int hi = tir->index_size - 1;
|
713
735
|
int mid, delta;
|
714
736
|
Term **index_terms = tir->index_terms;
|
@@ -729,61 +751,70 @@ int tir_get_index_offset(TermInfosReader *tir, Term *t)
|
|
729
751
|
|
730
752
|
TermInfo *tir_get_ti(TermInfosReader *tir, Term *t)
|
731
753
|
{
|
732
|
-
|
754
|
+
TermEnum *te;
|
755
|
+
SegmentTermEnum *ste;
|
756
|
+
if (tir->size == 0) {
|
733
757
|
return NULL;
|
758
|
+
}
|
734
759
|
|
735
760
|
tir_ensure_index_is_read(tir);
|
736
761
|
|
737
|
-
|
738
|
-
|
739
|
-
|
762
|
+
/* optimize sequential access: first try scanning cached enum w/o seeking */
|
763
|
+
te = tir_enum(tir);
|
764
|
+
ste = (SegmentTermEnum *)te->data;
|
740
765
|
if (ste->pos < ste->size && tb_term_cmp(te->tb_curr, t) <= 0) {
|
741
766
|
SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
|
742
767
|
int enum_offset = (int)(ste->pos / ste->index_interval) + 1;
|
743
768
|
if (tir->index_size == enum_offset ||
|
744
|
-
term_cmp(t, tir->index_terms[enum_offset]) < 0) {
|
745
|
-
return ste_scan_for_term_info(te, t);
|
769
|
+
term_cmp(t, tir->index_terms[enum_offset]) < 0) { /* but before end of block */
|
770
|
+
return ste_scan_for_term_info(te, t); /* no need to seek */
|
746
771
|
}
|
747
772
|
}
|
748
773
|
|
749
|
-
|
774
|
+
/* random-access: must seek */
|
750
775
|
tir_seek_enum(tir, tir_get_index_offset(tir, t));
|
751
776
|
return ste_scan_for_term_info(te, t);
|
752
777
|
}
|
753
778
|
|
754
779
|
Term *tir_get_term(TermInfosReader *tir, int pos)
|
755
780
|
{
|
756
|
-
if (tir->size == 0)
|
781
|
+
if (tir->size == 0) {
|
757
782
|
return NULL;
|
783
|
+
} else {
|
784
|
+
TermEnum *te = tir_enum(tir);
|
785
|
+
SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
|
786
|
+
if (pos >= ste->pos &&
|
787
|
+
pos < (ste->pos + ste->index_interval)) {
|
788
|
+
return ste_scan_for_term(te, pos); /* can avoid seek */
|
789
|
+
}
|
758
790
|
|
759
|
-
|
760
|
-
|
761
|
-
if (pos >= ste->pos &&
|
762
|
-
pos < (ste->pos + ste->index_interval)) {
|
763
|
-
return ste_scan_for_term(te, pos); // can avoid seek
|
791
|
+
tir_seek_enum(tir, (int)(pos / ste->index_interval)); /* must seek */
|
792
|
+
return ste_scan_for_term(te, pos);
|
764
793
|
}
|
765
|
-
|
766
|
-
tir_seek_enum(tir, (int)(pos / ste->index_interval)); // must seek
|
767
|
-
return ste_scan_for_term(te, pos);
|
768
794
|
}
|
769
795
|
|
770
796
|
int tir_get_term_pos(TermInfosReader *tir, Term *t)
|
771
797
|
{
|
772
|
-
if (tir->size == 0)
|
798
|
+
if (tir->size == 0) {
|
773
799
|
return -1;
|
800
|
+
} else {
|
801
|
+
TermEnum *te;
|
802
|
+
int ind_offset;
|
803
|
+
|
804
|
+
tir_ensure_index_is_read(tir);
|
774
805
|
|
775
|
-
|
776
|
-
|
777
|
-
int ind_offset = tir_get_index_offset(tir, t);
|
778
|
-
tir_seek_enum(tir, ind_offset);
|
806
|
+
ind_offset = tir_get_index_offset(tir, t);
|
807
|
+
tir_seek_enum(tir, ind_offset);
|
779
808
|
|
780
|
-
|
781
|
-
|
782
|
-
|
809
|
+
te = tir_enum(tir);
|
810
|
+
while ((tb_term_cmp(te->tb_curr, t) < 0) && (ste_next(te) != NULL)) {
|
811
|
+
}
|
783
812
|
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
-
|
813
|
+
if (tb_term_cmp(te->tb_curr, t) == 0) {
|
814
|
+
return ((SegmentTermEnum *)te->data)->pos;
|
815
|
+
} else {
|
816
|
+
return -1;
|
817
|
+
}
|
818
|
+
}
|
788
819
|
}
|
789
820
|
|