ferret 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +6 -5
- data/Rakefile +34 -13
- data/TODO +1 -0
- data/TUTORIAL +1 -1
- data/ext/analysis.c +87 -70
- data/ext/analysis.h +18 -6
- data/ext/array.c +1 -2
- data/ext/array.h +1 -1
- data/ext/bitvector.c +10 -6
- data/ext/bitvector.h +2 -2
- data/ext/compound_io.c +30 -27
- data/ext/document.c +15 -15
- data/ext/document.h +5 -5
- data/ext/except.c +2 -0
- data/ext/except.h +25 -23
- data/ext/extconf.rb +1 -0
- data/ext/ferret.c +10 -8
- data/ext/ferret.h +9 -8
- data/ext/field.c +29 -25
- data/ext/filter.c +52 -14
- data/ext/frtio.h +13 -0
- data/ext/fs_store.c +115 -170
- data/ext/global.c +9 -8
- data/ext/global.h +17 -13
- data/ext/hash.c +13 -19
- data/ext/hash.h +11 -11
- data/ext/hashset.c +5 -7
- data/ext/hashset.h +9 -8
- data/ext/helper.c +1 -1
- data/ext/helper.h +2 -1
- data/ext/inc/except.h +25 -23
- data/ext/inc/lang.h +11 -1
- data/ext/ind.c +33 -21
- data/ext/index.h +44 -39
- data/ext/index_io.c +61 -57
- data/ext/index_rw.c +418 -361
- data/ext/lang.c +10 -0
- data/ext/lang.h +11 -1
- data/ext/nix_io.c +135 -0
- data/ext/priorityqueue.c +16 -16
- data/ext/priorityqueue.h +9 -6
- data/ext/q_boolean.c +128 -76
- data/ext/q_const_score.c +20 -20
- data/ext/q_filtered_query.c +20 -20
- data/ext/q_fuzzy.c +37 -23
- data/ext/q_match_all.c +15 -19
- data/ext/q_multi_phrase.c +87 -46
- data/ext/q_parser.c +247 -119
- data/ext/q_phrase.c +86 -52
- data/ext/q_prefix.c +25 -14
- data/ext/q_range.c +59 -14
- data/ext/q_span.c +263 -172
- data/ext/q_term.c +62 -51
- data/ext/q_wildcard.c +24 -13
- data/ext/r_analysis.c +328 -80
- data/ext/r_doc.c +11 -6
- data/ext/r_index_io.c +40 -32
- data/ext/r_qparser.c +15 -14
- data/ext/r_search.c +270 -152
- data/ext/r_store.c +32 -17
- data/ext/ram_store.c +38 -22
- data/ext/search.c +617 -87
- data/ext/search.h +227 -163
- data/ext/similarity.c +54 -45
- data/ext/similarity.h +3 -3
- data/ext/sort.c +132 -53
- data/ext/store.c +21 -2
- data/ext/store.h +14 -14
- data/ext/tags +4322 -232
- data/ext/term.c +140 -109
- data/ext/termdocs.c +74 -60
- data/ext/vector.c +181 -152
- data/ext/w32_io.c +150 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
- data/lib/ferret/document/field.rb +1 -1
- data/lib/ferret/index/field_infos.rb +1 -1
- data/lib/ferret/index/term.rb +1 -1
- data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
- data/lib/ferret/search.rb +1 -0
- data/lib/ferret/search/boolean_query.rb +0 -4
- data/lib/ferret/search/index_searcher.rb +21 -8
- data/lib/ferret/search/multi_phrase_query.rb +7 -0
- data/lib/ferret/search/multi_searcher.rb +261 -0
- data/lib/ferret/search/phrase_query.rb +1 -1
- data/lib/ferret/search/query.rb +34 -5
- data/lib/ferret/search/sort.rb +7 -3
- data/lib/ferret/search/sort_field.rb +8 -4
- data/lib/ferret/store/fs_store.rb +13 -6
- data/lib/ferret/store/index_io.rb +0 -14
- data/lib/ferret/store/ram_store.rb +3 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/analysis/ctc_analyzer.rb +131 -0
- data/test/unit/analysis/ctc_tokenstream.rb +98 -9
- data/test/unit/index/tc_index.rb +40 -1
- data/test/unit/index/tc_term.rb +7 -0
- data/test/unit/index/th_doc.rb +8 -0
- data/test/unit/query_parser/tc_query_parser.rb +6 -4
- data/test/unit/search/rtc_sort_field.rb +6 -6
- data/test/unit/search/tc_index_searcher.rb +8 -0
- data/test/unit/search/tc_multi_searcher.rb +275 -0
- data/test/unit/search/tc_multi_searcher2.rb +126 -0
- data/test/unit/search/tc_search_and_sort.rb +66 -0
- metadata +31 -26
- data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/termdocs.c
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#include <index.h>
|
1
|
+
#include "index.h"
|
2
2
|
#include <string.h>
|
3
3
|
|
4
4
|
static char * const TPE_VS_TDE_ERROR_MSG = "TermPosEnum does not handle processing multiple documents in one call. Use TermDocEnum instead.";
|
@@ -59,22 +59,24 @@ bool stde_next(TermDocEnum *tde)
|
|
59
59
|
SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
|
60
60
|
while (true) {
|
61
61
|
|
62
|
-
if (stde->count >= stde->doc_freq)
|
62
|
+
if (stde->count >= stde->doc_freq) {
|
63
63
|
return false;
|
64
|
+
}
|
64
65
|
|
65
|
-
doc_code = is_read_vint(stde->freq_in);
|
66
|
-
stde->doc_num += doc_code >> 1;
|
67
|
-
if ((doc_code & 1) != 0) {
|
68
|
-
stde->freq = 1;
|
66
|
+
doc_code = (int)is_read_vint(stde->freq_in);
|
67
|
+
stde->doc_num += doc_code >> 1; /* shift off low bit */
|
68
|
+
if ((doc_code & 1) != 0) { /* if low bit is set */
|
69
|
+
stde->freq = 1; /* freq is one */
|
69
70
|
} else {
|
70
|
-
stde->freq = is_read_vint(stde->freq_in);
|
71
|
+
stde->freq = (int)is_read_vint(stde->freq_in); /* read freq */
|
71
72
|
}
|
72
73
|
|
73
74
|
stde->count++;
|
74
75
|
|
75
76
|
if (stde->deleted_docs == NULL ||
|
76
|
-
bv_get(stde->deleted_docs, stde->doc_num) == 0)
|
77
|
-
break;
|
77
|
+
bv_get(stde->deleted_docs, stde->doc_num) == 0) {
|
78
|
+
break; /* We found an undeleted doc so return */
|
79
|
+
}
|
78
80
|
|
79
81
|
stde->skip_prox(stde);
|
80
82
|
}
|
@@ -90,41 +92,49 @@ int stde_freq(TermDocEnum *tde)
|
|
90
92
|
bool stde_skip_to(TermDocEnum *tde, int target_doc_num)
|
91
93
|
{
|
92
94
|
SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
|
93
|
-
if (stde->doc_freq >= stde->skip_interval) { // optimized case
|
94
95
|
|
95
|
-
|
96
|
-
|
96
|
+
if (stde->doc_freq >= stde->skip_interval) { /* optimized case */
|
97
|
+
int last_skip_doc;
|
98
|
+
int last_freq_pointer;
|
99
|
+
int last_prox_pointer;
|
100
|
+
int num_skipped;
|
101
|
+
|
102
|
+
if (stde->skip_in == NULL) {
|
103
|
+
stde->skip_in = is_clone(stde->freq_in); /* lazily clone */
|
104
|
+
}
|
97
105
|
|
98
|
-
if (!stde->have_skipped) {
|
106
|
+
if (!stde->have_skipped) { /* lazily seek skip stream */
|
99
107
|
is_seek(stde->skip_in, stde->skip_pointer);
|
100
108
|
stde->have_skipped = true;
|
101
109
|
}
|
102
110
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
111
|
+
/* scan skip data */
|
112
|
+
last_skip_doc = stde->skip_doc;
|
113
|
+
last_freq_pointer = is_pos(stde->freq_in);
|
114
|
+
last_prox_pointer = -1;
|
115
|
+
num_skipped = -1 - (stde->count % stde->skip_interval);
|
108
116
|
|
109
117
|
while (target_doc_num > stde->skip_doc) {
|
110
118
|
last_skip_doc = stde->skip_doc;
|
111
119
|
last_freq_pointer = stde->freq_pointer;
|
112
120
|
last_prox_pointer = stde->prox_pointer;
|
113
121
|
|
114
|
-
if (stde->skip_doc != 0 && stde->skip_doc >= stde->doc_num)
|
122
|
+
if (stde->skip_doc != 0 && stde->skip_doc >= stde->doc_num) {
|
115
123
|
num_skipped += stde->skip_interval;
|
124
|
+
}
|
116
125
|
|
117
|
-
if(stde->skip_count >= stde->num_skips)
|
126
|
+
if(stde->skip_count >= stde->num_skips) {
|
118
127
|
break;
|
128
|
+
}
|
119
129
|
|
120
|
-
stde->skip_doc += is_read_vint(stde->skip_in);
|
121
|
-
stde->freq_pointer += is_read_vint(stde->skip_in);
|
122
|
-
stde->prox_pointer += is_read_vint(stde->skip_in);
|
130
|
+
stde->skip_doc += (int)is_read_vint(stde->skip_in);
|
131
|
+
stde->freq_pointer += (int)is_read_vint(stde->skip_in);
|
132
|
+
stde->prox_pointer += (int)is_read_vint(stde->skip_in);
|
123
133
|
|
124
134
|
stde->skip_count++;
|
125
135
|
}
|
126
136
|
|
127
|
-
|
137
|
+
/* if we found something to skip, so skip it */
|
128
138
|
if (last_freq_pointer > is_pos(stde->freq_in)) {
|
129
139
|
is_seek(stde->freq_in, last_freq_pointer);
|
130
140
|
stde->seek_prox(stde, last_prox_pointer);
|
@@ -134,7 +144,7 @@ bool stde_skip_to(TermDocEnum *tde, int target_doc_num)
|
|
134
144
|
}
|
135
145
|
}
|
136
146
|
|
137
|
-
|
147
|
+
/* done skipping, now just scan */
|
138
148
|
do {
|
139
149
|
if (! tde->next(tde)) {
|
140
150
|
return false;
|
@@ -148,13 +158,14 @@ int stde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
|
|
148
158
|
SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
|
149
159
|
int i = 0, doc_code;
|
150
160
|
while (i < req_num && stde->count < stde->doc_freq) {
|
151
|
-
|
152
|
-
doc_code = is_read_vint(stde->freq_in);
|
153
|
-
stde->doc_num += doc_code >> 1;
|
154
|
-
if ((doc_code & 1) != 0)
|
155
|
-
stde->freq = 1;
|
156
|
-
else
|
157
|
-
stde->freq = is_read_vint(stde->freq_in);
|
161
|
+
/* manually inlined call to next() for speed */
|
162
|
+
doc_code = (int)is_read_vint(stde->freq_in);
|
163
|
+
stde->doc_num += (doc_code >> 1); /* shift off low bit */
|
164
|
+
if ((doc_code & 1) != 0) { /* if low bit is set */
|
165
|
+
stde->freq = 1; /* freq is one */
|
166
|
+
} else {
|
167
|
+
stde->freq = (int)is_read_vint(stde->freq_in); /* else read freq */
|
168
|
+
}
|
158
169
|
|
159
170
|
stde->count++;
|
160
171
|
|
@@ -170,6 +181,7 @@ int stde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
|
|
170
181
|
|
171
182
|
TermDocEnum *stde_create(IndexReader *ir)
|
172
183
|
{
|
184
|
+
SegmentTermDocEnum *stde = ALLOC_AND_ZERO(SegmentTermDocEnum);
|
173
185
|
SegmentReader *sr = (SegmentReader *)ir->data;
|
174
186
|
TermDocEnum *tde = ALLOC(TermDocEnum);
|
175
187
|
tde->seek = &stde_seek;
|
@@ -181,8 +193,6 @@ TermDocEnum *stde_create(IndexReader *ir)
|
|
181
193
|
tde->next_position = NULL;
|
182
194
|
tde->close = &stde_close;
|
183
195
|
|
184
|
-
SegmentTermDocEnum *stde = ALLOC(SegmentTermDocEnum);
|
185
|
-
ZEROSET(stde, SegmentTermDocEnum, 1); // set all values to 0
|
186
196
|
tde->data = stde;
|
187
197
|
stde->parent = sr;
|
188
198
|
stde->freq_in = is_clone(sr->freq_in);
|
@@ -260,11 +270,12 @@ int stpe_next_position(TermDocEnum *tde)
|
|
260
270
|
{
|
261
271
|
SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
|
262
272
|
stde->prox_cnt--;
|
263
|
-
return stde->position += is_read_vint(stde->prox_in);
|
273
|
+
return stde->position += (int)is_read_vint(stde->prox_in);
|
264
274
|
}
|
265
275
|
|
266
276
|
TermDocEnum *stpe_create(IndexReader *ir)
|
267
277
|
{
|
278
|
+
SegmentTermDocEnum *stde;
|
268
279
|
SegmentReader *sr = (SegmentReader *)ir->data;
|
269
280
|
TermDocEnum *tde = stde_create(ir);
|
270
281
|
tde->close = &stpe_close;
|
@@ -273,7 +284,7 @@ TermDocEnum *stpe_create(IndexReader *ir)
|
|
273
284
|
tde->read = &stpe_read;
|
274
285
|
tde->next_position = &stpe_next_position;
|
275
286
|
|
276
|
-
|
287
|
+
stde = (SegmentTermDocEnum *)tde->data;
|
277
288
|
stde->prox_in = is_clone(sr->prox_in);
|
278
289
|
stde->prox_cnt = 0;
|
279
290
|
stde->position = 0;
|
@@ -321,16 +332,18 @@ TermDocEnum *mtde_term_docs_from_reader(IndexReader *ir)
|
|
321
332
|
|
322
333
|
TermDocEnum *mtde_term_docs(MultiTermDocEnum *mtde, int i)
|
323
334
|
{
|
324
|
-
if (mtde->term == NULL)
|
335
|
+
if (mtde->term == NULL) {
|
325
336
|
return NULL;
|
337
|
+
} else {
|
326
338
|
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
339
|
+
TermDocEnum *tde = mtde->irs_tde[i];
|
340
|
+
if (tde == NULL) {
|
341
|
+
tde = mtde->irs_tde[i] = mtde->term_docs_from_reader(mtde->irs[i]);
|
342
|
+
}
|
331
343
|
|
332
|
-
|
333
|
-
|
344
|
+
tde->seek(tde, mtde->term);
|
345
|
+
return tde;
|
346
|
+
}
|
334
347
|
}
|
335
348
|
|
336
349
|
bool mtde_next(TermDocEnum *tde)
|
@@ -411,6 +424,7 @@ int mtde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
|
|
411
424
|
|
412
425
|
TermDocEnum *mtde_create(IndexReader **irs, int *starts, int ir_cnt)
|
413
426
|
{
|
427
|
+
MultiTermDocEnum *mtde = ALLOC_AND_ZERO(MultiTermDocEnum);
|
414
428
|
TermDocEnum *tde = ALLOC(TermDocEnum);
|
415
429
|
tde->close = &mtde_close;
|
416
430
|
tde->seek = &mtde_seek;
|
@@ -421,8 +435,6 @@ TermDocEnum *mtde_create(IndexReader **irs, int *starts, int ir_cnt)
|
|
421
435
|
tde->read = &mtde_read;
|
422
436
|
tde->next_position = NULL;
|
423
437
|
|
424
|
-
MultiTermDocEnum *mtde = ALLOC(MultiTermDocEnum);
|
425
|
-
ZEROSET(mtde, MultiTermDocEnum, 1); // set all values to 0
|
426
438
|
tde->data = mtde;
|
427
439
|
mtde->irs = irs;
|
428
440
|
mtde->starts = starts;
|
@@ -467,8 +479,7 @@ TermDocEnum *mtpe_create(IndexReader **irs, int *starts, int ir_cnt)
|
|
467
479
|
****************************************************************************/
|
468
480
|
|
469
481
|
#define GET_MTDPE MultipleTermDocPosEnum *mtdpe = (MultipleTermDocPosEnum *)self->data
|
470
|
-
void tde_destroy(void *p) {
|
471
|
-
TermDocEnum *self = (TermDocEnum *)p;
|
482
|
+
void tde_destroy(TermDocEnum *self) {
|
472
483
|
self->close(self);
|
473
484
|
}
|
474
485
|
|
@@ -570,12 +581,26 @@ int mtdpe_next_position(TermDocEnum *self)
|
|
570
581
|
|
571
582
|
TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt)
|
572
583
|
{
|
584
|
+
int i;
|
573
585
|
TermDocEnum *self = ALLOC(TermDocEnum);
|
574
|
-
MultipleTermDocPosEnum *mtdpe = ALLOC(MultipleTermDocPosEnum);
|
586
|
+
MultipleTermDocPosEnum *mtdpe = ALLOC_AND_ZERO_N(MultipleTermDocPosEnum, 1);
|
575
587
|
PriorityQueue *pq;
|
576
588
|
TermDocEnum *tpe;
|
577
|
-
int i;
|
578
589
|
|
590
|
+
pq = mtdpe->pq = pq_create(t_cnt, &tdpe_less_than);
|
591
|
+
mtdpe->pos_queue_capa = MTDPE_POS_QUEUE_INIT_CAPA;
|
592
|
+
mtdpe->pos_queue = ALLOC_N(int, MTDPE_POS_QUEUE_INIT_CAPA);
|
593
|
+
for (i = 0; i < t_cnt; i++) {
|
594
|
+
tpe = ir_term_positions_for(ir, terms[i]);
|
595
|
+
if (tpe->next(tpe)) {
|
596
|
+
pq_push(pq, tpe);
|
597
|
+
} else {
|
598
|
+
tpe->close(tpe);
|
599
|
+
}
|
600
|
+
}
|
601
|
+
pq->free_elem = (free_ft)&tde_destroy;
|
602
|
+
|
603
|
+
self->data = mtdpe;
|
579
604
|
self->close = &mtdpe_close;
|
580
605
|
self->seek = &mtdpe_seek;
|
581
606
|
self->next = &mtdpe_next;
|
@@ -585,17 +610,6 @@ TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt)
|
|
585
610
|
self->read = &mtdpe_read;
|
586
611
|
self->next_position = &mtdpe_next_position;
|
587
612
|
|
588
|
-
ZEROSET(mtdpe, MultipleTermDocPosEnum, 1); // set all values to 0
|
589
|
-
self->data = mtdpe;
|
590
|
-
pq = mtdpe->pq = pq_create(t_cnt, &tdpe_less_than);
|
591
|
-
mtdpe->pos_queue_capa = MTDPE_POS_QUEUE_INIT_CAPA;
|
592
|
-
mtdpe->pos_queue = ALLOC_N(int, MTDPE_POS_QUEUE_INIT_CAPA);
|
593
|
-
for (i = 0; i < t_cnt; i++) {
|
594
|
-
tpe = ir_term_positions_for(ir, terms[i]);
|
595
|
-
if (tpe->next(tpe)) pq_push(pq, tpe);
|
596
|
-
}
|
597
|
-
pq->free_elem = &tde_destroy;
|
598
|
-
|
599
613
|
return self;
|
600
614
|
}
|
601
615
|
|
data/ext/vector.c
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
#include <index.h>
|
1
|
+
#include "index.h"
|
2
|
+
#include "helper.h"
|
2
3
|
#include <string.h>
|
3
|
-
#include <helper.h>
|
4
4
|
|
5
5
|
static char * const NULL_POS_ERROR_MSG = "Trying to write positions that are null!";
|
6
6
|
static char * const NULL_OFFSETS_ERROR_MSG = "Trying to write offsets that are null!";
|
@@ -50,33 +50,25 @@ TVTerm *tvt_create(char *text, int freq, int *positions, TVOffsetInfo **offsets)
|
|
50
50
|
|
51
51
|
void tvt_destroy(void *p)
|
52
52
|
{
|
53
|
-
//int i;
|
54
|
-
//TVTerm *tvt = (TVTerm *)p;
|
55
|
-
//free(tvt->text);
|
56
|
-
//free(tvt->positions);
|
57
|
-
//if (tvt->offsets != NULL) {
|
58
|
-
// for (i = 0; i < tvt->freq; i++) {
|
59
|
-
// tvoi_destroy(tvt->offsets[i]);
|
60
|
-
// }
|
61
|
-
// free(tvt->offsets);
|
62
|
-
//}
|
63
53
|
free(p);
|
64
54
|
}
|
65
55
|
|
66
56
|
|
67
57
|
TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis)
|
68
58
|
{
|
59
|
+
char fname[SEGMENT_NAME_MAX_LENGTH];
|
60
|
+
size_t segment_len = strlen(segment);
|
69
61
|
TermVectorsWriter *tvw = ALLOC(TermVectorsWriter);
|
62
|
+
OutStream *os;
|
63
|
+
|
70
64
|
tvw->curr_field = NULL;
|
71
65
|
tvw->curr_doc_pointer = -1;
|
72
66
|
|
73
|
-
|
74
|
-
char fname[SEGMENT_NAME_MAX_LENGTH];
|
75
|
-
int segment_len = strlen(segment);
|
67
|
+
/* Open files for TermVector storage */
|
76
68
|
strcpy(fname, segment);
|
77
69
|
|
78
70
|
strcpy(fname + segment_len, TVX_EXTENSION);
|
79
|
-
|
71
|
+
os = tvw->tvx = store->create_output(store, fname);
|
80
72
|
os_write_int(os, FORMAT_VERSION);
|
81
73
|
|
82
74
|
strcpy(fname + segment_len, TVD_EXTENSION);
|
@@ -104,43 +96,48 @@ void tvw_write_field(TermVectorsWriter *tvw)
|
|
104
96
|
int i, j, start, length;
|
105
97
|
char *last_term_text;
|
106
98
|
TVOffsetInfo *tmp_offset;
|
107
|
-
|
99
|
+
TVTerm **terms = tvw->terms;
|
100
|
+
TVTerm *term;
|
101
|
+
/* remember where this field is written */
|
108
102
|
OutStream *tvf = tvw->tvf;
|
103
|
+
int store_positions = tvw->curr_field->store_positions;
|
104
|
+
int store_offsets = tvw->curr_field->store_offsets;
|
105
|
+
uchar bits = 0x0;
|
106
|
+
|
109
107
|
tvw->curr_field->tvf_pointer = os_pos(tvf);
|
110
108
|
|
111
|
-
|
109
|
+
/* write the number of terms */
|
112
110
|
os_write_vint(tvf, tvw->tcnt);
|
113
111
|
|
114
|
-
|
115
|
-
int store_offsets = tvw->curr_field->store_offsets;
|
116
|
-
int bits = 0x0;
|
117
|
-
if (store_positions)
|
112
|
+
if (store_positions) {
|
118
113
|
bits |= STORE_POSITIONS_WITH_TERMVECTOR;
|
114
|
+
}
|
119
115
|
|
120
|
-
if (store_offsets)
|
116
|
+
if (store_offsets) {
|
121
117
|
bits |= STORE_OFFSET_WITH_TERMVECTOR;
|
118
|
+
}
|
122
119
|
|
123
|
-
os_write_byte(tvf, bits);
|
120
|
+
os_write_byte(tvf, (uchar)bits);
|
124
121
|
|
125
122
|
last_term_text = (char *)EMPTY_STRING;
|
126
|
-
TVTerm **terms = tvw->terms;
|
127
|
-
TVTerm *term;
|
128
123
|
for (i = 0; i < tvw->tcnt; i++) {
|
129
124
|
term = terms[i];
|
130
125
|
start = hlp_string_diff(last_term_text, term->text);
|
131
|
-
length = strlen(term->text) - start;
|
132
|
-
os_write_vint(tvf, start);
|
133
|
-
os_write_vint(tvf, length);
|
134
|
-
os_write_chars(tvf, term->text, start, length);
|
126
|
+
length = (int)strlen(term->text) - start;
|
127
|
+
os_write_vint(tvf, start); /* write shared prefix length */
|
128
|
+
os_write_vint(tvf, length); /* write delta length */
|
129
|
+
os_write_chars(tvf, term->text, start, length); /* write delta chars */
|
135
130
|
os_write_vint(tvf, term->freq);
|
136
131
|
last_term_text = term->text;
|
137
132
|
|
138
133
|
if (store_positions) {
|
139
|
-
if (term->positions == NULL)
|
140
|
-
RAISE(IO_ERROR, NULL_POS_ERROR_MSG);
|
141
|
-
|
142
|
-
// use delta encoding for positions
|
143
134
|
int last_pos = 0;
|
135
|
+
|
136
|
+
if (term->positions == NULL) {
|
137
|
+
RAISE(IO_ERROR, NULL_POS_ERROR_MSG);
|
138
|
+
}
|
139
|
+
|
140
|
+
/* use delta encoding for positions */
|
144
141
|
for (j = 0; j < term->freq; j++) {
|
145
142
|
os_write_vint(tvf, term->positions[j] - last_pos);
|
146
143
|
last_pos = term->positions[j];
|
@@ -148,16 +145,18 @@ void tvw_write_field(TermVectorsWriter *tvw)
|
|
148
145
|
}
|
149
146
|
|
150
147
|
if (store_offsets) {
|
151
|
-
if (term->offsets == NULL)
|
152
|
-
RAISE(IO_ERROR, NULL_OFFSETS_ERROR_MSG);
|
153
|
-
|
154
|
-
// use delta encoding for offsets
|
155
148
|
int last_end = 0;
|
149
|
+
|
150
|
+
if (term->offsets == NULL) {
|
151
|
+
RAISE(IO_ERROR, NULL_OFFSETS_ERROR_MSG);
|
152
|
+
}
|
153
|
+
|
154
|
+
/* use delta encoding for offsets */
|
156
155
|
for (j = 0; j < term->freq; j++) {
|
157
156
|
tmp_offset = term->offsets[j];
|
158
157
|
os_write_vint(tvf, tmp_offset->start - last_end);
|
159
158
|
|
160
|
-
|
159
|
+
/* save the diff between the two */
|
161
160
|
os_write_vint(tvf, tmp_offset->end - tmp_offset->start);
|
162
161
|
last_end = tmp_offset->end;
|
163
162
|
}
|
@@ -169,13 +168,14 @@ void tvw_close_field(TermVectorsWriter *tvw)
|
|
169
168
|
{
|
170
169
|
int i;
|
171
170
|
if (tvw->curr_field != NULL) {
|
172
|
-
|
171
|
+
/* save field and terms */
|
173
172
|
tvw_write_field(tvw);
|
174
173
|
|
175
174
|
if (tvw->fcnt >= tvw->fsize) {
|
176
175
|
tvw->fsize *=2;
|
177
|
-
if (tvw->fsize < FIELD_ARR_START_SIZE)
|
176
|
+
if (tvw->fsize < FIELD_ARR_START_SIZE) {
|
178
177
|
tvw->fsize = FIELD_ARR_START_SIZE;
|
178
|
+
}
|
179
179
|
REALLOC_N(tvw->fields, TVField *, tvw->fsize);
|
180
180
|
}
|
181
181
|
tvw->fields[tvw->fcnt] = tvw->curr_field;
|
@@ -205,30 +205,34 @@ void tvw_open_field(TermVectorsWriter *tvw, char *field)
|
|
205
205
|
|
206
206
|
void tvw_write_doc(TermVectorsWriter *tvw)
|
207
207
|
{
|
208
|
-
|
208
|
+
OutStream *tvd = tvw->tvd;
|
209
|
+
int i;
|
210
|
+
TVField **fields = tvw->fields;
|
211
|
+
int last_field_pointer = 0;
|
212
|
+
|
213
|
+
if (tvw->curr_field != NULL) {
|
209
214
|
RAISE(STATE_ERROR, FIELD_OPEN_ERROR_MSG);
|
215
|
+
}
|
210
216
|
|
211
|
-
//
|
212
|
-
|
217
|
+
//printf("Writing doc pointer: %d\n", tvw->curr_doc_pointer);
|
218
|
+
/* write document index record */
|
213
219
|
os_write_long(tvw->tvx, tvw->curr_doc_pointer);
|
214
220
|
|
215
|
-
|
216
|
-
|
221
|
+
//printf("Writing field count: %ld, %d, %d -> ", (long long)tvw, tvw->fcnt, os_pos(tvd));
|
222
|
+
/* write the number of @fields */
|
217
223
|
os_write_vint(tvd, tvw->fcnt);
|
218
|
-
|
219
|
-
|
220
|
-
int i;
|
221
|
-
TVField **fields = tvw->fields;
|
224
|
+
|
225
|
+
/* write field numbers */
|
222
226
|
for (i = 0; i < tvw->fcnt; i++) {
|
223
227
|
os_write_vint(tvd, fields[i]->number);
|
224
228
|
}
|
225
229
|
|
226
|
-
|
227
|
-
int last_field_pointer = 0;
|
230
|
+
/* write field pointers */
|
228
231
|
for (i = 0; i < tvw->fcnt; i++) {
|
229
232
|
os_write_vint(tvd, fields[i]->tvf_pointer - last_field_pointer);
|
230
233
|
last_field_pointer = fields[i]->tvf_pointer;
|
231
234
|
}
|
235
|
+
//printf("%d\n", os_pos(tvw->tvd));
|
232
236
|
}
|
233
237
|
|
234
238
|
void tvw_close_doc(TermVectorsWriter *tvw)
|
@@ -257,8 +261,10 @@ void tvw_add_term(TermVectorsWriter *tvw,
|
|
257
261
|
{
|
258
262
|
if (tvw->tcnt >= tvw->tsize) {
|
259
263
|
tvw->tsize *= 2;
|
260
|
-
if (tvw->tsize < TERM_ARR_START_SIZE)
|
264
|
+
if (tvw->tsize < TERM_ARR_START_SIZE) {
|
261
265
|
tvw->tsize = TERM_ARR_START_SIZE;
|
266
|
+
}
|
267
|
+
|
262
268
|
REALLOC_N(tvw->terms, TVTerm *, tvw->tsize);
|
263
269
|
}
|
264
270
|
tvw->terms[tvw->tcnt] = tvt_create(text, freq, positions, offsets);
|
@@ -267,31 +273,36 @@ void tvw_add_term(TermVectorsWriter *tvw,
|
|
267
273
|
|
268
274
|
void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors)
|
269
275
|
{
|
270
|
-
tvw_open_doc(tvw);
|
271
|
-
|
272
276
|
int i, j, store_positions, store_offsets;
|
273
277
|
TermVector *tv;
|
278
|
+
|
279
|
+
tvw_open_doc(tvw);
|
280
|
+
|
274
281
|
for (i = 0; i < vectors->size; i++) {
|
275
282
|
tv = vectors->elems[i];
|
276
283
|
|
277
284
|
store_positions = (tv->tcnt > 0 && tv->positions != NULL);
|
278
285
|
store_offsets = (tv->tcnt > 0 && tv->offsets != NULL);
|
279
286
|
|
280
|
-
tvw_create_field(tvw, fis_get_number(tvw->fis, tv->field),
|
287
|
+
tvw_create_field(tvw, (int)fis_get_number(tvw->fis, tv->field),
|
281
288
|
store_positions, store_offsets);
|
282
289
|
|
283
290
|
if (store_positions && store_offsets) {
|
284
|
-
for (j = 0; j < tv->tcnt; j++)
|
291
|
+
for (j = 0; j < tv->tcnt; j++) {
|
285
292
|
tvw_add_term(tvw, tv->terms[j], tv->freqs[j], tv->positions[j], tv->offsets[j]);
|
293
|
+
}
|
286
294
|
} else if (store_positions) {
|
287
|
-
for (j = 0; j < tv->tcnt; j++)
|
295
|
+
for (j = 0; j < tv->tcnt; j++) {
|
288
296
|
tvw_add_term(tvw, tv->terms[j], tv->freqs[j], tv->positions[j], NULL);
|
297
|
+
}
|
289
298
|
} else if (store_offsets) {
|
290
|
-
for (j = 0; j < tv->tcnt; j++)
|
299
|
+
for (j = 0; j < tv->tcnt; j++) {
|
291
300
|
tvw_add_term(tvw, tv->terms[j], tv->freqs[j], NULL, tv->offsets[j]);
|
301
|
+
}
|
292
302
|
} else {
|
293
|
-
for (j = 0; j < tv->tcnt; j++)
|
303
|
+
for (j = 0; j < tv->tcnt; j++) {
|
294
304
|
tvw_add_term(tvw, tv->terms[j], tv->freqs[j], NULL, NULL);
|
305
|
+
}
|
295
306
|
}
|
296
307
|
tvw_close_field(tvw);
|
297
308
|
}
|
@@ -333,10 +344,9 @@ TermVector *tv_create(
|
|
333
344
|
return tv;
|
334
345
|
}
|
335
346
|
|
336
|
-
void tv_destroy(void *p)
|
347
|
+
void tv_destroy(TermVector *tv)
|
337
348
|
{
|
338
349
|
int i, j;
|
339
|
-
TermVector *tv = (TermVector *)p;
|
340
350
|
for (i = 0; i < tv->tcnt; i++) {
|
341
351
|
free(tv->terms[i]);
|
342
352
|
}
|
@@ -357,12 +367,11 @@ void tv_destroy(void *p)
|
|
357
367
|
free(tv->offsets);
|
358
368
|
}
|
359
369
|
free(tv->freqs);
|
360
|
-
free(p);
|
370
|
+
free(tv);
|
361
371
|
}
|
362
372
|
|
363
|
-
void tv_destroy_except_data(void *p)
|
373
|
+
void tv_destroy_except_data(TermVector *tv)
|
364
374
|
{
|
365
|
-
TermVector *tv = (TermVector *)p;
|
366
375
|
free(tv->terms);
|
367
376
|
if (tv->positions != NULL) {
|
368
377
|
free(tv->positions);
|
@@ -371,23 +380,23 @@ void tv_destroy_except_data(void *p)
|
|
371
380
|
free(tv->offsets);
|
372
381
|
}
|
373
382
|
free(tv->freqs);
|
374
|
-
free(p);
|
383
|
+
free(tv);
|
375
384
|
}
|
376
385
|
|
377
386
|
int tvr_check_valid_format(InStream *is)
|
378
387
|
{
|
379
388
|
int format = is_read_int(is);
|
380
389
|
if (format > FORMAT_VERSION)
|
381
|
-
RAISE(
|
390
|
+
RAISE(EXCEPTION, FORMAT_VERSION_ERROR_MSG);
|
382
391
|
return format;
|
383
392
|
}
|
384
393
|
|
385
394
|
TermVectorsReader *tvr_clone(TermVectorsReader *orig)
|
386
395
|
{
|
387
396
|
TermVectorsReader *clone = NULL;
|
397
|
+
clone = ALLOC(TermVectorsReader);
|
398
|
+
memcpy(clone, orig, sizeof(TermVectorsReader));
|
388
399
|
if (orig->tvx && orig->tvd && orig->tvf) {
|
389
|
-
clone = ALLOC(TermVectorsReader);
|
390
|
-
memcpy(clone, orig, sizeof(TermVectorsReader));
|
391
400
|
clone->tvx = is_clone(orig->tvx);
|
392
401
|
clone->tvd = is_clone(orig->tvd);
|
393
402
|
clone->tvf = is_clone(orig->tvf);
|
@@ -400,23 +409,30 @@ TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis)
|
|
400
409
|
TermVectorsReader *tvr = ALLOC(TermVectorsReader);
|
401
410
|
// Open files for TermVector storage
|
402
411
|
char fname[SEGMENT_NAME_MAX_LENGTH];
|
403
|
-
|
412
|
+
size_t segment_len = strlen(segment);
|
413
|
+
InStream *is;
|
414
|
+
|
404
415
|
strcpy(fname, segment);
|
405
416
|
|
406
417
|
strcpy(fname + segment_len, TVX_EXTENSION);
|
407
|
-
|
408
|
-
|
409
|
-
|
418
|
+
if (!store->exists(store, fname)) {
|
419
|
+
tvr->tvx = tvr->tvd = tvr->tvf = NULL;
|
420
|
+
tvr->size = 0;
|
421
|
+
} else {
|
422
|
+
is = tvr->tvx = store->open_input(store, fname);
|
423
|
+
tvr_check_valid_format(is);
|
424
|
+
tvr->size = is_length(is)/8;
|
410
425
|
|
411
|
-
|
412
|
-
|
413
|
-
|
426
|
+
strcpy(fname + segment_len, TVD_EXTENSION);
|
427
|
+
is = tvr->tvd = store->open_input(store, fname);
|
428
|
+
tvr->tvd_format = tvr_check_valid_format(is);
|
414
429
|
|
415
|
-
|
416
|
-
|
417
|
-
|
430
|
+
strcpy(fname + segment_len, TVF_EXTENSION);
|
431
|
+
is = tvr->tvf = store->open_input(store, fname);
|
432
|
+
tvr->tvf_format = tvr_check_valid_format(is);
|
418
433
|
|
419
|
-
|
434
|
+
tvr->fis = fis;
|
435
|
+
}
|
420
436
|
return tvr;
|
421
437
|
}
|
422
438
|
|
@@ -426,9 +442,15 @@ void tvr_close(TermVectorsReader *tvr)
|
|
426
442
|
* exception, everything else will also be closed. */
|
427
443
|
TRY
|
428
444
|
XFINALLY
|
429
|
-
|
430
|
-
|
431
|
-
|
445
|
+
if (tvr->tvx) {
|
446
|
+
is_close(tvr->tvx);
|
447
|
+
}
|
448
|
+
if (tvr->tvd) {
|
449
|
+
is_close(tvr->tvd);
|
450
|
+
}
|
451
|
+
if (tvr->tvf) {
|
452
|
+
is_close(tvr->tvf);
|
453
|
+
}
|
432
454
|
free(tvr);
|
433
455
|
XENDTRY
|
434
456
|
}
|
@@ -436,17 +458,29 @@ void tvr_close(TermVectorsReader *tvr)
|
|
436
458
|
TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
|
437
459
|
char *field, int tvf_pointer)
|
438
460
|
{
|
439
|
-
int i, j, store_positions, store_offsets, bits;
|
440
|
-
|
441
|
-
|
442
|
-
|
461
|
+
int i, j, store_positions, store_offsets, bits, num_terms;
|
462
|
+
char **terms;
|
463
|
+
int *term_freqs;
|
464
|
+
|
465
|
+
/* we may not need these, but declare them */
|
466
|
+
int **positions = NULL;
|
467
|
+
TVOffsetInfo ***offsets = NULL;
|
468
|
+
int start, delta_length, total_length, freq, prev_pos;
|
469
|
+
int start_offset, end_offset, prev_offset;
|
470
|
+
int *pos;
|
471
|
+
TVOffsetInfo **offs;
|
472
|
+
char buffer[MAX_WORD_SIZE] = "";
|
473
|
+
|
474
|
+
/* Now read the data from specified position. We don't need to offset
|
475
|
+
* offset by the FORMAT here since the pointer already includes the offset */
|
443
476
|
is_seek(tvr->tvf, tvf_pointer);
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
if (num_terms == 0)
|
477
|
+
num_terms = (int)is_read_vint(tvr->tvf);
|
478
|
+
|
479
|
+
/* If no terms - return a constant empty termvector. However, this should
|
480
|
+
* never occur! */
|
481
|
+
if (num_terms == 0) {
|
449
482
|
return tv_create(field, NULL, 0, NULL, NULL, NULL);
|
483
|
+
}
|
450
484
|
|
451
485
|
if(tvr->tvf_format == FORMAT_VERSION) {
|
452
486
|
bits = is_read_byte(tvr->tvf);
|
@@ -458,41 +492,34 @@ TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
|
|
458
492
|
store_offsets = false;
|
459
493
|
}
|
460
494
|
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
// we may not need these, but declare them
|
465
|
-
int **positions = NULL;
|
466
|
-
TVOffsetInfo ***offsets = NULL;
|
495
|
+
terms = ALLOC_N(char *, num_terms);
|
496
|
+
term_freqs = ALLOC_N(int, num_terms);
|
467
497
|
|
468
|
-
if(store_positions)
|
498
|
+
if (store_positions) {
|
469
499
|
positions = ALLOC_N(int *, num_terms);
|
500
|
+
}
|
470
501
|
|
471
|
-
if(store_offsets)
|
502
|
+
if (store_offsets) {
|
472
503
|
offsets = ALLOC_N(TVOffsetInfo **, num_terms);
|
504
|
+
}
|
473
505
|
|
474
|
-
int start, delta_length, total_length, freq, prev_pos;
|
475
|
-
int start_offset, end_offset, prev_offset;
|
476
|
-
int *pos;
|
477
|
-
TVOffsetInfo **offs;
|
478
|
-
char buffer[MAX_WORD_SIZE] = "";
|
479
506
|
|
480
507
|
for (i = 0; i < num_terms; i++) {
|
481
|
-
start = is_read_vint(tvr->tvf);
|
482
|
-
delta_length = is_read_vint(tvr->tvf);
|
508
|
+
start = (int)is_read_vint(tvr->tvf);
|
509
|
+
delta_length = (int)is_read_vint(tvr->tvf);
|
483
510
|
total_length = start + delta_length;
|
484
511
|
is_read_chars(tvr->tvf, buffer, start, delta_length);
|
485
512
|
buffer[total_length] = '\0';
|
486
513
|
terms[i] = estrdup(buffer);
|
487
|
-
freq = is_read_vint(tvr->tvf);
|
514
|
+
freq = (int)is_read_vint(tvr->tvf);
|
488
515
|
term_freqs[i] = freq;
|
489
516
|
|
490
|
-
if (store_positions) {
|
517
|
+
if (store_positions) {/* read in the positions */
|
491
518
|
pos = ALLOC_N(int, freq);
|
492
519
|
positions[i] = pos;
|
493
520
|
prev_pos = 0;
|
494
521
|
for (j = 0; j < freq; j++) {
|
495
|
-
pos[j] = prev_pos + is_read_vint(tvr->tvf);
|
522
|
+
pos[j] = prev_pos + (int)is_read_vint(tvr->tvf);
|
496
523
|
prev_pos = pos[j];
|
497
524
|
}
|
498
525
|
}
|
@@ -502,8 +529,8 @@ TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
|
|
502
529
|
offsets[i] = offs;
|
503
530
|
prev_offset = 0;
|
504
531
|
for (j = 0; j < freq; j++) {
|
505
|
-
start_offset = prev_offset + is_read_vint(tvr->tvf);
|
506
|
-
end_offset = start_offset + is_read_vint(tvr->tvf);
|
532
|
+
start_offset = prev_offset + (int)is_read_vint(tvr->tvf);
|
533
|
+
end_offset = start_offset + (int)is_read_vint(tvr->tvf);
|
507
534
|
offs[j] = tvoi_create(start_offset, end_offset);
|
508
535
|
prev_offset = end_offset;
|
509
536
|
}
|
@@ -516,39 +543,41 @@ Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num)
|
|
516
543
|
{
|
517
544
|
int i;
|
518
545
|
Array *tvs = NULL;
|
519
|
-
|
546
|
+
/* Check if no term vectors are available for this segment at all */
|
520
547
|
if (tvr->tvx != NULL) {
|
521
|
-
|
548
|
+
int position, field_count;
|
549
|
+
/* We need to offset by */
|
522
550
|
is_seek(tvr->tvx, (doc_num * 8) + FORMAT_SIZE);
|
523
551
|
|
524
|
-
|
552
|
+
position = (int)is_read_long(tvr->tvx);
|
525
553
|
|
526
554
|
is_seek(tvr->tvd, position);
|
527
|
-
|
555
|
+
field_count = (int)is_read_vint(tvr->tvd);
|
528
556
|
|
529
|
-
|
557
|
+
/* No fields are vectorized for this document */
|
530
558
|
if (field_count > 0) {
|
531
559
|
int number = 0;
|
560
|
+
int position = 0;
|
561
|
+
int *tvf_pointers = ALLOC_N(int, field_count);
|
532
562
|
char **fields = ALLOC_N(char *, field_count);
|
533
563
|
|
534
564
|
for (i = 0; i < field_count; i++) {
|
535
|
-
if (tvr->tvd_format == FORMAT_VERSION)
|
536
|
-
number = is_read_vint(tvr->tvd);
|
537
|
-
else
|
538
|
-
number += is_read_vint(tvr->tvd);
|
565
|
+
if (tvr->tvd_format == FORMAT_VERSION) {
|
566
|
+
number = (int)is_read_vint(tvr->tvd);
|
567
|
+
} else {
|
568
|
+
number += (int)is_read_vint(tvr->tvd);
|
569
|
+
}
|
539
570
|
|
540
571
|
fields[i] = tvr->fis->by_number[number]->name;
|
541
572
|
}
|
542
573
|
|
543
|
-
|
544
|
-
int position = 0;
|
545
|
-
int *tvf_pointers = ALLOC_N(int, field_count);
|
574
|
+
/* Compute position in the tvf file */
|
546
575
|
for (i = 0; i < field_count; i++) {
|
547
|
-
position += is_read_vint(tvr->tvd);
|
576
|
+
position += (int)is_read_vint(tvr->tvd);
|
548
577
|
tvf_pointers[i] = position;
|
549
578
|
}
|
550
579
|
|
551
|
-
tvs = ary_create(field_count, &tv_destroy);
|
580
|
+
tvs = ary_create(field_count, (free_ft)&tv_destroy);
|
552
581
|
for (i = 0; i < field_count; i++) {
|
553
582
|
ary_append(tvs, tvr_read_term_vector(tvr, fields[i], tvf_pointers[i]));
|
554
583
|
}
|
@@ -562,45 +591,45 @@ Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num)
|
|
562
591
|
TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field)
|
563
592
|
{
|
564
593
|
int i;
|
565
|
-
|
566
|
-
int field_number = fis_get_number(tvr->fis, field);
|
594
|
+
/* Check if no term vectors are available for this segment at all */
|
595
|
+
int field_number = (int)fis_get_number(tvr->fis, field);
|
567
596
|
TermVector *tv = NULL;
|
568
597
|
|
569
598
|
if (tvr->tvx != NULL) {
|
570
|
-
|
571
|
-
|
572
|
-
|
599
|
+
int pos, field_count, number = 0, found = -1;
|
600
|
+
/* We need to account for the FORMAT_SIZE at when seeking in the @tvx
|
601
|
+
* We don't need to do this in other seeks because we already have the
|
602
|
+
* file pointer that was written in another file */
|
573
603
|
is_seek(tvr->tvx, (doc_num * 8) + FORMAT_SIZE);
|
574
|
-
//
|
575
|
-
|
604
|
+
//printf("TVX Pointer: %d\n", is_pos(tvr->tvx));
|
605
|
+
pos = (int)is_read_long(tvr->tvx);
|
576
606
|
|
577
607
|
is_seek(tvr->tvd, pos);
|
578
|
-
|
579
|
-
//
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
int number = 0;
|
584
|
-
int found = -1;
|
585
|
-
|
608
|
+
field_count = (int)is_read_vint(tvr->tvd);
|
609
|
+
//printf("Num Fields: %d\n", field_count);
|
610
|
+
/* There are only a few fields per document. We opt for a full scan
|
611
|
+
* rather then requiring that they be ordered. We need to read through
|
612
|
+
* all of the fields anyway to get to the tvf pointers. */
|
586
613
|
for (i = 0; i < field_count; i++) {
|
587
|
-
if (tvr->tvd_format == FORMAT_VERSION)
|
588
|
-
number = is_read_vint(tvr->tvd);
|
589
|
-
else
|
590
|
-
number += is_read_vint(tvr->tvd);
|
614
|
+
if (tvr->tvd_format == FORMAT_VERSION) {
|
615
|
+
number = (int)is_read_vint(tvr->tvd);
|
616
|
+
} else {
|
617
|
+
number += (int)is_read_vint(tvr->tvd);
|
618
|
+
}
|
591
619
|
|
592
|
-
if (number == field_number)
|
620
|
+
if (number == field_number) {
|
593
621
|
found = i;
|
622
|
+
}
|
594
623
|
}
|
595
624
|
|
596
|
-
|
597
|
-
|
625
|
+
/* This field, although valid in the segment, was not found in this
|
626
|
+
* document */
|
598
627
|
if (found != -1) {
|
599
|
-
|
628
|
+
/* Compute pos in the tvf file */
|
600
629
|
pos = 0;
|
601
|
-
for (i = 0; i <= found; i++)
|
602
|
-
pos += is_read_vint(tvr->tvd);
|
603
|
-
|
630
|
+
for (i = 0; i <= found; i++) {
|
631
|
+
pos += (int)is_read_vint(tvr->tvd);
|
632
|
+
}
|
604
633
|
tv = tvr_read_term_vector(tvr, field, pos);
|
605
634
|
}
|
606
635
|
}
|