ferret 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +6 -5
- data/Rakefile +34 -13
- data/TODO +1 -0
- data/TUTORIAL +1 -1
- data/ext/analysis.c +87 -70
- data/ext/analysis.h +18 -6
- data/ext/array.c +1 -2
- data/ext/array.h +1 -1
- data/ext/bitvector.c +10 -6
- data/ext/bitvector.h +2 -2
- data/ext/compound_io.c +30 -27
- data/ext/document.c +15 -15
- data/ext/document.h +5 -5
- data/ext/except.c +2 -0
- data/ext/except.h +25 -23
- data/ext/extconf.rb +1 -0
- data/ext/ferret.c +10 -8
- data/ext/ferret.h +9 -8
- data/ext/field.c +29 -25
- data/ext/filter.c +52 -14
- data/ext/frtio.h +13 -0
- data/ext/fs_store.c +115 -170
- data/ext/global.c +9 -8
- data/ext/global.h +17 -13
- data/ext/hash.c +13 -19
- data/ext/hash.h +11 -11
- data/ext/hashset.c +5 -7
- data/ext/hashset.h +9 -8
- data/ext/helper.c +1 -1
- data/ext/helper.h +2 -1
- data/ext/inc/except.h +25 -23
- data/ext/inc/lang.h +11 -1
- data/ext/ind.c +33 -21
- data/ext/index.h +44 -39
- data/ext/index_io.c +61 -57
- data/ext/index_rw.c +418 -361
- data/ext/lang.c +10 -0
- data/ext/lang.h +11 -1
- data/ext/nix_io.c +135 -0
- data/ext/priorityqueue.c +16 -16
- data/ext/priorityqueue.h +9 -6
- data/ext/q_boolean.c +128 -76
- data/ext/q_const_score.c +20 -20
- data/ext/q_filtered_query.c +20 -20
- data/ext/q_fuzzy.c +37 -23
- data/ext/q_match_all.c +15 -19
- data/ext/q_multi_phrase.c +87 -46
- data/ext/q_parser.c +247 -119
- data/ext/q_phrase.c +86 -52
- data/ext/q_prefix.c +25 -14
- data/ext/q_range.c +59 -14
- data/ext/q_span.c +263 -172
- data/ext/q_term.c +62 -51
- data/ext/q_wildcard.c +24 -13
- data/ext/r_analysis.c +328 -80
- data/ext/r_doc.c +11 -6
- data/ext/r_index_io.c +40 -32
- data/ext/r_qparser.c +15 -14
- data/ext/r_search.c +270 -152
- data/ext/r_store.c +32 -17
- data/ext/ram_store.c +38 -22
- data/ext/search.c +617 -87
- data/ext/search.h +227 -163
- data/ext/similarity.c +54 -45
- data/ext/similarity.h +3 -3
- data/ext/sort.c +132 -53
- data/ext/store.c +21 -2
- data/ext/store.h +14 -14
- data/ext/tags +4322 -232
- data/ext/term.c +140 -109
- data/ext/termdocs.c +74 -60
- data/ext/vector.c +181 -152
- data/ext/w32_io.c +150 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
- data/lib/ferret/document/field.rb +1 -1
- data/lib/ferret/index/field_infos.rb +1 -1
- data/lib/ferret/index/term.rb +1 -1
- data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
- data/lib/ferret/search.rb +1 -0
- data/lib/ferret/search/boolean_query.rb +0 -4
- data/lib/ferret/search/index_searcher.rb +21 -8
- data/lib/ferret/search/multi_phrase_query.rb +7 -0
- data/lib/ferret/search/multi_searcher.rb +261 -0
- data/lib/ferret/search/phrase_query.rb +1 -1
- data/lib/ferret/search/query.rb +34 -5
- data/lib/ferret/search/sort.rb +7 -3
- data/lib/ferret/search/sort_field.rb +8 -4
- data/lib/ferret/store/fs_store.rb +13 -6
- data/lib/ferret/store/index_io.rb +0 -14
- data/lib/ferret/store/ram_store.rb +3 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/analysis/ctc_analyzer.rb +131 -0
- data/test/unit/analysis/ctc_tokenstream.rb +98 -9
- data/test/unit/index/tc_index.rb +40 -1
- data/test/unit/index/tc_term.rb +7 -0
- data/test/unit/index/th_doc.rb +8 -0
- data/test/unit/query_parser/tc_query_parser.rb +6 -4
- data/test/unit/search/rtc_sort_field.rb +6 -6
- data/test/unit/search/tc_index_searcher.rb +8 -0
- data/test/unit/search/tc_multi_searcher.rb +275 -0
- data/test/unit/search/tc_multi_searcher2.rb +126 -0
- data/test/unit/search/tc_search_and_sort.rb +66 -0
- metadata +31 -26
- data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/index_rw.c
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#include
|
1
|
+
#include "index.h"
|
2
2
|
#include <stdlib.h>
|
3
3
|
#include <string.h>
|
4
4
|
#include <array.h>
|
@@ -24,11 +24,11 @@ const char *VECTOR_EXTENSIONS[] = {
|
|
24
24
|
};
|
25
25
|
|
26
26
|
FerretConfig config = {
|
27
|
-
10,
|
28
|
-
10,
|
29
|
-
INT_MAX,
|
30
|
-
10000,
|
31
|
-
128
|
27
|
+
10, /* default merge_factor */
|
28
|
+
10, /* default min_merge_docs */
|
29
|
+
INT_MAX, /* default max_merge_docs */
|
30
|
+
10000, /* default max_field_length */
|
31
|
+
128 /* default term_index_interval */
|
32
32
|
};
|
33
33
|
|
34
34
|
/***************************************************************************
|
@@ -47,33 +47,32 @@ int co_eq(const void *key1, const void *key2)
|
|
47
47
|
return (key1 == key2);
|
48
48
|
}
|
49
49
|
|
50
|
-
void co_destroy(
|
50
|
+
void co_destroy(CacheObject *self)
|
51
51
|
{
|
52
|
-
|
53
|
-
h_rem(
|
54
|
-
|
55
|
-
|
56
|
-
free(co);
|
52
|
+
h_rem(self->ref_tab1, self->ref2, false);
|
53
|
+
h_rem(self->ref_tab2, self->ref1, false);
|
54
|
+
self->destroy(self->obj);
|
55
|
+
free(self);
|
57
56
|
}
|
58
57
|
|
59
58
|
CacheObject *co_create(HshTable *ref_tab1, HshTable *ref_tab2,
|
60
|
-
void *ref1, void *ref2,
|
59
|
+
void *ref1, void *ref2, free_ft destroy, void *obj)
|
61
60
|
{
|
62
|
-
CacheObject *
|
63
|
-
h_set(ref_tab1, ref2,
|
64
|
-
h_set(ref_tab2, ref1,
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
return
|
61
|
+
CacheObject *self = ALLOC(CacheObject);
|
62
|
+
h_set(ref_tab1, ref2, self);
|
63
|
+
h_set(ref_tab2, ref1, self);
|
64
|
+
self->ref_tab1 = ref_tab1;
|
65
|
+
self->ref_tab2 = ref_tab2;
|
66
|
+
self->ref1 = ref1;
|
67
|
+
self->ref2 = ref2;
|
68
|
+
self->destroy = destroy;
|
69
|
+
self->obj = obj;
|
70
|
+
return self;
|
72
71
|
}
|
73
72
|
|
74
73
|
HshTable *co_hsh_create()
|
75
74
|
{
|
76
|
-
return h_new(&co_hash, &co_eq, NULL, &co_destroy);
|
75
|
+
return h_new(&co_hash, &co_eq, (free_ft)NULL, (free_ft)&co_destroy);
|
77
76
|
}
|
78
77
|
|
79
78
|
/***************************************************************************
|
@@ -84,39 +83,38 @@ HshTable *co_hsh_create()
|
|
84
83
|
|
85
84
|
Posting *p_create(Term *term, int position, TVOffsetInfo *offset)
|
86
85
|
{
|
87
|
-
Posting *
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
return
|
86
|
+
Posting *self = ALLOC(Posting);
|
87
|
+
self->freq = 1;
|
88
|
+
self->size = 1;
|
89
|
+
self->term = term;
|
90
|
+
self->positions = ALLOC(int);
|
91
|
+
self->positions[0] = position;
|
92
|
+
self->offsets = ALLOC(TVOffsetInfo *);
|
93
|
+
self->offsets[0] = offset;
|
94
|
+
return self;
|
96
95
|
}
|
97
96
|
|
98
|
-
void p_destroy(
|
97
|
+
void p_destroy(Posting *self)
|
99
98
|
{
|
100
|
-
|
99
|
+
/* the positions and offsets will be put in a TVTerm so no need to free */
|
101
100
|
int i;
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
free(
|
107
|
-
free(p);
|
101
|
+
free(self->positions);
|
102
|
+
for (i = 0; i < self->freq; i++)
|
103
|
+
tvoi_destroy(self->offsets[i]);
|
104
|
+
free(self->offsets);
|
105
|
+
free(self);
|
108
106
|
}
|
109
107
|
|
110
|
-
void p_add_occurance(Posting *
|
108
|
+
void p_add_occurance(Posting *self, int position, TVOffsetInfo *offset)
|
111
109
|
{
|
112
|
-
if (
|
113
|
-
|
114
|
-
REALLOC_N(
|
115
|
-
REALLOC_N(
|
110
|
+
if (self->freq >= self->size) {
|
111
|
+
self->size *= 2;
|
112
|
+
REALLOC_N(self->positions, int, self->size);
|
113
|
+
REALLOC_N(self->offsets, TVOffsetInfo *, self->size);
|
116
114
|
}
|
117
|
-
|
118
|
-
|
119
|
-
|
115
|
+
self->positions[self->freq] = position;
|
116
|
+
self->offsets[self->freq] = offset;
|
117
|
+
self->freq++;
|
120
118
|
}
|
121
119
|
|
122
120
|
inline int p_cmp(const void *const p1, const void *const p2)
|
@@ -137,47 +135,49 @@ DocumentWriter *dw_open(Store *store,
|
|
137
135
|
int max_field_length,
|
138
136
|
int term_index_interval)
|
139
137
|
{
|
140
|
-
DocumentWriter *
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
138
|
+
DocumentWriter *self = ALLOC(DocumentWriter);
|
139
|
+
self->store = store;
|
140
|
+
self->analyzer = analyzer;
|
141
|
+
self->similarity = similarity;
|
142
|
+
self->fis = NULL;
|
143
|
+
self->postingtable = h_new(&term_hash, &term_eq,
|
144
|
+
(free_ft)&term_destroy,
|
145
|
+
(free_ft)&p_destroy);
|
146
|
+
self->max_field_length = max_field_length;
|
147
|
+
self->term_index_interval = term_index_interval;
|
148
|
+
return self;
|
149
149
|
}
|
150
150
|
|
151
|
-
void dw_close(DocumentWriter *
|
151
|
+
void dw_close(DocumentWriter *self)
|
152
152
|
{
|
153
|
-
if (
|
154
|
-
h_destroy(
|
155
|
-
free(
|
153
|
+
if (self->fis) fis_destroy(self->fis);
|
154
|
+
h_destroy(self->postingtable);
|
155
|
+
free(self);
|
156
156
|
}
|
157
157
|
|
158
|
-
void dw_add_position(DocumentWriter *
|
158
|
+
void dw_add_position(DocumentWriter *self, char *field, char *text,
|
159
159
|
int position, TVOffsetInfo *offset)
|
160
160
|
{
|
161
161
|
Term termbuf = {field, text}, *term;
|
162
|
-
Posting *p = (Posting *)h_get(
|
162
|
+
Posting *p = (Posting *)h_get(self->postingtable, &termbuf);
|
163
163
|
|
164
|
-
if (p) {
|
165
|
-
// double the size of posting to make room for more posts.
|
164
|
+
if (p) { /* word seen before */
|
166
165
|
if (p->freq >= p->size) {
|
166
|
+
/* double size of posting to make room for more posts. */
|
167
167
|
p->size <<= 1;
|
168
168
|
REALLOC_N(p->positions, int, p->size);
|
169
169
|
p->offsets = REALLOC_N(p->offsets, TVOffsetInfo *, p->size);
|
170
170
|
}
|
171
|
-
p->positions[p->freq] = position;
|
172
|
-
p->offsets[p->freq] = offset;
|
173
|
-
p->freq++;
|
174
|
-
} else {
|
171
|
+
p->positions[p->freq] = position; /* add new position */
|
172
|
+
p->offsets[p->freq] = offset; /* add new offset */
|
173
|
+
p->freq++; /* update frequency */
|
174
|
+
} else { /* word not seen before */
|
175
175
|
term = term_create(field, text);
|
176
|
-
h_set(
|
176
|
+
h_set(self->postingtable, term, p_create(term, position, offset));
|
177
177
|
}
|
178
178
|
}
|
179
179
|
|
180
|
-
void dw_invert_doc(DocumentWriter *
|
180
|
+
void dw_invert_doc(DocumentWriter *self, Document *doc)
|
181
181
|
{
|
182
182
|
int i;
|
183
183
|
int dfcnt = doc->dfcnt;
|
@@ -191,69 +191,74 @@ void dw_invert_doc(DocumentWriter *dw, Document *doc)
|
|
191
191
|
for (i = 0; i < dfcnt; i++) {
|
192
192
|
field = fields[i];
|
193
193
|
field_name = field->name;
|
194
|
-
fi = ((FieldInfo *)ht_get(
|
194
|
+
fi = ((FieldInfo *)ht_get(self->fis->by_name, field_name));
|
195
195
|
field_number = fi->number;
|
196
196
|
|
197
|
-
length =
|
198
|
-
offset =
|
199
|
-
position =
|
197
|
+
length = self->field_lengths[field_number];
|
198
|
+
offset = self->field_offsets[field_number];
|
199
|
+
position = self->field_positions[field_number];
|
200
200
|
|
201
201
|
if (fi->is_indexed) {
|
202
|
-
if (!field->is_tokenized) {
|
202
|
+
if (!field->is_tokenized) { /* un-tokenized field */
|
203
203
|
text = field->data;
|
204
|
-
slen = strlen(text);
|
204
|
+
slen = (int)strlen(text);
|
205
205
|
if (fi->store_offset) {
|
206
|
-
dw_add_position(
|
206
|
+
dw_add_position(self, field_name, text, position,
|
207
207
|
tvoi_create(offset, offset+slen));
|
208
208
|
} else {
|
209
|
-
dw_add_position(
|
209
|
+
dw_add_position(self, field_name, text, position, NULL);
|
210
210
|
}
|
211
211
|
offset += slen;
|
212
212
|
length++;
|
213
213
|
} else {
|
214
214
|
|
215
|
-
|
216
|
-
stream = a_get_ts(
|
215
|
+
/* Tokenize field and add to posting_table */
|
216
|
+
stream = a_get_ts(self->analyzer, field_name, field->data);
|
217
217
|
|
218
218
|
while ((token = ts_next(stream)) != NULL) {
|
219
219
|
position += (token->pos_inc - 1);
|
220
220
|
|
221
221
|
if (fi->store_offset) {
|
222
|
-
dw_add_position(
|
222
|
+
dw_add_position(self,
|
223
223
|
field_name,
|
224
224
|
token->text,
|
225
225
|
position,
|
226
226
|
tvoi_create(offset + token->start, offset + token->end));
|
227
227
|
position++;
|
228
228
|
} else {
|
229
|
-
dw_add_position(
|
229
|
+
dw_add_position(self, field_name, token->text, position, NULL);
|
230
230
|
position++;
|
231
231
|
}
|
232
232
|
|
233
233
|
length++;
|
234
|
-
|
235
|
-
if (length >
|
234
|
+
/* stop if we reach the max field length */
|
235
|
+
if (length > self->max_field_length) {
|
236
236
|
break;
|
237
|
+
}
|
237
238
|
}
|
238
239
|
|
239
|
-
if (token)
|
240
|
+
if (token) {
|
240
241
|
offset += token->end + 1;
|
242
|
+
}
|
241
243
|
}
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
244
|
+
self->field_lengths[field_number] = length;
|
245
|
+
self->field_offsets[field_number] = offset;
|
246
|
+
self->field_positions[field_number] = position;
|
247
|
+
self->field_boosts[field_number] *= field->boost;
|
246
248
|
}
|
247
249
|
}
|
248
250
|
}
|
249
251
|
|
250
|
-
Posting **dw_sort_posting_table(DocumentWriter *
|
252
|
+
Posting **dw_sort_posting_table(DocumentWriter *self)
|
251
253
|
{
|
252
|
-
HshTable *ht =
|
253
|
-
int i;
|
254
|
-
dw->pcnt = i = ht->used;
|
255
|
-
Posting **postings = ALLOC_N(Posting *, i);
|
254
|
+
HshTable *ht = self->postingtable;
|
256
255
|
HshEntry *he = ht->table;
|
256
|
+
Posting **postings;
|
257
|
+
int i;
|
258
|
+
|
259
|
+
self->pcnt = i = ht->used;
|
260
|
+
postings = ALLOC_N(Posting *, i);
|
261
|
+
|
257
262
|
while (i > 0) {
|
258
263
|
if (he->value != NULL) {
|
259
264
|
i--;
|
@@ -261,16 +266,16 @@ Posting **dw_sort_posting_table(DocumentWriter *dw)
|
|
261
266
|
}
|
262
267
|
he++;
|
263
268
|
}
|
264
|
-
qsort(postings,
|
269
|
+
qsort(postings, self->pcnt, sizeof(Posting *), &p_cmp);
|
265
270
|
return postings;
|
266
271
|
}
|
267
272
|
|
268
|
-
void dw_write_postings(DocumentWriter *
|
273
|
+
void dw_write_postings(DocumentWriter *self, Posting **postings, char *segment)
|
269
274
|
{
|
270
275
|
OutStream * volatile freq_out = NULL, * volatile prox_out = NULL;
|
271
276
|
TermInfosWriter * volatile tiw = NULL;
|
272
277
|
TermVectorsWriter * volatile tvw = NULL;
|
273
|
-
Store *store =
|
278
|
+
Store *store = self->store;
|
274
279
|
TermInfo * volatile ti = NULL;
|
275
280
|
Posting *posting;
|
276
281
|
int i, j, posting_freq, position, last_position;
|
@@ -278,31 +283,31 @@ void dw_write_postings(DocumentWriter *dw, Posting **postings, char *segment)
|
|
278
283
|
strcpy(fname, segment);
|
279
284
|
|
280
285
|
TRY
|
281
|
-
|
286
|
+
/* open files for inverse index storage */
|
282
287
|
sprintf(fname, "%s.frq", segment);
|
283
288
|
freq_out = store->create_output(store, fname);
|
284
289
|
sprintf(fname, "%s.prx", segment);
|
285
290
|
prox_out = store->create_output(store, fname);
|
286
|
-
tiw = tiw_open(store, segment,
|
291
|
+
tiw = tiw_open(store, segment, self->fis, self->term_index_interval);
|
287
292
|
ti = ti_create(0, 0, 0, 0);
|
288
293
|
|
289
|
-
for (i = 0; i <
|
294
|
+
for (i = 0; i < self->pcnt; i++) {
|
290
295
|
posting = postings[i];
|
291
296
|
|
292
|
-
|
297
|
+
/* add an entry to dictionary with pointers to prox and freq_out files */
|
293
298
|
ti_set(ti, 1, os_pos(freq_out), os_pos(prox_out), -1);
|
294
299
|
tiw_add(tiw, posting->term, ti);
|
295
300
|
|
296
|
-
|
301
|
+
/* add an entry to the freq_out file */
|
297
302
|
posting_freq = posting->freq;
|
298
|
-
if (posting_freq == 1) {
|
299
|
-
os_write_vint(freq_out, 1);
|
303
|
+
if (posting_freq == 1) { /* optimize freq=1 */
|
304
|
+
os_write_vint(freq_out, 1); /* set low bit of doc num */
|
300
305
|
} else {
|
301
|
-
os_write_vint(freq_out, 0);
|
302
|
-
os_write_vint(freq_out, posting_freq);
|
306
|
+
os_write_vint(freq_out, 0); /* the doc number */
|
307
|
+
os_write_vint(freq_out, posting_freq); /* frequency in doc */
|
303
308
|
}
|
304
309
|
|
305
|
-
last_position = 0;
|
310
|
+
last_position = 0; /* write positions */
|
306
311
|
|
307
312
|
for (j = 0; j < posting_freq; j++) {
|
308
313
|
position = posting->positions[j];
|
@@ -310,16 +315,16 @@ void dw_write_postings(DocumentWriter *dw, Posting **postings, char *segment)
|
|
310
315
|
last_position = position;
|
311
316
|
}
|
312
317
|
|
313
|
-
|
318
|
+
/* check to see if we switched to a new field */
|
314
319
|
term_field = posting->term->field;
|
315
320
|
if (curr_field != term_field) {
|
316
321
|
FieldInfo *fi;
|
317
|
-
|
322
|
+
/* changing field - see if there is something to save */
|
318
323
|
curr_field = term_field;
|
319
|
-
fi = (FieldInfo *)ht_get(
|
324
|
+
fi = (FieldInfo *)ht_get(self->fis->by_name, curr_field);
|
320
325
|
if (fi->store_tv) {
|
321
326
|
if (tvw == NULL) {
|
322
|
-
tvw = tvw_open(store, segment,
|
327
|
+
tvw = tvw_open(store, segment, self->fis);
|
323
328
|
tvw_open_doc(tvw);
|
324
329
|
}
|
325
330
|
tvw_open_field(tvw, curr_field);
|
@@ -328,7 +333,7 @@ void dw_write_postings(DocumentWriter *dw, Posting **postings, char *segment)
|
|
328
333
|
tvw_close_field(tvw);
|
329
334
|
}
|
330
335
|
}
|
331
|
-
|
336
|
+
/* tvw->curr_field != NULL implies field is still open */
|
332
337
|
if (tvw != NULL && tvw->curr_field != NULL) {
|
333
338
|
tvw_add_term(tvw, posting->term->text, posting_freq, posting->positions, posting->offsets);
|
334
339
|
}
|
@@ -338,8 +343,8 @@ void dw_write_postings(DocumentWriter *dw, Posting **postings, char *segment)
|
|
338
343
|
tvw_close_doc(tvw);
|
339
344
|
tvw_close(tvw);
|
340
345
|
}
|
341
|
-
|
342
|
-
|
346
|
+
/* make an effort to close all streams we can but remember and re-raise
|
347
|
+
* the last exception encountered in this process */
|
343
348
|
if (freq_out) os_close(freq_out);
|
344
349
|
if (prox_out) os_close(prox_out);
|
345
350
|
if (tiw) tiw_close(tiw);
|
@@ -347,24 +352,25 @@ void dw_write_postings(DocumentWriter *dw, Posting **postings, char *segment)
|
|
347
352
|
XENDTRY
|
348
353
|
}
|
349
354
|
|
350
|
-
void dw_write_norms(DocumentWriter *
|
355
|
+
void dw_write_norms(DocumentWriter *self, char *segment)
|
351
356
|
{
|
352
357
|
int i;
|
353
358
|
float norm;
|
354
359
|
OutStream *norms_out;
|
355
360
|
char fname[SEGMENT_NAME_MAX_LENGTH];
|
356
|
-
FieldInfos *fis =
|
361
|
+
FieldInfos *fis = self->fis;
|
357
362
|
FieldInfo *fi;
|
358
363
|
|
359
364
|
for (i = 0; i < fis->fcnt; i++) {
|
360
365
|
fi = fis->by_number[i];
|
361
366
|
|
362
367
|
if (fi->is_indexed && !fi->omit_norms) {
|
363
|
-
norm =
|
368
|
+
norm = self->field_boosts[i] *
|
369
|
+
sim_length_norm(self->similarity, fi->name, self->field_lengths[i]);
|
364
370
|
sprintf(fname, "%s.f%d", segment, i);
|
365
|
-
norms_out =
|
371
|
+
norms_out = self->store->create_output(self->store, fname);
|
366
372
|
TRY
|
367
|
-
os_write_byte(norms_out, sim_encode_norm(
|
373
|
+
os_write_byte(norms_out, sim_encode_norm(self->similarity, norm));
|
368
374
|
XFINALLY
|
369
375
|
os_close(norms_out);
|
370
376
|
XENDTRY
|
@@ -372,49 +378,54 @@ void dw_write_norms(DocumentWriter *dw, char *segment)
|
|
372
378
|
}
|
373
379
|
}
|
374
380
|
|
375
|
-
void dw_add_doc(DocumentWriter *
|
381
|
+
void dw_add_doc(DocumentWriter *self, char *segment, Document *doc)
|
376
382
|
{
|
383
|
+
Posting **postings;
|
384
|
+
FieldsWriter *fw;
|
377
385
|
int i;
|
378
|
-
// write field names
|
379
|
-
dw->fis = fis_create();
|
380
|
-
fis_add_doc(dw->fis, doc);
|
381
|
-
fis_write(dw->fis, dw->store, segment, ".fnm");
|
382
386
|
|
383
|
-
|
384
|
-
|
387
|
+
/* write field names */
|
388
|
+
self->fis = fis_create();
|
389
|
+
fis_add_doc(self->fis, doc);
|
390
|
+
fis_write(self->fis, self->store, segment, ".fnm");
|
391
|
+
|
392
|
+
/* write field values */
|
393
|
+
fw = fw_open(self->store, segment, self->fis);
|
385
394
|
TRY
|
386
395
|
fw_add_doc(fw, doc);
|
387
396
|
XFINALLY
|
388
397
|
fw_close(fw);
|
389
398
|
XENDTRY
|
390
399
|
|
391
|
-
|
392
|
-
h_clear(dw->postingtable); // clear posting_table
|
400
|
+
/* invert doc into posting_table */
|
393
401
|
|
394
|
-
|
395
|
-
dw->field_lengths = ALLOC_AND_ZERO_N(int, dw->fis->fcnt);
|
396
|
-
dw->field_offsets = ALLOC_AND_ZERO_N(int, dw->fis->fcnt);
|
397
|
-
dw->field_positions = ALLOC_AND_ZERO_N(int, dw->fis->fcnt);
|
402
|
+
h_clear(self->postingtable); /* clear posting_table */
|
398
403
|
|
399
|
-
|
400
|
-
|
404
|
+
self->field_boosts = ALLOC_N(float, self->fis->fcnt);
|
405
|
+
self->field_lengths = ALLOC_AND_ZERO_N(int, self->fis->fcnt);
|
406
|
+
self->field_offsets = ALLOC_AND_ZERO_N(int, self->fis->fcnt);
|
407
|
+
self->field_positions = ALLOC_AND_ZERO_N(int, self->fis->fcnt);
|
401
408
|
|
402
|
-
|
409
|
+
for (i = 0; i < self->fis->fcnt; i++) {
|
410
|
+
self->field_boosts[i] = doc->boost;
|
411
|
+
}
|
412
|
+
|
413
|
+
dw_invert_doc(self, doc);
|
403
414
|
|
404
|
-
|
405
|
-
|
415
|
+
/* sort posting_table into an array */
|
416
|
+
postings = dw_sort_posting_table(self);
|
406
417
|
|
407
|
-
|
408
|
-
dw_write_postings(
|
418
|
+
/* write postings */
|
419
|
+
dw_write_postings(self, postings, segment);
|
409
420
|
free(postings);
|
410
421
|
|
411
|
-
|
412
|
-
dw_write_norms(
|
422
|
+
/* write norms of indexed fields */
|
423
|
+
dw_write_norms(self, segment);
|
413
424
|
|
414
|
-
free(
|
415
|
-
free(
|
416
|
-
free(
|
417
|
-
free(
|
425
|
+
free(self->field_boosts);
|
426
|
+
free(self->field_lengths);
|
427
|
+
free(self->field_offsets);
|
428
|
+
free(self->field_positions);
|
418
429
|
}
|
419
430
|
|
420
431
|
/****************************************************************************
|
@@ -432,9 +443,8 @@ SegmentInfo *si_create(char *name, int doc_cnt, Store *store)
|
|
432
443
|
return si;
|
433
444
|
}
|
434
445
|
|
435
|
-
void si_destroy(
|
446
|
+
void si_destroy(SegmentInfo *si)
|
436
447
|
{
|
437
|
-
SegmentInfo *si = (SegmentInfo *)p;
|
438
448
|
free(si->name);
|
439
449
|
free(si);
|
440
450
|
}
|
@@ -501,21 +511,19 @@ SegmentInfos *sis_create()
|
|
501
511
|
return sis;
|
502
512
|
}
|
503
513
|
|
504
|
-
void sis_destroy_not_infos(
|
514
|
+
void sis_destroy_not_infos(SegmentInfos *sis)
|
505
515
|
{
|
506
|
-
SegmentInfos *sis = (SegmentInfos *)p;
|
507
516
|
free(sis->segs);
|
508
|
-
free(
|
517
|
+
free(sis);
|
509
518
|
}
|
510
519
|
|
511
|
-
void sis_destroy(
|
520
|
+
void sis_destroy(SegmentInfos *sis)
|
512
521
|
{
|
513
522
|
int i;
|
514
|
-
SegmentInfos *sis = (SegmentInfos *)p;
|
515
523
|
for (i = 0; i < sis->scnt; i++)
|
516
524
|
si_destroy(sis->segs[i]);
|
517
525
|
free(sis->segs);
|
518
|
-
free(
|
526
|
+
free(sis);
|
519
527
|
}
|
520
528
|
|
521
529
|
void sis_add_si(SegmentInfos *sis, SegmentInfo *si)
|
@@ -533,8 +541,9 @@ void sis_del_at(SegmentInfos *sis, int at)
|
|
533
541
|
int i;
|
534
542
|
si_destroy(sis->segs[at]);
|
535
543
|
sis->scnt--;
|
536
|
-
for (i = at; i < sis->scnt; i++)
|
544
|
+
for (i = at; i < sis->scnt; i++) {
|
537
545
|
sis->segs[i] = sis->segs[i+1];
|
546
|
+
}
|
538
547
|
}
|
539
548
|
|
540
549
|
void sis_del_from_to(SegmentInfos *sis, int from, int to)
|
@@ -561,24 +570,25 @@ void sis_clear(SegmentInfos *sis)
|
|
561
570
|
void sis_read(SegmentInfos *sis, Store *store)
|
562
571
|
{
|
563
572
|
int doc_cnt;
|
573
|
+
int seg_count;
|
574
|
+
int i;
|
564
575
|
char *name;
|
565
576
|
InStream *is = store->open_input(store, SEGMENT_FILENAME);
|
566
577
|
|
567
578
|
TRY
|
568
579
|
|
569
580
|
sis->format = is_read_int(is);
|
570
|
-
if (sis->format < 0) {
|
571
|
-
|
581
|
+
if (sis->format < 0) { /* file contains explicit format info */
|
582
|
+
/* check that it is a format we can understand */
|
572
583
|
if (sis->format < FORMAT)
|
573
|
-
RAISE(
|
574
|
-
sis->version = is_read_long(is);
|
575
|
-
sis->counter = is_read_int(is);
|
576
|
-
} else {
|
584
|
+
RAISE(EXCEPTION, FORMAT_VERSION_ERROR_MSG);
|
585
|
+
sis->version = (uint)is_read_long(is);
|
586
|
+
sis->counter = (int)is_read_int(is);
|
587
|
+
} else { /* file is in old format without explicit format info */
|
577
588
|
sis->counter = sis->format;
|
578
589
|
}
|
579
590
|
|
580
|
-
|
581
|
-
int i;
|
591
|
+
seg_count = is_read_int(is);
|
582
592
|
for (i = 0; i < seg_count; i++) {
|
583
593
|
name = is_read_string(is);
|
584
594
|
doc_cnt = is_read_int(is);
|
@@ -586,11 +596,12 @@ void sis_read(SegmentInfos *sis, Store *store)
|
|
586
596
|
}
|
587
597
|
|
588
598
|
if (sis->format >= 0) {
|
589
|
-
|
590
|
-
if (is_pos(is) >= is_length(is))
|
591
|
-
sis->version = 0;
|
592
|
-
else
|
593
|
-
sis->version = is_read_long(is);
|
599
|
+
/* in old format the version number may be at the end of the file */
|
600
|
+
if (is_pos(is) >= is_length(is)) {
|
601
|
+
sis->version = 0; /* old file format without version number */
|
602
|
+
} else {
|
603
|
+
sis->version = (int)is_read_long(is); /* read version */
|
604
|
+
}
|
594
605
|
}
|
595
606
|
XFINALLY
|
596
607
|
is_close(is);
|
@@ -604,7 +615,7 @@ void sis_write(SegmentInfos *sis, Store *store)
|
|
604
615
|
OutStream *os = store->create_output(store, TEMPORARY_SEGMENT_FILENAME);
|
605
616
|
TRY
|
606
617
|
os_write_int(os, FORMAT);
|
607
|
-
os_write_long(os, ++(sis->version));
|
618
|
+
os_write_long(os, ++(sis->version)); /* every write changes the index */
|
608
619
|
os_write_int(os, sis->counter);
|
609
620
|
os_write_int(os, sis->scnt);
|
610
621
|
for (i = 0; i < sis->scnt; i++) {
|
@@ -617,24 +628,27 @@ void sis_write(SegmentInfos *sis, Store *store)
|
|
617
628
|
os_close(os);
|
618
629
|
XENDTRY
|
619
630
|
|
620
|
-
|
631
|
+
/* install new segment info */
|
621
632
|
store->rename(store, TEMPORARY_SEGMENT_FILENAME, SEGMENT_FILENAME);
|
622
633
|
}
|
623
634
|
|
624
635
|
int sis_read_current_version(Store *store)
|
625
636
|
{
|
626
|
-
|
627
|
-
|
628
|
-
InStream *is = store->open_input(store, SEGMENT_FILENAME);
|
637
|
+
InStream *is;
|
638
|
+
SegmentInfos *sis;
|
629
639
|
int format = 0;
|
630
640
|
int version = 0;
|
631
641
|
|
642
|
+
if (!store->exists(store, SEGMENT_FILENAME))
|
643
|
+
return 0;
|
644
|
+
is = store->open_input(store, SEGMENT_FILENAME);
|
645
|
+
|
632
646
|
TRY
|
633
647
|
format = is_read_int(is);
|
634
648
|
if (format < 0) {
|
635
649
|
if (format < FORMAT)
|
636
|
-
RAISE(
|
637
|
-
version = is_read_long(is);
|
650
|
+
RAISE(EXCEPTION, FORMAT_VERSION_ERROR_MSG);
|
651
|
+
version = (int)is_read_long(is);
|
638
652
|
}
|
639
653
|
XFINALLY
|
640
654
|
is_close(is);
|
@@ -643,11 +657,11 @@ int sis_read_current_version(Store *store)
|
|
643
657
|
if (format < 0)
|
644
658
|
return version;
|
645
659
|
|
646
|
-
|
647
|
-
|
648
|
-
|
660
|
+
/* We cannot be sure about the format of the file.
|
661
|
+
* Therefore we have to read the whole file and cannot simply
|
662
|
+
* seek to the version entry. */
|
649
663
|
|
650
|
-
|
664
|
+
sis = sis_create();
|
651
665
|
sis_read(sis, store);
|
652
666
|
version = sis->version;
|
653
667
|
sis_destroy(sis);
|
@@ -660,8 +674,10 @@ int sis_read_current_version(Store *store)
|
|
660
674
|
*
|
661
675
|
****************************************************************************/
|
662
676
|
|
663
|
-
|
664
|
-
|
677
|
+
/**
|
678
|
+
* Deletes the analyzer by default but leaves the store by default
|
679
|
+
*/
|
680
|
+
IndexWriter *iw_open(Store *store, Analyzer *analyzer, bool create)
|
665
681
|
{
|
666
682
|
IndexWriter *iw = ALLOC(IndexWriter);
|
667
683
|
if (create)
|
@@ -674,15 +690,14 @@ IndexWriter *iw_open(Store *store, Analyzer *analyzer,
|
|
674
690
|
iw->term_index_interval = config.term_index_interval;
|
675
691
|
iw->use_compound_file = true;
|
676
692
|
iw->store = store;
|
677
|
-
|
678
|
-
iw->close_analyzer = close_analyzer;
|
693
|
+
ref(store);
|
679
694
|
iw->analyzer = analyzer;
|
680
695
|
iw->sis = sis_create();
|
681
696
|
iw->similarity = sim_create_default();
|
682
697
|
iw->ram_store = open_ram_store();
|
683
698
|
|
684
699
|
mutex_lock(&store->mutex);
|
685
|
-
|
700
|
+
/* keep the write_lock obtained until the IndexWriter is closed. */
|
686
701
|
iw->write_lock = store->open_lock(store, WRITE_LOCK_NAME);
|
687
702
|
if (!iw->write_lock->obtain(iw->write_lock)) {
|
688
703
|
RAISE(STATE_ERROR, WRITE_LOCK_ERROR_MSG);
|
@@ -695,7 +710,7 @@ IndexWriter *iw_open(Store *store, Analyzer *analyzer,
|
|
695
710
|
RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
|
696
711
|
}
|
697
712
|
TRY
|
698
|
-
|
713
|
+
/* commit the index */
|
699
714
|
store->clear(store);
|
700
715
|
sis_write(iw->sis, store);
|
701
716
|
XFINALLY
|
@@ -714,8 +729,9 @@ const char base36_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz";
|
|
714
729
|
char *new_segment_name(int counter)
|
715
730
|
{
|
716
731
|
char buf[SEGMENT_NAME_MAX_LENGTH];
|
717
|
-
buf[SEGMENT_NAME_MAX_LENGTH - 1] = '\0';
|
718
732
|
int i;
|
733
|
+
|
734
|
+
buf[SEGMENT_NAME_MAX_LENGTH - 1] = '\0';
|
719
735
|
for (i = SEGMENT_NAME_MAX_LENGTH - 2; ; i--) {
|
720
736
|
buf[i] = base36_digitmap[counter%36];
|
721
737
|
counter /= 36;
|
@@ -749,8 +765,8 @@ void delete_files(Array *file_names, Store *store)
|
|
749
765
|
Array *sr_file_names(IndexReader *ir);
|
750
766
|
void iw_delete_segments(IndexWriter *iw, IndexReader **segment_readers, int del_cnt)
|
751
767
|
{
|
752
|
-
|
753
|
-
|
768
|
+
/* The java version keeps a record of files that it couldn't delete. This
|
769
|
+
* shouldn't be a problem on linux I hope. */
|
754
770
|
IndexReader *ir;
|
755
771
|
int i;
|
756
772
|
for (i = 0; i < del_cnt; i++) {
|
@@ -761,22 +777,25 @@ void iw_delete_segments(IndexWriter *iw, IndexReader **segment_readers, int del_
|
|
761
777
|
|
762
778
|
void make_compound_file(IndexWriter *iw, char *merged_name, SegmentMerger *merger)
|
763
779
|
{
|
780
|
+
Array *files_to_delete;
|
781
|
+
Lock *commit_lock;
|
764
782
|
char merged_tmp[SEGMENT_NAME_MAX_LENGTH], merged_cfs[SEGMENT_NAME_MAX_LENGTH];
|
765
783
|
|
766
784
|
mutex_lock(&iw->store->mutex);
|
767
785
|
sprintf(merged_tmp, "%s.tmp", merged_name);
|
768
786
|
sprintf(merged_cfs, "%s.cfs", merged_name);
|
769
787
|
|
770
|
-
|
771
|
-
|
788
|
+
files_to_delete = sm_create_compound_file(merger, merged_tmp);
|
789
|
+
commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
|
772
790
|
|
773
791
|
if (!commit_lock->obtain(commit_lock)) {
|
774
792
|
RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
|
775
793
|
}
|
776
794
|
|
777
|
-
|
795
|
+
/* make compound file visible for SegmentReaders */
|
778
796
|
iw->store->rename(iw->store, merged_tmp, merged_cfs);
|
779
|
-
|
797
|
+
|
798
|
+
/* delete now unused files of segment */
|
780
799
|
delete_files(files_to_delete, iw->store);
|
781
800
|
|
782
801
|
commit_lock->release(commit_lock);
|
@@ -787,7 +806,9 @@ void make_compound_file(IndexWriter *iw, char *merged_name, SegmentMerger *merge
|
|
787
806
|
void iw_merge_segments_with_max(IndexWriter *iw, int min_segment, int max_segment)
|
788
807
|
{
|
789
808
|
int i;
|
790
|
-
|
809
|
+
int merged_doc_count;
|
810
|
+
Lock *commit_lock;
|
811
|
+
IndexReader **segments_to_delete = ALLOC_N(IndexReader *, max_segment - min_segment);
|
791
812
|
int del_cnt = 0;
|
792
813
|
|
793
814
|
char *merged_name = new_segment_name(iw->sis->counter++);
|
@@ -797,31 +818,31 @@ void iw_merge_segments_with_max(IndexWriter *iw, int min_segment, int max_segmen
|
|
797
818
|
|
798
819
|
|
799
820
|
for (i = min_segment; i < max_segment; i++) {
|
800
|
-
reader = sr_open(iw->sis, i, false
|
821
|
+
reader = sr_open(iw->sis, i, false);
|
801
822
|
sm_add(merger, reader);
|
802
|
-
if ((reader->store == iw->store) ||
|
823
|
+
if ((reader->store == iw->store) || /* if we own the directory */
|
803
824
|
(reader->store == iw->ram_store)) {
|
804
|
-
segments_to_delete[del_cnt++] = reader;
|
825
|
+
segments_to_delete[del_cnt++] = reader; /* queue segment for deletion */
|
805
826
|
}
|
806
827
|
}
|
807
828
|
|
808
|
-
|
829
|
+
merged_doc_count = sm_merge(merger);
|
809
830
|
|
810
831
|
sis_del_from_to(iw->sis, min_segment, max_segment);
|
811
832
|
|
812
833
|
sis_add_si(iw->sis, si_create(merged_name, merged_doc_count, iw->store));
|
813
834
|
|
814
|
-
|
835
|
+
/* close readers before we attempt to delete now-obsolete segments */
|
815
836
|
|
816
837
|
mutex_lock(&iw->store->mutex);
|
817
|
-
|
838
|
+
commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
|
818
839
|
if (!commit_lock->obtain(commit_lock)) {
|
819
840
|
RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
|
820
841
|
}
|
821
|
-
|
842
|
+
/* commit the index */
|
822
843
|
sis_write(iw->sis, iw->store);
|
823
844
|
iw_delete_segments(iw, segments_to_delete, del_cnt);
|
824
|
-
|
845
|
+
|
825
846
|
commit_lock->release(commit_lock);
|
826
847
|
iw->store->close_lock(commit_lock);
|
827
848
|
mutex_unlock(&iw->store->mutex);
|
@@ -830,6 +851,7 @@ void iw_merge_segments_with_max(IndexWriter *iw, int min_segment, int max_segmen
|
|
830
851
|
make_compound_file(iw, merged_name, merger);
|
831
852
|
}
|
832
853
|
|
854
|
+
free(segments_to_delete);
|
833
855
|
sm_destroy(merger);
|
834
856
|
}
|
835
857
|
|
@@ -845,23 +867,25 @@ void iw_maybe_merge_segments(IndexWriter *iw)
|
|
845
867
|
SegmentInfo *si;
|
846
868
|
|
847
869
|
while (target_merge_docs <= iw->max_merge_docs) {
|
848
|
-
|
870
|
+
/* find segments smaller than current target size */
|
849
871
|
min_segment = iw->sis->scnt - 1;
|
850
872
|
merge_docs = 0;
|
851
873
|
while (min_segment >= 0) {
|
852
874
|
si = iw->sis->segs[min_segment];
|
853
|
-
if (si->doc_cnt >= target_merge_docs)
|
875
|
+
if (si->doc_cnt >= target_merge_docs) {
|
854
876
|
break;
|
877
|
+
}
|
855
878
|
merge_docs += si->doc_cnt;
|
856
879
|
min_segment -= 1;
|
857
880
|
}
|
858
881
|
|
859
|
-
if (merge_docs >= target_merge_docs)
|
882
|
+
if (merge_docs >= target_merge_docs) { /* found a merge to do */
|
860
883
|
iw_merge_segments(iw, min_segment + 1);
|
861
|
-
else
|
884
|
+
} else {
|
862
885
|
break;
|
886
|
+
}
|
863
887
|
|
864
|
-
target_merge_docs *= iw->merge_factor;
|
888
|
+
target_merge_docs *= iw->merge_factor; /* increase target size */
|
865
889
|
}
|
866
890
|
}
|
867
891
|
|
@@ -883,12 +907,14 @@ void iw_flush_ram_segments(IndexWriter *iw)
|
|
883
907
|
* that wasn't the ram segment. But if it fits in with the merge
|
884
908
|
* factor, why not merge it. Otherwise we leave it and increment min_seg
|
885
909
|
*/
|
886
|
-
if (min_segment < 0 ||
|
887
|
-
(doc_count + segs[min_segment]->doc_cnt) > iw->merge_factor ||
|
888
|
-
(segs[iw->sis->scnt-1]->store != iw->ram_store))
|
910
|
+
if ((min_segment < 0) || /* add one FS segment? */
|
911
|
+
((doc_count + segs[min_segment]->doc_cnt) > iw->merge_factor) ||
|
912
|
+
(segs[iw->sis->scnt - 1]->store != iw->ram_store)) {
|
889
913
|
min_segment++;
|
890
|
-
|
914
|
+
}
|
915
|
+
if (min_segment >= iw->sis->scnt) {
|
891
916
|
return;
|
917
|
+
}
|
892
918
|
iw_merge_segments(iw, min_segment);
|
893
919
|
}
|
894
920
|
|
@@ -937,17 +963,16 @@ void iw_close(IndexWriter *iw)
|
|
937
963
|
{
|
938
964
|
mutex_lock(&iw->mutex);
|
939
965
|
iw_flush_ram_segments(iw);
|
940
|
-
|
966
|
+
store_deref(iw->ram_store);
|
941
967
|
sis_destroy(iw->sis);
|
942
968
|
|
943
969
|
sim_destroy(iw->similarity);
|
944
|
-
|
970
|
+
a_deref(iw->analyzer);
|
945
971
|
|
946
972
|
iw->write_lock->release(iw->write_lock);
|
947
973
|
iw->store->close_lock(iw->write_lock);
|
948
974
|
|
949
|
-
|
950
|
-
store_close(iw->store);
|
975
|
+
store_deref(iw->store);
|
951
976
|
mutex_destroy(&iw->mutex);
|
952
977
|
free(iw);
|
953
978
|
}
|
@@ -957,13 +982,13 @@ void iw_add_indexes(IndexWriter *iw, Store **stores, int cnt)
|
|
957
982
|
int i, j, end, start;
|
958
983
|
|
959
984
|
mutex_lock(&iw->mutex);
|
960
|
-
iw_optimize_internal(iw);
|
985
|
+
iw_optimize_internal(iw); /* start with zero or 1 seg */
|
961
986
|
|
962
987
|
start = iw->sis->scnt;
|
963
988
|
|
964
989
|
for (i = 0; i < cnt; i++) {
|
965
990
|
Store *store = stores[i];
|
966
|
-
SegmentInfos *sis = sis_create();
|
991
|
+
SegmentInfos *sis = sis_create(); /* read infos from dir */
|
967
992
|
sis_read(sis, store);
|
968
993
|
|
969
994
|
for (j = 0; j < sis->scnt; j++) {
|
@@ -973,7 +998,7 @@ void iw_add_indexes(IndexWriter *iw, Store **stores, int cnt)
|
|
973
998
|
sis_destroy_not_infos(sis);
|
974
999
|
}
|
975
1000
|
|
976
|
-
|
1001
|
+
/* merge newly added segments in log(n) passes */
|
977
1002
|
while (iw->sis->scnt > start + iw->merge_factor) {
|
978
1003
|
for (i = start + 1; i < iw->sis->scnt; i++) {
|
979
1004
|
end = MIN(iw->sis->scnt, i + iw->merge_factor);
|
@@ -983,7 +1008,7 @@ void iw_add_indexes(IndexWriter *iw, Store **stores, int cnt)
|
|
983
1008
|
}
|
984
1009
|
}
|
985
1010
|
|
986
|
-
|
1011
|
+
/* final cleanup */
|
987
1012
|
iw_optimize_internal(iw);
|
988
1013
|
mutex_unlock(&iw->mutex);
|
989
1014
|
}
|
@@ -996,16 +1021,20 @@ void iw_add_readers(IndexWriter *iw, IndexReader **irs, int cnt)
|
|
996
1021
|
{
|
997
1022
|
IndexReader *ir = NULL;
|
998
1023
|
int i, del_cnt = 0;
|
999
|
-
|
1024
|
+
int doc_count;
|
1025
|
+
char *merged_name;
|
1026
|
+
SegmentMerger *merger;
|
1027
|
+
Lock *commit_lock;
|
1028
|
+
|
1000
1029
|
mutex_lock(&iw->mutex);
|
1001
|
-
iw_optimize_internal(iw);
|
1030
|
+
iw_optimize_internal(iw); /* start with zero or 1 seg */
|
1002
1031
|
|
1003
|
-
|
1032
|
+
merged_name = new_segment_name(iw->sis->counter++);
|
1004
1033
|
|
1005
|
-
|
1006
|
-
merger->readers->free_elem = NULL;
|
1034
|
+
merger = sm_create(iw->store, merged_name, iw->term_index_interval);
|
1035
|
+
merger->readers->free_elem = NULL; /* don't close readers */
|
1007
1036
|
|
1008
|
-
if (iw->sis->scnt == 1) {
|
1037
|
+
if (iw->sis->scnt == 1) { /* add existing index, if any */
|
1009
1038
|
ir = sr_open_si(iw->sis->segs[0]);
|
1010
1039
|
sm_add(merger, ir);
|
1011
1040
|
del_cnt = 1;
|
@@ -1015,18 +1044,19 @@ void iw_add_readers(IndexWriter *iw, IndexReader **irs, int cnt)
|
|
1015
1044
|
sm_add(merger, irs[i]);
|
1016
1045
|
}
|
1017
1046
|
|
1018
|
-
|
1047
|
+
doc_count = sm_merge(merger); /* merge 'em */
|
1019
1048
|
|
1020
|
-
|
1049
|
+
/* pop old infos and add new ones. */
|
1021
1050
|
sis_clear(iw->sis);
|
1022
1051
|
sis_add_si(iw->sis, si_create(merged_name, doc_count, iw->store));
|
1023
1052
|
|
1024
1053
|
|
1025
|
-
|
1026
|
-
if (!commit_lock->obtain(commit_lock))
|
1054
|
+
commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
|
1055
|
+
if (!commit_lock->obtain(commit_lock)) { /* obtain write lock */
|
1027
1056
|
RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
|
1057
|
+
}
|
1028
1058
|
|
1029
|
-
sis_write(iw->sis, iw->store);
|
1059
|
+
sis_write(iw->sis, iw->store); /* commit changes */
|
1030
1060
|
iw_delete_segments(iw, &ir, del_cnt);
|
1031
1061
|
if (ir) ir_close(ir);
|
1032
1062
|
|
@@ -1059,24 +1089,28 @@ Norm *norm_create(InStream *is, int field_num)
|
|
1059
1089
|
return norm;
|
1060
1090
|
}
|
1061
1091
|
|
1062
|
-
void norm_destroy(
|
1092
|
+
void norm_destroy(Norm *norm)
|
1063
1093
|
{
|
1064
|
-
Norm *norm = (Norm *)p;
|
1065
1094
|
is_close(norm->is);
|
1066
|
-
if (norm->bytes != NULL)
|
1095
|
+
if (norm->bytes != NULL) {
|
1096
|
+
free(norm->bytes);
|
1097
|
+
}
|
1067
1098
|
free(norm);
|
1068
1099
|
}
|
1069
1100
|
|
1070
1101
|
void norm_rewrite(Norm *norm, Store *store, char *segment,
|
1071
1102
|
int doc_count, Store *cfs_store)
|
1072
1103
|
{
|
1073
|
-
|
1074
|
-
return; // These norms do not need to be rewritten
|
1075
|
-
|
1104
|
+
OutStream *os;
|
1076
1105
|
char tmp_fname[SEGMENT_NAME_MAX_LENGTH];
|
1077
1106
|
char norm_fname[SEGMENT_NAME_MAX_LENGTH];
|
1107
|
+
|
1108
|
+
if (norm->bytes == NULL) {
|
1109
|
+
return; /* These norms do not need to be rewritten */
|
1110
|
+
}
|
1111
|
+
|
1078
1112
|
sprintf(tmp_fname, "%s.tmp", segment);
|
1079
|
-
|
1113
|
+
os = store->create_output(store, tmp_fname);
|
1080
1114
|
TRY
|
1081
1115
|
os_write_bytes(os, norm->bytes, doc_count);
|
1082
1116
|
XFINALLY
|
@@ -1097,7 +1131,7 @@ void norm_rewrite(Norm *norm, Store *store, char *segment,
|
|
1097
1131
|
*
|
1098
1132
|
****************************************************************************/
|
1099
1133
|
|
1100
|
-
#define GET_SR SegmentReader *sr = (SegmentReader *)ir->data
|
1134
|
+
#define GET_SR SegmentReader *sr = (SegmentReader *)ir->data
|
1101
1135
|
|
1102
1136
|
int sr_max_doc(IndexReader *ir)
|
1103
1137
|
{
|
@@ -1129,8 +1163,8 @@ void sr_close(IndexReader *ir)
|
|
1129
1163
|
|
1130
1164
|
if (sr->freq_in) is_close(sr->freq_in);
|
1131
1165
|
if (sr->prox_in) is_close(sr->prox_in);
|
1166
|
+
|
1132
1167
|
fis_destroy(sr->fis);
|
1133
|
-
|
1134
1168
|
sr_close_norms(sr);
|
1135
1169
|
|
1136
1170
|
if (sr->orig_tvr) {
|
@@ -1139,7 +1173,7 @@ void sr_close(IndexReader *ir)
|
|
1139
1173
|
ary_destroy(sr->tvr_bucket);
|
1140
1174
|
}
|
1141
1175
|
if (sr->deleted_docs) bv_destroy(sr->deleted_docs);
|
1142
|
-
if (sr->cfs_store)
|
1176
|
+
if (sr->cfs_store) store_deref(sr->cfs_store);
|
1143
1177
|
if (sr->fake_norms) free(sr->fake_norms);
|
1144
1178
|
free(sr->segment);
|
1145
1179
|
free(sr);
|
@@ -1175,8 +1209,8 @@ bool sr_is_deleted(IndexReader *ir, int doc_num)
|
|
1175
1209
|
|
1176
1210
|
bool sr_has_norms(IndexReader *ir, char *field)
|
1177
1211
|
{
|
1178
|
-
bool has_norms;
|
1179
1212
|
GET_SR;
|
1213
|
+
bool has_norms;
|
1180
1214
|
mutex_lock(&ir->mutex);
|
1181
1215
|
has_norms = h_has_key(sr->norms, field);
|
1182
1216
|
mutex_unlock(&ir->mutex);
|
@@ -1215,13 +1249,13 @@ TermEnum *sr_terms_from(IndexReader *ir, Term *term)
|
|
1215
1249
|
|
1216
1250
|
Document *sr_get_doc(IndexReader *ir, int doc_num)
|
1217
1251
|
{
|
1252
|
+
GET_SR;
|
1218
1253
|
Document *doc;
|
1219
1254
|
mutex_lock(&ir->mutex);
|
1220
1255
|
if (sr_is_deleted_internal(ir, doc_num)) {
|
1221
1256
|
mutex_unlock(&ir->mutex);
|
1222
1257
|
RAISE(STATE_ERROR, DELETED_DOC_ERROR_MSG);
|
1223
1258
|
}
|
1224
|
-
GET_SR;
|
1225
1259
|
doc = fr_get_doc(sr->fr, doc_num);
|
1226
1260
|
mutex_unlock(&ir->mutex);
|
1227
1261
|
return doc;
|
@@ -1234,11 +1268,11 @@ sr_get_norms_into_internal(IndexReader *ir, char *field, uchar *buf, int offset)
|
|
1234
1268
|
Norm *norm = h_get(sr->norms, field);
|
1235
1269
|
if (norm == NULL) {
|
1236
1270
|
memset(buf + offset*sizeof(uchar), 0, sr_max_doc(ir)*sizeof(uchar));
|
1237
|
-
} else if (norm->bytes != NULL) {
|
1271
|
+
} else if (norm->bytes != NULL) { /* can copy from cache */
|
1238
1272
|
memcpy(buf + offset*sizeof(uchar), norm->bytes, sr_max_doc(ir)*sizeof(uchar));
|
1239
1273
|
} else {
|
1240
1274
|
InStream *norm_in = is_clone(norm->is);
|
1241
|
-
|
1275
|
+
/* read from disk */
|
1242
1276
|
is_seek(norm_in, 0);
|
1243
1277
|
is_read_bytes(norm_in, buf, offset, sr_max_doc(ir));
|
1244
1278
|
is_close(norm_in);
|
@@ -1256,13 +1290,14 @@ static inline uchar *sr_get_norms_internal(IndexReader *ir, char *field)
|
|
1256
1290
|
{
|
1257
1291
|
GET_SR;
|
1258
1292
|
Norm *norm = h_get(sr->norms, field);
|
1259
|
-
if (norm == NULL)
|
1293
|
+
if (norm == NULL) { /* not an indexed field */
|
1260
1294
|
return NULL;
|
1295
|
+
}
|
1261
1296
|
|
1262
|
-
if (norm->bytes == NULL) {
|
1297
|
+
if (norm->bytes == NULL) { /* value not yet read */
|
1263
1298
|
uchar *bytes = ALLOC_N(uchar, ir->max_doc(ir));
|
1264
1299
|
sr_get_norms_into_internal(ir, field, bytes, 0);
|
1265
|
-
norm->bytes = bytes;
|
1300
|
+
norm->bytes = bytes; /* cache it */
|
1266
1301
|
}
|
1267
1302
|
return norm->bytes;
|
1268
1303
|
}
|
@@ -1278,8 +1313,8 @@ uchar *sr_get_norms(IndexReader *ir, char *field)
|
|
1278
1313
|
|
1279
1314
|
static inline uchar *sr_get_norms_always(IndexReader *ir, char *field)
|
1280
1315
|
{
|
1281
|
-
uchar *bytes;
|
1282
1316
|
GET_SR;
|
1317
|
+
uchar *bytes;
|
1283
1318
|
mutex_lock(&ir->mutex);
|
1284
1319
|
|
1285
1320
|
bytes = sr_get_norms_internal(ir, field);
|
@@ -1303,7 +1338,7 @@ void sr_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
|
|
1303
1338
|
|
1304
1339
|
norm = h_get(sr->norms, field);
|
1305
1340
|
if (norm != NULL) { /* an indexed field */
|
1306
|
-
norm->is_dirty = true;
|
1341
|
+
norm->is_dirty = true; /* mark it dirty */
|
1307
1342
|
sr->norms_dirty = true;
|
1308
1343
|
|
1309
1344
|
sr_get_norms_internal(ir, field)[doc_num] = val;
|
@@ -1318,13 +1353,15 @@ int sr_doc_freq(IndexReader *ir, Term *t)
|
|
1318
1353
|
int df = ti->doc_freq;
|
1319
1354
|
ti_destroy(ti);
|
1320
1355
|
return df;
|
1321
|
-
} else
|
1356
|
+
} else {
|
1357
|
+
return 0;
|
1358
|
+
}
|
1322
1359
|
}
|
1323
1360
|
|
1324
1361
|
Array *sr_file_names(IndexReader *ir)
|
1325
1362
|
{
|
1326
1363
|
GET_SR;
|
1327
|
-
Array *file_names = ary_create(0, &
|
1364
|
+
Array *file_names = ary_create(0, &free);
|
1328
1365
|
FieldInfo *fi;
|
1329
1366
|
int i;
|
1330
1367
|
char fname[SEGMENT_NAME_MAX_LENGTH];
|
@@ -1352,8 +1389,8 @@ Array *sr_file_names(IndexReader *ir)
|
|
1352
1389
|
|
1353
1390
|
HashSet *sr_get_field_names(IndexReader *ir, int field_type)
|
1354
1391
|
{
|
1355
|
-
int i;
|
1356
1392
|
GET_SR;
|
1393
|
+
int i;
|
1357
1394
|
HashSet *field_set = hs_str_create(NULL);
|
1358
1395
|
FieldInfo *fi;
|
1359
1396
|
for (i = 0; i < sr->fis->fcnt; i++) {
|
@@ -1396,9 +1433,10 @@ HashSet *sr_get_field_names(IndexReader *ir, int field_type)
|
|
1396
1433
|
int sr_num_docs(IndexReader *ir)
|
1397
1434
|
{
|
1398
1435
|
GET_SR;
|
1399
|
-
|
1436
|
+
int num_docs;
|
1437
|
+
|
1400
1438
|
mutex_lock(&ir->mutex);
|
1401
|
-
|
1439
|
+
num_docs = sr_max_doc(ir);
|
1402
1440
|
if (sr->deleted_docs != NULL)
|
1403
1441
|
num_docs -= sr->deleted_docs->count;
|
1404
1442
|
mutex_unlock(&ir->mutex);
|
@@ -1444,8 +1482,9 @@ TermVector *sr_get_term_vector(IndexReader *ir, int doc_num, char *field)
|
|
1444
1482
|
FieldInfo *fi = (FieldInfo *)ht_get(sr->fis->by_name, field);
|
1445
1483
|
TermVectorsReader *tvr;
|
1446
1484
|
|
1447
|
-
if (fi == NULL || !fi->store_tv || !sr->orig_tvr || !(tvr = sr_tvr(sr)))
|
1485
|
+
if (fi == NULL || !fi->store_tv || !sr->orig_tvr || !(tvr = sr_tvr(sr))) {
|
1448
1486
|
return NULL;
|
1487
|
+
}
|
1449
1488
|
|
1450
1489
|
return tvr_get_field_tv(tvr, doc_num, field);
|
1451
1490
|
}
|
@@ -1454,8 +1493,9 @@ Array *sr_get_term_vectors(IndexReader *ir, int doc_num)
|
|
1454
1493
|
{
|
1455
1494
|
GET_SR;
|
1456
1495
|
TermVectorsReader *tvr;
|
1457
|
-
if (sr->orig_tvr == NULL || (tvr = sr_tvr(sr)) == NULL)
|
1496
|
+
if (sr->orig_tvr == NULL || (tvr = sr_tvr(sr)) == NULL) {
|
1458
1497
|
return NULL;
|
1498
|
+
}
|
1459
1499
|
|
1460
1500
|
return tvr_get_tv(tvr, doc_num);
|
1461
1501
|
}
|
@@ -1465,16 +1505,17 @@ void sr_commit(IndexReader *ir)
|
|
1465
1505
|
GET_SR;
|
1466
1506
|
char tmp_fname[SEGMENT_NAME_MAX_LENGTH];
|
1467
1507
|
char del_fname[SEGMENT_NAME_MAX_LENGTH];
|
1508
|
+
|
1468
1509
|
sprintf(del_fname, "%s.del", sr->segment);
|
1469
1510
|
|
1470
|
-
if (sr->deleted_docs_dirty) {
|
1511
|
+
if (sr->deleted_docs_dirty) { /* re-write deleted */
|
1471
1512
|
sprintf(tmp_fname, "%s.tmp", sr->segment);
|
1472
1513
|
bv_write(sr->deleted_docs, ir->store, tmp_fname);
|
1473
1514
|
ir->store->rename(ir->store, tmp_fname, del_fname);
|
1474
1515
|
}
|
1475
1516
|
if (sr->undelete_all && ir->store->exists(ir->store, del_fname))
|
1476
1517
|
ir->store->remove(ir->store, del_fname);
|
1477
|
-
if (sr->norms_dirty) {
|
1518
|
+
if (sr->norms_dirty) {/* re-write norms */
|
1478
1519
|
int i;
|
1479
1520
|
FieldInfo *fi;
|
1480
1521
|
for (i = 0; i < sr->fis->fcnt; i++) {
|
@@ -1494,6 +1535,8 @@ IndexReader *sr_open_internal(IndexReader *ir, SegmentInfo *si)
|
|
1494
1535
|
{
|
1495
1536
|
Store *store = si->store;
|
1496
1537
|
SegmentReader *sr = ALLOC(SegmentReader);
|
1538
|
+
char fname[SEGMENT_NAME_MAX_LENGTH];
|
1539
|
+
|
1497
1540
|
ir->get_term_vector = &sr_get_term_vector;
|
1498
1541
|
ir->get_term_vectors = &sr_get_term_vectors;
|
1499
1542
|
ir->num_docs = &sr_num_docs;
|
@@ -1518,7 +1561,6 @@ IndexReader *sr_open_internal(IndexReader *ir, SegmentInfo *si)
|
|
1518
1561
|
ir->do_close = &sr_close;
|
1519
1562
|
ir->data = sr;
|
1520
1563
|
sr->segment = estrdup(si->name);
|
1521
|
-
char fname[SEGMENT_NAME_MAX_LENGTH];
|
1522
1564
|
sr->cfs_store = NULL;
|
1523
1565
|
sr->fake_norms = NULL;
|
1524
1566
|
sprintf(fname, "%s.cfs", sr->segment);
|
@@ -1545,13 +1587,13 @@ IndexReader *sr_open_internal(IndexReader *ir, SegmentInfo *si)
|
|
1545
1587
|
sr->freq_in = store->open_input(store, fname);
|
1546
1588
|
sprintf(fname, "%s.prx", sr->segment);
|
1547
1589
|
sr->prox_in = store->open_input(store, fname);
|
1548
|
-
sr->norms = h_new_str(NULL, &norm_destroy);
|
1590
|
+
sr->norms = h_new_str((free_ft)NULL, (free_ft)&norm_destroy);
|
1549
1591
|
sr_open_norms(ir, store);
|
1550
1592
|
|
1551
1593
|
if (fis_has_vectors(sr->fis)) {
|
1552
1594
|
sr->orig_tvr = tvr_open(store, sr->segment, sr->fis);
|
1553
1595
|
thread_key_create(&sr->thread_tvr, NULL);
|
1554
|
-
sr->tvr_bucket = ary_create(1, (
|
1596
|
+
sr->tvr_bucket = ary_create(1, (free_ft)&tvr_close);
|
1555
1597
|
} else {
|
1556
1598
|
sr->orig_tvr = NULL;
|
1557
1599
|
}
|
@@ -1560,16 +1602,19 @@ IndexReader *sr_open_internal(IndexReader *ir, SegmentInfo *si)
|
|
1560
1602
|
|
1561
1603
|
IndexReader *sr_open_si(SegmentInfo *si)
|
1562
1604
|
{
|
1563
|
-
IndexReader *ir = ir_create(si->store, NULL, false
|
1605
|
+
IndexReader *ir = ir_create(si->store, NULL, false);
|
1606
|
+
ref(si->store);
|
1564
1607
|
return sr_open_internal(ir, si);
|
1565
1608
|
}
|
1566
1609
|
|
1567
|
-
IndexReader *sr_open(SegmentInfos *sis, int si_num,
|
1610
|
+
IndexReader *sr_open(SegmentInfos *sis, int si_num, bool is_owner)
|
1568
1611
|
{
|
1569
1612
|
SegmentInfo *si = sis->segs[si_num];
|
1570
|
-
IndexReader *ir = ir_create(si->store, sis, is_owner
|
1613
|
+
IndexReader *ir = ir_create(si->store, sis, is_owner);
|
1614
|
+
ref(si->store);
|
1571
1615
|
return sr_open_internal(ir, si);
|
1572
1616
|
}
|
1617
|
+
|
1573
1618
|
/****************************************************************************
|
1574
1619
|
*
|
1575
1620
|
* MultiReader
|
@@ -1579,14 +1624,14 @@ IndexReader *sr_open(SegmentInfos *sis, int si_num, int is_owner, int close_stor
|
|
1579
1624
|
#define GET_MR MultiReader *mr = (MultiReader *)ir->data
|
1580
1625
|
#define GET_READER(doc_num) MultiReader *mr = (MultiReader *)ir->data;\
|
1581
1626
|
int i = mr_reader_index(mr, doc_num);\
|
1582
|
-
IndexReader *reader = mr->sub_readers[i]
|
1627
|
+
IndexReader *reader = mr->sub_readers[i]
|
1583
1628
|
|
1584
1629
|
|
1585
1630
|
|
1586
1631
|
int mr_reader_index(MultiReader *mr, int doc_num)
|
1587
1632
|
{
|
1588
|
-
int lo = 0;
|
1589
|
-
int hi = mr->rcnt - 1;
|
1633
|
+
int lo = 0; /* search @starts array */
|
1634
|
+
int hi = mr->rcnt - 1; /* for first element less */
|
1590
1635
|
int mid;
|
1591
1636
|
int mid_value;
|
1592
1637
|
|
@@ -1597,9 +1642,9 @@ int mr_reader_index(MultiReader *mr, int doc_num)
|
|
1597
1642
|
hi = mid - 1;
|
1598
1643
|
} else if (doc_num > mid_value) {
|
1599
1644
|
lo = mid + 1;
|
1600
|
-
} else {
|
1645
|
+
} else { /* found a match */
|
1601
1646
|
while ((mid+1 < mr->rcnt) && (mr->starts[mid+1] == mid_value))
|
1602
|
-
mid += 1;
|
1647
|
+
mid += 1; /* scan to last match in case we have empty segments */
|
1603
1648
|
return mid;
|
1604
1649
|
}
|
1605
1650
|
}
|
@@ -1652,10 +1697,11 @@ Document *mr_get_doc(IndexReader *ir, int doc_num)
|
|
1652
1697
|
void mr_get_norms_into(IndexReader *ir, char *field, uchar *buf, int offset)
|
1653
1698
|
{
|
1654
1699
|
int i;
|
1700
|
+
uchar *bytes;
|
1655
1701
|
GET_MR;
|
1656
1702
|
|
1657
1703
|
mutex_lock(&ir->mutex);
|
1658
|
-
|
1704
|
+
bytes = h_get(mr->norms_cache, field);
|
1659
1705
|
if (bytes != NULL) {
|
1660
1706
|
memcpy(buf + offset, bytes, mr->max_doc);
|
1661
1707
|
} else {
|
@@ -1684,7 +1730,7 @@ uchar *mr_get_norms(IndexReader *ir, char *field)
|
|
1684
1730
|
reader = mr->sub_readers[i];
|
1685
1731
|
reader->get_norms_into(reader, field, bytes, mr->starts[i]);
|
1686
1732
|
}
|
1687
|
-
h_set(mr->norms_cache, field, bytes);
|
1733
|
+
h_set(mr->norms_cache, field, bytes); /* update cache */
|
1688
1734
|
}
|
1689
1735
|
mutex_unlock(&ir->mutex);
|
1690
1736
|
|
@@ -1694,7 +1740,7 @@ uchar *mr_get_norms(IndexReader *ir, char *field)
|
|
1694
1740
|
void mr_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
|
1695
1741
|
{
|
1696
1742
|
GET_READER(doc_num);
|
1697
|
-
h_del(mr->norms_cache, field);
|
1743
|
+
h_del(mr->norms_cache, field); /* clear cache */
|
1698
1744
|
ir_set_norm(reader, doc_num - mr->starts[i], field, val);
|
1699
1745
|
}
|
1700
1746
|
|
@@ -1712,7 +1758,7 @@ TermEnum *mr_terms_from(IndexReader *ir, Term *term)
|
|
1712
1758
|
|
1713
1759
|
int mr_doc_freq(IndexReader *ir, Term *t)
|
1714
1760
|
{
|
1715
|
-
int total = 0, i;
|
1761
|
+
int total = 0, i; /* sum freqs in segments */
|
1716
1762
|
GET_MR;
|
1717
1763
|
|
1718
1764
|
IndexReader *reader;
|
@@ -1738,9 +1784,10 @@ TermDocEnum *mr_term_positions(IndexReader *ir)
|
|
1738
1784
|
void mr_delete_doc(IndexReader *ir, int doc_num)
|
1739
1785
|
{
|
1740
1786
|
GET_READER(doc_num);
|
1741
|
-
mr->num_docs_cache = -1;
|
1787
|
+
mr->num_docs_cache = -1; /* invalidate cache */
|
1742
1788
|
|
1743
|
-
|
1789
|
+
/* dispatch to segment reader */
|
1790
|
+
reader->do_delete_doc(reader, doc_num - mr->starts[i]);
|
1744
1791
|
mr->has_deletions = true;
|
1745
1792
|
}
|
1746
1793
|
|
@@ -1778,8 +1825,9 @@ void mr_undelete_all(IndexReader *ir)
|
|
1778
1825
|
{
|
1779
1826
|
int i;
|
1780
1827
|
GET_MR;
|
1781
|
-
mr->num_docs_cache = -1; // invalidate cache
|
1782
1828
|
IndexReader *reader;
|
1829
|
+
|
1830
|
+
mr->num_docs_cache = -1; /* invalidate cache */
|
1783
1831
|
for (i = 0; i < mr->rcnt; i++) {
|
1784
1832
|
reader = mr->sub_readers[i];
|
1785
1833
|
reader->do_undelete_all(reader);
|
@@ -1829,12 +1877,12 @@ void mr_close(IndexReader *ir)
|
|
1829
1877
|
IndexReader *mr_open(Store *store,
|
1830
1878
|
SegmentInfos *sis,
|
1831
1879
|
IndexReader **sub_readers,
|
1832
|
-
int rcnt
|
1833
|
-
int close_store)
|
1880
|
+
int rcnt)
|
1834
1881
|
{
|
1835
1882
|
int i;
|
1836
1883
|
MultiReader *mr = ALLOC(MultiReader);
|
1837
1884
|
IndexReader *sub_reader;
|
1885
|
+
IndexReader *ir;
|
1838
1886
|
mr->sub_readers = sub_readers;
|
1839
1887
|
mr->rcnt = rcnt;
|
1840
1888
|
|
@@ -1846,15 +1894,16 @@ IndexReader *mr_open(Store *store,
|
|
1846
1894
|
for (i = 0; i < rcnt; i++) {
|
1847
1895
|
sub_reader = sub_readers[i];
|
1848
1896
|
mr->starts[i] = mr->max_doc;
|
1849
|
-
mr->max_doc += sub_reader->max_doc(sub_reader);
|
1897
|
+
mr->max_doc += sub_reader->max_doc(sub_reader); /* compute max_docs */
|
1850
1898
|
|
1851
|
-
if (sub_reader->has_deletions(sub_reader))
|
1899
|
+
if (sub_reader->has_deletions(sub_reader)) {
|
1852
1900
|
mr->has_deletions = true;
|
1901
|
+
}
|
1853
1902
|
}
|
1854
1903
|
mr->starts[rcnt] = mr->max_doc;
|
1855
|
-
mr->norms_cache = h_new_str(NULL, &
|
1904
|
+
mr->norms_cache = h_new_str(NULL, &free);
|
1856
1905
|
|
1857
|
-
|
1906
|
+
ir = ir_create(store, sis, true);
|
1858
1907
|
ir->get_term_vector = &mr_get_term_vector;
|
1859
1908
|
ir->get_term_vectors = &mr_get_term_vectors;
|
1860
1909
|
ir->num_docs = &mr_num_docs;
|
@@ -1888,11 +1937,8 @@ IndexReader *mr_open(Store *store,
|
|
1888
1937
|
*
|
1889
1938
|
****************************************************************************/
|
1890
1939
|
|
1891
|
-
bool smi_lt(
|
1940
|
+
bool smi_lt(SegmentMergeInfo *smi1, SegmentMergeInfo *smi2)
|
1892
1941
|
{
|
1893
|
-
SegmentMergeInfo *smi1 = (SegmentMergeInfo *)p1;
|
1894
|
-
SegmentMergeInfo *smi2 = (SegmentMergeInfo *)p2;
|
1895
|
-
|
1896
1942
|
int cmpres = tb_cmp(smi1->tb, smi2->tb);
|
1897
1943
|
if (cmpres == 0) {
|
1898
1944
|
return smi1->base < smi2->base;
|
@@ -1906,8 +1952,9 @@ int *smi_load_doc_map(SegmentMergeInfo *smi)
|
|
1906
1952
|
IndexReader *ir = smi->ir;
|
1907
1953
|
if (ir->has_deletions(ir) && (smi->doc_map == NULL)) {
|
1908
1954
|
int max_doc = ir->max_doc(ir);
|
1909
|
-
smi->doc_map = ALLOC_N(int, max_doc);
|
1910
1955
|
int j = 0, i;
|
1956
|
+
|
1957
|
+
smi->doc_map = ALLOC_N(int, max_doc);
|
1911
1958
|
for (i = 0; i < max_doc; i++) {
|
1912
1959
|
if (ir->is_deleted(ir, i)) {
|
1913
1960
|
smi->doc_map[i] = -1;
|
@@ -1931,9 +1978,8 @@ SegmentMergeInfo *smi_create(int base, TermEnum *te, IndexReader *ir)
|
|
1931
1978
|
return smi;
|
1932
1979
|
}
|
1933
1980
|
|
1934
|
-
void smi_destroy(
|
1981
|
+
void smi_destroy(SegmentMergeInfo *smi)
|
1935
1982
|
{
|
1936
|
-
SegmentMergeInfo *smi = (SegmentMergeInfo *)p;
|
1937
1983
|
smi->postings->close(smi->postings);
|
1938
1984
|
smi->te->close(smi->te);
|
1939
1985
|
if (smi->doc_map != NULL)
|
@@ -1957,7 +2003,7 @@ SegmentMerger *sm_create(Store *store, char *name, int term_index_interval)
|
|
1957
2003
|
SegmentMerger *sm = ALLOC(SegmentMerger);
|
1958
2004
|
sm->store = store;
|
1959
2005
|
sm->name = estrdup(name);
|
1960
|
-
sm->readers = ary_create(config.merge_factor, &
|
2006
|
+
sm->readers = ary_create(config.merge_factor, (free_ft)&ir_close);
|
1961
2007
|
sm->fis = NULL;
|
1962
2008
|
sm->freq_out = NULL;
|
1963
2009
|
sm->prox_out = NULL;
|
@@ -1976,8 +2022,9 @@ void sm_close(SegmentMerger *sm)
|
|
1976
2022
|
if (sm->freq_out != NULL) os_close(sm->freq_out);
|
1977
2023
|
if (sm->prox_out != NULL) os_close(sm->prox_out);
|
1978
2024
|
if (sm->tiw != NULL) {
|
1979
|
-
for (i = 0; i < sm->terms_buf_size; i++)
|
2025
|
+
for (i = 0; i < sm->terms_buf_size; i++) {
|
1980
2026
|
free(sm->terms_buf[i].text);
|
2027
|
+
}
|
1981
2028
|
free(sm->terms_buf);
|
1982
2029
|
tiw_close(sm->tiw);
|
1983
2030
|
}
|
@@ -1988,9 +2035,8 @@ void sm_close(SegmentMerger *sm)
|
|
1988
2035
|
sm->queue = NULL;
|
1989
2036
|
}
|
1990
2037
|
|
1991
|
-
void sm_destroy(
|
2038
|
+
void sm_destroy(SegmentMerger *sm)
|
1992
2039
|
{
|
1993
|
-
SegmentMerger *sm = (SegmentMerger *)p;
|
1994
2040
|
if (sm->fis != NULL) fis_destroy(sm->fis);
|
1995
2041
|
ary_destroy(sm->readers);
|
1996
2042
|
sm_close(sm);
|
@@ -2028,6 +2074,8 @@ int sm_merge_fields(SegmentMerger *sm)
|
|
2028
2074
|
FieldInfos *fis = sm->fis = fis_create();
|
2029
2075
|
int doc_count = 0;
|
2030
2076
|
Document *doc;
|
2077
|
+
FieldsWriter *fw;
|
2078
|
+
|
2031
2079
|
for (i = 0; i < sm->readers->size; i++) {
|
2032
2080
|
IndexReader *ir = sm->readers->elems[i];
|
2033
2081
|
|
@@ -2049,15 +2097,15 @@ int sm_merge_fields(SegmentMerger *sm)
|
|
2049
2097
|
}
|
2050
2098
|
fis_write(fis, sm->store, sm->name, ".fnm");
|
2051
2099
|
|
2052
|
-
|
2053
|
-
|
2100
|
+
/* merge field values */
|
2101
|
+
fw = fw_open(sm->store, sm->name, fis);
|
2054
2102
|
|
2055
2103
|
TRY
|
2056
2104
|
for (i = 0; i < sm->readers->size; i++) {
|
2057
2105
|
IndexReader *ir = sm->readers->elems[i];
|
2058
2106
|
maxdoc = ir->max_doc(ir);
|
2059
2107
|
for (j = 0; j < maxdoc; j++) {
|
2060
|
-
if (!ir->is_deleted(ir, j)) {
|
2108
|
+
if (!ir->is_deleted(ir, j)) { /* skip deleted docs */
|
2061
2109
|
doc = ir->get_doc(ir, j);
|
2062
2110
|
fw_add_doc(fw, doc);
|
2063
2111
|
doc_destroy(doc);
|
@@ -2098,7 +2146,7 @@ int sm_append_postings(SegmentMerger *sm, SegmentMergeInfo **smis, int cnt)
|
|
2098
2146
|
int i, j;
|
2099
2147
|
int last_doc = 0, base, doc, doc_code, freq, last_position, position;
|
2100
2148
|
int *doc_map = NULL;
|
2101
|
-
int df = 0;
|
2149
|
+
int df = 0; /* number of docs w/ term */
|
2102
2150
|
TermDocEnum *postings;
|
2103
2151
|
SegmentMergeInfo *smi;
|
2104
2152
|
sm_reset_skip(sm);
|
@@ -2111,31 +2159,34 @@ int sm_append_postings(SegmentMerger *sm, SegmentMergeInfo **smis, int cnt)
|
|
2111
2159
|
stde_seek_ti(postings, smi->te->ti_curr);
|
2112
2160
|
while (postings->next(postings)) {
|
2113
2161
|
doc = postings->doc_num(postings);
|
2114
|
-
if (doc_map != NULL)
|
2115
|
-
doc = doc_map[doc];
|
2116
|
-
|
2162
|
+
if (doc_map != NULL) {
|
2163
|
+
doc = doc_map[doc]; /* work around deletions */
|
2164
|
+
}
|
2165
|
+
doc += base; /* convert to merged space */
|
2117
2166
|
|
2118
|
-
if (doc < last_doc)
|
2167
|
+
if (doc < last_doc) {
|
2119
2168
|
RAISE(STATE_ERROR, DOC_ORDER_ERROR_MSG);
|
2169
|
+
}
|
2120
2170
|
|
2121
2171
|
df++;
|
2122
2172
|
|
2123
|
-
if ((df % sm->skip_interval) == 0)
|
2173
|
+
if ((df % sm->skip_interval) == 0) {
|
2124
2174
|
sm_buffer_skip(sm, last_doc);
|
2175
|
+
}
|
2125
2176
|
|
2126
|
-
doc_code = (doc - last_doc) << 1;
|
2177
|
+
doc_code = (doc - last_doc) << 1; /* use low bit to flag freq=1 */
|
2127
2178
|
last_doc = doc;
|
2128
2179
|
|
2129
2180
|
freq = postings->freq(postings);
|
2130
2181
|
if (freq == 1) {
|
2131
|
-
os_write_vint(sm->freq_out, doc_code | 1);
|
2182
|
+
os_write_vint(sm->freq_out, doc_code | 1); /* write doc & freq=1 */
|
2132
2183
|
} else {
|
2133
|
-
os_write_vint(sm->freq_out, doc_code);
|
2134
|
-
os_write_vint(sm->freq_out, freq);
|
2184
|
+
os_write_vint(sm->freq_out, doc_code); /* write doc */
|
2185
|
+
os_write_vint(sm->freq_out, freq); /* write frequency in doc */
|
2135
2186
|
}
|
2136
2187
|
|
2137
2188
|
|
2138
|
-
last_position = 0;
|
2189
|
+
last_position = 0; /* write position deltas */
|
2139
2190
|
for (j = 0; j < freq; j++) {
|
2140
2191
|
position = postings->next_position(postings);
|
2141
2192
|
os_write_vint(sm->prox_out, position - last_position);
|
@@ -2167,12 +2218,12 @@ void sm_merge_term_info(SegmentMerger *sm, SegmentMergeInfo **smis, int cnt)
|
|
2167
2218
|
int freq_pointer = os_pos(sm->freq_out);
|
2168
2219
|
int prox_pointer = os_pos(sm->prox_out);
|
2169
2220
|
|
2170
|
-
int df = sm_append_postings(sm, smis, cnt);
|
2221
|
+
int df = sm_append_postings(sm, smis, cnt); /* append posting data */
|
2171
2222
|
|
2172
2223
|
int skip_pointer = sm_write_skip(sm);
|
2173
2224
|
|
2174
2225
|
if (df > 0) {
|
2175
|
-
|
2226
|
+
/* add an entry to the dictionary with pointers to prox and freq files */
|
2176
2227
|
ti_set(sm->ti, df, freq_pointer, prox_pointer, (skip_pointer - freq_pointer));
|
2177
2228
|
tiw_add(sm->tiw, sm_tb_to_term(sm, smis[0]->tb), sm->ti);
|
2178
2229
|
}
|
@@ -2184,7 +2235,7 @@ void sm_merge_term_infos(SegmentMerger *sm)
|
|
2184
2235
|
int i, match_size;
|
2185
2236
|
IndexReader *ir;
|
2186
2237
|
TermEnum *te;
|
2187
|
-
SegmentMergeInfo *smi, *top;
|
2238
|
+
SegmentMergeInfo *smi, *top, **match;
|
2188
2239
|
TermBuffer *tb;
|
2189
2240
|
|
2190
2241
|
for (i = 0; i < sm->readers->size; i++) {
|
@@ -2192,20 +2243,23 @@ void sm_merge_term_infos(SegmentMerger *sm)
|
|
2192
2243
|
te = ir->terms(ir);
|
2193
2244
|
smi = smi_create(base, te, ir);
|
2194
2245
|
base += ir->num_docs(ir);
|
2195
|
-
if (smi_next(smi) != NULL)
|
2196
|
-
pq_push(sm->queue, smi);
|
2197
|
-
else
|
2246
|
+
if (smi_next(smi) != NULL) {
|
2247
|
+
pq_push(sm->queue, smi); /* initialize @queue */
|
2248
|
+
} else {
|
2198
2249
|
smi_destroy(smi);
|
2250
|
+
}
|
2199
2251
|
}
|
2200
2252
|
|
2201
|
-
|
2253
|
+
match = ALLOC_N(SegmentMergeInfo *, sm->readers->size);
|
2202
2254
|
|
2203
2255
|
while (sm->queue->count > 0) {
|
2204
|
-
|
2205
|
-
|
2206
|
-
|
2207
|
-
|
2208
|
-
|
2256
|
+
/*
|
2257
|
+
for (i = 1; i <= sm->queue->count; i++) {
|
2258
|
+
printf("<{%s:%s}>", ((SegmentMergeInfo *)sm->queue->heap[i])->tb->field,
|
2259
|
+
((SegmentMergeInfo *)sm->queue->heap[i])->tb->text);
|
2260
|
+
}printf("\n\n");
|
2261
|
+
*/
|
2262
|
+
match_size = 0; /* pop matching terms */
|
2209
2263
|
match[match_size] = pq_pop(sm->queue);
|
2210
2264
|
match_size++;
|
2211
2265
|
tb = match[0]->tb;
|
@@ -2216,16 +2270,17 @@ void sm_merge_term_infos(SegmentMerger *sm)
|
|
2216
2270
|
top = pq_top(sm->queue);
|
2217
2271
|
}
|
2218
2272
|
|
2219
|
-
|
2220
|
-
sm_merge_term_info(sm, match, match_size);
|
2273
|
+
/* printf(">%s:%s<\n", match[0]->tb->field, match[0]->tb->text); */
|
2274
|
+
sm_merge_term_info(sm, match, match_size); /* add new TermInfo */
|
2221
2275
|
|
2222
2276
|
while (match_size > 0) {
|
2223
2277
|
match_size--;
|
2224
2278
|
smi = match[match_size];
|
2225
|
-
if (smi_next(smi) != NULL)
|
2226
|
-
pq_push(sm->queue, smi);
|
2227
|
-
else
|
2228
|
-
smi_destroy(smi);
|
2279
|
+
if (smi_next(smi) != NULL) {
|
2280
|
+
pq_push(sm->queue, smi); /* restore queue */
|
2281
|
+
} else {
|
2282
|
+
smi_destroy(smi); /* done with a segment */
|
2283
|
+
}
|
2229
2284
|
}
|
2230
2285
|
}
|
2231
2286
|
free(match);
|
@@ -2242,10 +2297,10 @@ void sm_merge_terms(SegmentMerger *sm)
|
|
2242
2297
|
sprintf(fname, "%s.prx", sm->name);
|
2243
2298
|
sm->prox_out = sm->store->create_output(sm->store, fname);
|
2244
2299
|
sm->tiw = tiw_open(sm->store, sm->name, sm->fis, sm->term_index_interval);
|
2245
|
-
|
2246
|
-
|
2247
|
-
|
2248
|
-
|
2300
|
+
/* terms_buf_pointer holds a buffer of terms since the TermInfosWriter needs
|
2301
|
+
* to keep the last index_interval terms so that it can compare the last term
|
2302
|
+
* put in the index with the next one. So the size of the buffer must be
|
2303
|
+
* index_interval + 2. */
|
2249
2304
|
sm->terms_buf_pointer = 0;
|
2250
2305
|
sm->terms_buf_size = sm->tiw->index_interval + 2;
|
2251
2306
|
sm->terms_buf = ALLOC_N(Term, sm->terms_buf_size);
|
@@ -2254,7 +2309,7 @@ void sm_merge_terms(SegmentMerger *sm)
|
|
2254
2309
|
sm->terms_buf[i].text = ALLOC_N(char, MAX_WORD_SIZE);
|
2255
2310
|
}
|
2256
2311
|
sm->skip_interval = sm->tiw->skip_interval;
|
2257
|
-
sm->queue = pq_create(sm->readers->size, &smi_lt);
|
2312
|
+
sm->queue = pq_create(sm->readers->size, (lt_ft)&smi_lt);
|
2258
2313
|
|
2259
2314
|
sm_merge_term_infos(sm);
|
2260
2315
|
|
@@ -2308,11 +2363,13 @@ void sm_merge_vectors(SegmentMerger *sm)
|
|
2308
2363
|
ir = sm->readers->elems[i];
|
2309
2364
|
max_doc = ir->max_doc(ir);
|
2310
2365
|
for (j = 0; j < max_doc; j++) {
|
2311
|
-
|
2366
|
+
/* skip deleted docs */
|
2312
2367
|
if (! ir->is_deleted(ir, j)) {
|
2313
2368
|
tvs = ir->get_term_vectors(ir, j);
|
2314
|
-
|
2315
|
-
|
2369
|
+
if (tvs) {
|
2370
|
+
tvw_add_all_doc_vectors(tvw, tvs);
|
2371
|
+
ary_destroy(tvs);
|
2372
|
+
}
|
2316
2373
|
}
|
2317
2374
|
}
|
2318
2375
|
}
|
@@ -2333,7 +2390,7 @@ int sm_merge(SegmentMerger *sm)
|
|
2333
2390
|
|
2334
2391
|
Array *sm_create_compound_file(SegmentMerger *sm, char *file_name)
|
2335
2392
|
{
|
2336
|
-
Array *files = ary_create(0, &
|
2393
|
+
Array *files = ary_create(0, &free);
|
2337
2394
|
CompoundWriter *cw = open_cw(sm->store, file_name);
|
2338
2395
|
FieldInfo *fi;
|
2339
2396
|
char fname[SEGMENT_NAME_MAX_LENGTH];
|
@@ -2344,7 +2401,7 @@ Array *sm_create_compound_file(SegmentMerger *sm, char *file_name)
|
|
2344
2401
|
ary_append(files, estrdup(fname));
|
2345
2402
|
}
|
2346
2403
|
|
2347
|
-
|
2404
|
+
/* Field norm files */
|
2348
2405
|
for (i = 0; i < sm->fis->fcnt; i++) {
|
2349
2406
|
fi = sm->fis->by_number[i];
|
2350
2407
|
if (fi->is_indexed && !fi->omit_norms) {
|
@@ -2353,7 +2410,7 @@ Array *sm_create_compound_file(SegmentMerger *sm, char *file_name)
|
|
2353
2410
|
}
|
2354
2411
|
}
|
2355
2412
|
|
2356
|
-
|
2413
|
+
/* Vector files */
|
2357
2414
|
if (fis_has_vectors(sm->fis)) {
|
2358
2415
|
for (i = 0; i < NELEMS(VECTOR_EXTENSIONS); i++) {
|
2359
2416
|
sprintf(fname, "%s.%s", sm->name, VECTOR_EXTENSIONS[i]);
|
@@ -2361,12 +2418,12 @@ Array *sm_create_compound_file(SegmentMerger *sm, char *file_name)
|
|
2361
2418
|
}
|
2362
2419
|
}
|
2363
2420
|
|
2364
|
-
|
2421
|
+
/* Now merge all added files */
|
2365
2422
|
for (i = 0; i < files->size; i++) {
|
2366
2423
|
cw_add_file(cw, (char *)files->elems[i]);
|
2367
2424
|
}
|
2368
2425
|
|
2369
|
-
|
2426
|
+
/* Perform the merge */
|
2370
2427
|
cw_close(cw);
|
2371
2428
|
|
2372
2429
|
return files;
|
@@ -2386,11 +2443,11 @@ void ir_acquire_write_lock(IndexReader *ir)
|
|
2386
2443
|
|
2387
2444
|
if (ir->write_lock == NULL) {
|
2388
2445
|
ir->write_lock = ir->store->open_lock(ir->store, WRITE_LOCK_NAME);
|
2389
|
-
if (!ir->write_lock->obtain(ir->write_lock))
|
2446
|
+
if (!ir->write_lock->obtain(ir->write_lock)) /* obtain write lock */
|
2390
2447
|
RAISE(STATE_ERROR, WRITE_LOCK_ERROR_MSG);
|
2391
2448
|
|
2392
|
-
|
2393
|
-
|
2449
|
+
/* we have to check whether index has changed since this reader was opened.
|
2450
|
+
* if so, this reader is no longer valid for deletion */
|
2394
2451
|
if (sis_read_current_version(ir->store) > ir->sis->version) {
|
2395
2452
|
ir->is_stale = true;
|
2396
2453
|
ir->write_lock->release(ir->write_lock);
|
@@ -2401,7 +2458,7 @@ void ir_acquire_write_lock(IndexReader *ir)
|
|
2401
2458
|
}
|
2402
2459
|
}
|
2403
2460
|
|
2404
|
-
IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner
|
2461
|
+
IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner)
|
2405
2462
|
{
|
2406
2463
|
IndexReader *ir = ALLOC(IndexReader);
|
2407
2464
|
|
@@ -2414,7 +2471,6 @@ IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner, int close_
|
|
2414
2471
|
}
|
2415
2472
|
|
2416
2473
|
ir->store = store;
|
2417
|
-
ir->close_store = close_store;
|
2418
2474
|
ir->sis = sis;
|
2419
2475
|
ir->has_changes = false;
|
2420
2476
|
ir->is_stale = false;
|
@@ -2424,7 +2480,11 @@ IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner, int close_
|
|
2424
2480
|
return ir;
|
2425
2481
|
}
|
2426
2482
|
|
2427
|
-
|
2483
|
+
/**
|
2484
|
+
* Will keep a reference to the store. To let this method delete the store
|
2485
|
+
* make sure you deref the store that you pass to it
|
2486
|
+
*/
|
2487
|
+
IndexReader *ir_open(Store *store)
|
2428
2488
|
{
|
2429
2489
|
int i;
|
2430
2490
|
IndexReader *ir;
|
@@ -2434,13 +2494,14 @@ IndexReader *ir_open(Store *store, int close_store)
|
|
2434
2494
|
sis = sis_create();
|
2435
2495
|
sis_read(sis, store);
|
2436
2496
|
if (sis->scnt == 1) {
|
2437
|
-
ir = sr_open(sis, 0, true
|
2497
|
+
ir = sr_open(sis, 0, true);
|
2438
2498
|
} else {
|
2439
2499
|
IndexReader **readers = ALLOC_N(IndexReader *, sis->scnt);
|
2440
2500
|
for (i = 0; i < sis->scnt; i++) {
|
2441
|
-
readers[i] = sr_open(sis, i, false
|
2501
|
+
readers[i] = sr_open(sis, i, false);
|
2442
2502
|
}
|
2443
|
-
|
2503
|
+
ref(store);
|
2504
|
+
ir = mr_open(store, sis, readers, sis->scnt);
|
2444
2505
|
}
|
2445
2506
|
mutex_unlock(&store->mutex);
|
2446
2507
|
return ir;
|
@@ -2481,11 +2542,13 @@ void ir_delete_doc(IndexReader *ir, int doc_num)
|
|
2481
2542
|
Document *ir_get_doc_with_term(IndexReader *ir, Term *term)
|
2482
2543
|
{
|
2483
2544
|
TermDocEnum *tde = ir_term_docs_for(ir, term);
|
2545
|
+
Document *doc = NULL;
|
2546
|
+
|
2484
2547
|
if (!tde) return NULL;
|
2485
2548
|
|
2486
|
-
|
2487
|
-
if (tde->next(tde))
|
2549
|
+
if (tde->next(tde)) {
|
2488
2550
|
doc = ir->get_doc(ir, tde->doc_num(tde));
|
2551
|
+
}
|
2489
2552
|
tde->close(tde);
|
2490
2553
|
return doc;
|
2491
2554
|
}
|
@@ -2508,11 +2571,13 @@ void ir_commit_internal(IndexReader *ir)
|
|
2508
2571
|
{
|
2509
2572
|
if (ir->has_changes) {
|
2510
2573
|
if (ir->is_owner) {
|
2574
|
+
Lock *commit_lock;
|
2511
2575
|
|
2512
2576
|
mutex_lock(&ir->store->mutex);
|
2513
|
-
|
2514
|
-
if (!commit_lock->obtain(commit_lock))
|
2577
|
+
commit_lock = ir->store->open_lock(ir->store, COMMIT_LOCK_NAME);
|
2578
|
+
if (!commit_lock->obtain(commit_lock)) { /* obtain commit lock */
|
2515
2579
|
RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
|
2580
|
+
}
|
2516
2581
|
|
2517
2582
|
ir->do_commit(ir);
|
2518
2583
|
sis_write(ir->sis, ir->store);
|
@@ -2522,7 +2587,7 @@ void ir_commit_internal(IndexReader *ir)
|
|
2522
2587
|
mutex_unlock(&ir->store->mutex);
|
2523
2588
|
|
2524
2589
|
if (ir->write_lock != NULL) {
|
2525
|
-
ir->write_lock->release(ir->write_lock);
|
2590
|
+
ir->write_lock->release(ir->write_lock); /* release write lock */
|
2526
2591
|
ir->store->close_lock(ir->write_lock);
|
2527
2592
|
ir->write_lock = NULL;
|
2528
2593
|
}
|
@@ -2545,9 +2610,7 @@ void ir_close(IndexReader *ir)
|
|
2545
2610
|
mutex_lock(&ir->mutex);
|
2546
2611
|
ir_commit_internal(ir);
|
2547
2612
|
ir->do_close(ir);
|
2548
|
-
|
2549
|
-
ir->store->close(ir->store);
|
2550
|
-
}
|
2613
|
+
store_deref(ir->store);
|
2551
2614
|
if (ir->is_owner) {
|
2552
2615
|
sis_destroy(ir->sis);
|
2553
2616
|
}
|
@@ -2562,12 +2625,6 @@ void ir_close(IndexReader *ir)
|
|
2562
2625
|
free(ir);
|
2563
2626
|
}
|
2564
2627
|
|
2565
|
-
void ir_destroy(void *p)
|
2566
|
-
{
|
2567
|
-
IndexReader *ir = (IndexReader *)p;
|
2568
|
-
ir_close(ir);
|
2569
|
-
}
|
2570
|
-
|
2571
2628
|
/**
|
2572
2629
|
* Don't call this method if the cache already exists
|
2573
2630
|
**/
|