ferret 0.3.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +9 -0
- data/Rakefile +51 -25
- data/ext/analysis.c +553 -0
- data/ext/analysis.h +76 -0
- data/ext/array.c +83 -0
- data/ext/array.h +19 -0
- data/ext/bitvector.c +164 -0
- data/ext/bitvector.h +29 -0
- data/ext/compound_io.c +335 -0
- data/ext/document.c +336 -0
- data/ext/document.h +87 -0
- data/ext/ferret.c +88 -47
- data/ext/ferret.h +43 -109
- data/ext/field.c +395 -0
- data/ext/filter.c +103 -0
- data/ext/fs_store.c +352 -0
- data/ext/global.c +219 -0
- data/ext/global.h +73 -0
- data/ext/hash.c +446 -0
- data/ext/hash.h +80 -0
- data/ext/hashset.c +141 -0
- data/ext/hashset.h +37 -0
- data/ext/helper.c +11 -0
- data/ext/helper.h +5 -0
- data/ext/inc/lang.h +41 -0
- data/ext/ind.c +389 -0
- data/ext/index.h +884 -0
- data/ext/index_io.c +269 -415
- data/ext/index_rw.c +2543 -0
- data/ext/lang.c +31 -0
- data/ext/lang.h +41 -0
- data/ext/priorityqueue.c +228 -0
- data/ext/priorityqueue.h +44 -0
- data/ext/q_boolean.c +1331 -0
- data/ext/q_const_score.c +154 -0
- data/ext/q_fuzzy.c +287 -0
- data/ext/q_match_all.c +142 -0
- data/ext/q_multi_phrase.c +343 -0
- data/ext/q_parser.c +2180 -0
- data/ext/q_phrase.c +657 -0
- data/ext/q_prefix.c +75 -0
- data/ext/q_range.c +247 -0
- data/ext/q_span.c +1566 -0
- data/ext/q_term.c +308 -0
- data/ext/q_wildcard.c +146 -0
- data/ext/r_analysis.c +255 -0
- data/ext/r_doc.c +578 -0
- data/ext/r_index_io.c +996 -0
- data/ext/r_qparser.c +158 -0
- data/ext/r_search.c +2321 -0
- data/ext/r_store.c +263 -0
- data/ext/r_term.c +219 -0
- data/ext/ram_store.c +447 -0
- data/ext/search.c +524 -0
- data/ext/search.h +1065 -0
- data/ext/similarity.c +143 -39
- data/ext/sort.c +661 -0
- data/ext/store.c +35 -0
- data/ext/store.h +152 -0
- data/ext/term.c +704 -143
- data/ext/termdocs.c +599 -0
- data/ext/vector.c +594 -0
- data/lib/ferret.rb +9 -10
- data/lib/ferret/analysis/analyzers.rb +2 -2
- data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
- data/lib/ferret/analysis/token.rb +14 -14
- data/lib/ferret/analysis/token_filters.rb +3 -3
- data/lib/ferret/document/field.rb +16 -17
- data/lib/ferret/index/document_writer.rb +4 -4
- data/lib/ferret/index/index.rb +39 -23
- data/lib/ferret/index/index_writer.rb +2 -2
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
- data/lib/ferret/index/segment_term_vector.rb +4 -4
- data/lib/ferret/index/term.rb +5 -1
- data/lib/ferret/index/term_vector_offset_info.rb +6 -6
- data/lib/ferret/index/term_vectors_io.rb +5 -5
- data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
- data/lib/ferret/search.rb +1 -1
- data/lib/ferret/search/boolean_query.rb +2 -1
- data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
- data/lib/ferret/search/fuzzy_query.rb +2 -1
- data/lib/ferret/search/index_searcher.rb +3 -0
- data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
- data/lib/ferret/search/multi_phrase_query.rb +6 -5
- data/lib/ferret/search/phrase_query.rb +3 -6
- data/lib/ferret/search/prefix_query.rb +4 -4
- data/lib/ferret/search/sort.rb +3 -1
- data/lib/ferret/search/sort_field.rb +9 -9
- data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
- data/lib/ferret/search/spans/span_near_query.rb +1 -1
- data/lib/ferret/search/spans/span_weight.rb +1 -1
- data/lib/ferret/search/spans/spans_enum.rb +7 -7
- data/lib/ferret/store/fs_store.rb +10 -6
- data/lib/ferret/store/ram_store.rb +3 -3
- data/lib/rferret.rb +36 -0
- data/test/functional/thread_safety_index_test.rb +2 -2
- data/test/test_helper.rb +16 -2
- data/test/unit/analysis/c_token.rb +25 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
- data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
- data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
- data/test/unit/document/c_field.rb +98 -0
- data/test/unit/document/tc_field.rb +0 -66
- data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
- data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
- data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
- data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
- data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
- data/test/unit/index/tc_segment_term_vector.rb +2 -2
- data/test/unit/index/tc_term_vectors_io.rb +4 -4
- data/test/unit/query_parser/c_query_parser.rb +138 -0
- data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
- data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
- data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
- data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
- data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
- data/test/unit/search/c_sort_field.rb +27 -0
- data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +7 -20
- data/test/unit/store/c_fs_store.rb +76 -0
- data/test/unit/store/c_ram_store.rb +35 -0
- data/test/unit/store/m_store.rb +34 -0
- data/test/unit/store/m_store_lock.rb +68 -0
- data/test/unit/store/tc_fs_store.rb +0 -53
- data/test/unit/store/tc_ram_store.rb +0 -20
- data/test/unit/store/tm_store.rb +0 -30
- data/test/unit/store/tm_store_lock.rb +0 -66
- metadata +84 -31
- data/ext/Makefile +0 -140
- data/ext/ferret_ext.so +0 -0
- data/ext/priority_queue.c +0 -232
- data/ext/ram_directory.c +0 -321
- data/ext/segment_merge_queue.c +0 -37
- data/ext/segment_term_enum.c +0 -326
- data/ext/string_helper.c +0 -42
- data/ext/tags +0 -344
- data/ext/term_buffer.c +0 -230
- data/ext/term_infos_reader.c +0 -54
- data/ext/terminfo.c +0 -160
- data/ext/token.c +0 -93
- data/ext/util.c +0 -12
data/ext/index_rw.c
ADDED
@@ -0,0 +1,2543 @@
|
|
1
|
+
#include <index.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <string.h>
|
4
|
+
#include <array.h>
|
5
|
+
|
6
|
+
/* File-name extensions recognised as belonging to an index segment. */
const char *INDEX_EXTENSIONS[] = {
  "cfs",
  "fnm",
  "fdx",
  "fdt",
  "tii",
  "tis",
  "frq",
  "prx",
  "del",
  "tvx",
  "tvd",
  "tvf",
  "tvp"
};

/* Extensions listed for compound-file handling. */
const char *COMPOUND_EXTENSIONS[] = {
  "fnm",
  "frq",
  "prx",
  "fdx",
  "fdt",
  "tii",
  "tis"
};

/* Extensions of the term-vector files. */
const char *VECTOR_EXTENSIONS[] = {
  "tvx",
  "tvd",
  "tvf"
};
|
18
|
+
|
19
|
+
/*
 * Process-wide defaults copied into every IndexWriter by iw_open().
 * Initialisers are positional, so their order must match the field order
 * of FerretConfig.
 */
FerretConfig config = {
  /* merge_factor        */ 10,
  /* min_merge_docs      */ 10,
  /* max_merge_docs      */ INT_MAX,
  /* max_field_length    */ 10000,
  /* term_index_interval */ 128
};
|
26
|
+
|
27
|
+
/***************************************************************************
|
28
|
+
*
|
29
|
+
* CacheObject
|
30
|
+
*
|
31
|
+
***************************************************************************/
|
32
|
+
|
33
|
+
/*
 * Hash function for cache-object tables: the key is an opaque pointer and
 * its address value is used directly as the hash.
 *
 * key     - pointer used as identity key (never dereferenced)
 * returns - the low bits of the pointer value
 */
unsigned int co_hash(const void *key)
{
  // Convert through size_t first: casting a pointer straight to a narrower
  // integer is undefined when the value does not fit (e.g. 64-bit
  // pointers), whereas narrowing an unsigned integer is well defined.
  return (unsigned int)(size_t)key;
}
|
37
|
+
|
38
|
+
/*
 * Equality function for cache-object tables: two keys are equal only when
 * they are the very same pointer.
 *
 * returns - 1 when key1 and key2 are the same address, 0 otherwise
 */
int co_eq(const void *key1, const void *key2)
{
  if (key1 != key2) {
    return 0;
  }
  return 1;
}
|
42
|
+
|
43
|
+
void co_destroy(void *p)
|
44
|
+
{
|
45
|
+
CacheObject *co = (CacheObject *)p;
|
46
|
+
h_rem(co->ref_tab1, co->ref2, false);
|
47
|
+
h_rem(co->ref_tab2, co->ref1, false);
|
48
|
+
co->destroy(co->obj);
|
49
|
+
free(co);
|
50
|
+
}
|
51
|
+
|
52
|
+
/*
 * Allocate a CacheObject wrapping `obj` and cross-register it in two hash
 * tables: in ref_tab1 under key ref2 and in ref_tab2 under key ref1.
 *
 * destroy - callback later used by co_destroy() to release `obj`
 * returns - the newly allocated cache object
 */
CacheObject *co_create(HshTable *ref_tab1, HshTable *ref_tab2,
                       void *ref1, void *ref2, void (*destroy)(void *p), void *obj)
{
  CacheObject *entry = ALLOC(CacheObject);

  // each table is keyed by the *other* side's reference
  h_set(ref_tab1, ref2, entry);
  h_set(ref_tab2, ref1, entry);

  entry->obj      = obj;
  entry->destroy  = destroy;
  entry->ref1     = ref1;
  entry->ref2     = ref2;
  entry->ref_tab1 = ref_tab1;
  entry->ref_tab2 = ref_tab2;
  return entry;
}
|
66
|
+
|
67
|
+
HshTable *co_hsh_create()
|
68
|
+
{
|
69
|
+
return h_new(&co_hash, &co_eq, NULL, &co_destroy);
|
70
|
+
}
|
71
|
+
|
72
|
+
/***************************************************************************
|
73
|
+
*
|
74
|
+
* Posting
|
75
|
+
*
|
76
|
+
***************************************************************************/
|
77
|
+
|
78
|
+
/*
 * Create a Posting for the first occurrence of `term`.
 *
 * The posting starts with frequency 1 and capacity 1; the positions and
 * offsets arrays each hold exactly one slot, filled with the supplied
 * values. The arrays grow later through p_add_occurance().
 *
 * term     - stored by pointer, not copied
 * position - position of the first occurrence
 * offset   - offset info of the first occurrence (stored as given)
 */
Posting *p_create(Term *term, int position, TVOffsetInfo *offset)
{
  Posting *posting = ALLOC(Posting);

  posting->term = term;
  posting->size = 1;
  posting->freq = 1;

  posting->positions = ALLOC(int);
  posting->offsets   = ALLOC(TVOffsetInfo *);
  posting->positions[0] = position;
  posting->offsets[0]   = offset;

  return posting;
}
|
90
|
+
|
91
|
+
void p_destroy(void *p)
|
92
|
+
{
|
93
|
+
// the positions and offsets will be put in a TVTerm so no need to free
|
94
|
+
int i;
|
95
|
+
Posting *post = (Posting *)p;
|
96
|
+
free(post->positions);
|
97
|
+
for (i = 0; i < post->freq; i++)
|
98
|
+
tvoi_destroy(post->offsets[i]);
|
99
|
+
free(post->offsets);
|
100
|
+
free(p);
|
101
|
+
}
|
102
|
+
|
103
|
+
/*
 * Record one more occurrence of the posting's term.
 *
 * When the arrays are full their capacity is doubled before the new
 * position/offset pair is appended and the frequency is incremented.
 */
void p_add_occurance(Posting *p, int position, TVOffsetInfo *offset)
{
  int slot;

  if (p->size <= p->freq) {
    // out of room: double the capacity of both parallel arrays
    p->size = p->size * 2;
    REALLOC_N(p->positions, int, p->size);
    REALLOC_N(p->offsets, TVOffsetInfo *, p->size);
  }

  slot = p->freq;
  p->positions[slot] = position;
  p->offsets[slot] = offset;
  p->freq = slot + 1;
}
|
114
|
+
|
115
|
+
inline int p_cmp(const void *const p1, const void *const p2)
|
116
|
+
{
|
117
|
+
Term *t1 = (*(Posting **)p1)->term;
|
118
|
+
Term *t2 = (*(Posting **)p2)->term;
|
119
|
+
int res = strcmp(t1->field, t2->field);
|
120
|
+
if (res != 0) {
|
121
|
+
return res;
|
122
|
+
} else {
|
123
|
+
return strcmp(t1->text, t2->text);
|
124
|
+
}
|
125
|
+
}
|
126
|
+
|
127
|
+
DocumentWriter *dw_open(Store *store,
|
128
|
+
Analyzer *analyzer,
|
129
|
+
Similarity *similarity,
|
130
|
+
int max_field_length,
|
131
|
+
int term_index_interval)
|
132
|
+
{
|
133
|
+
DocumentWriter *dw = ALLOC(DocumentWriter);
|
134
|
+
dw->store = store;
|
135
|
+
dw->analyzer = analyzer;
|
136
|
+
dw->similarity = similarity;
|
137
|
+
dw->fis = NULL;
|
138
|
+
dw->postingtable = h_new(&term_hash, &term_eq, &term_destroy, &p_destroy);
|
139
|
+
dw->max_field_length = max_field_length;
|
140
|
+
dw->term_index_interval = term_index_interval;
|
141
|
+
return dw;
|
142
|
+
}
|
143
|
+
|
144
|
+
/*
 * Release a DocumentWriter: destroys its field infos (if any were created),
 * destroys the posting table and frees the writer. The store, analyzer and
 * similarity are not touched.
 */
void dw_close(DocumentWriter *dw)
{
  if (dw->fis != NULL) {
    fis_destroy(dw->fis);
  }
  h_destroy(dw->postingtable);
  free(dw);
}
|
150
|
+
|
151
|
+
/*
 * Record one occurrence of (field, text) in the writer's posting table.
 *
 * If the term already has a posting the occurrence is appended to it;
 * otherwise a new Term and Posting are created and inserted.
 *
 * field, text - term components; only used for lookup unless a new term
 *               has to be created via term_create()
 * position    - position of the occurrence
 * offset      - offset info for the occurrence (may be NULL when offsets
 *               are not stored; see the callers in dw_invert_doc)
 */
void dw_add_position(DocumentWriter *dw, char *field, char *text,
                     int position, TVOffsetInfo *offset)
{
  // stack-only key: avoids allocating a Term just to perform the lookup
  Term lookup_key = {field, text};
  Posting *p = (Posting *)h_get(dw->postingtable, &lookup_key);

  if (p != NULL) {
    // word seen before: share the grow-and-append logic with the Posting
    // API instead of duplicating it here
    p_add_occurance(p, position, offset);
  } else {
    // word not seen before
    Term *term = term_create(field, text);
    h_set(dw->postingtable, term, p_create(term, position, offset));
  }
}
|
172
|
+
|
173
|
+
/*
 * Invert every indexed field of `doc` into the writer's posting table.
 *
 * For each document field the running length, character offset and token
 * position of its field number are loaded from the writer, updated while
 * the field is processed and written back, so that several DocFields with
 * the same name continue where the previous one stopped. The per-field
 * boost is multiplied by the DocField's boost.
 *
 * - Un-tokenized fields contribute their whole data as a single term.
 * - Tokenized fields are run through the analyzer; each token becomes one
 *   occurrence. Processing stops once the field length exceeds
 *   dw->max_field_length.
 *
 * Offsets are only materialised (tvoi_create) when the field stores them.
 * Requires dw->fis and the dw->field_* arrays to be set up (see
 * dw_add_doc).
 */
void dw_invert_doc(DocumentWriter *dw, Document *doc)
{
  int i;
  int dfcnt = doc->dfcnt;
  char *field_name, *text;
  int field_number, length, position, offset, next_offset, slen;
  TokenStream *stream;
  Token *token;
  FieldInfo *fi;
  TVOffsetInfo *tvoi;

  DocField **fields = doc->df_arr, *field;
  for (i = 0; i < dfcnt; i++) {
    field = fields[i];
    field_name = field->name;
    fi = ((FieldInfo *)ht_get(dw->fis->by_name, field_name));
    field_number = fi->number;

    length = dw->field_lengths[field_number];
    offset = dw->field_offsets[field_number];
    position = dw->field_positions[field_number];

    if (!fi->is_indexed) {
      continue;
    }

    if (!field->is_tokenized) { // un-tokenized field
      text = field->data;
      slen = strlen(text);
      tvoi = fi->store_offset ? tvoi_create(offset, offset + slen) : NULL;
      dw_add_position(dw, field_name, text, position, tvoi);
      offset += slen;
      length++;
    } else {
      // Tokenize field and add to posting_table
      stream = a_get_ts(dw->analyzer, field_name, field->data);

      // Offset to continue from once this field value is done. It is
      // refreshed after every token so that it reflects the LAST token
      // seen, whether the loop ends because the stream is exhausted or
      // because the length limit was hit. (Testing `token` after the loop
      // would only work for the latter: it is NULL on normal exhaustion.)
      next_offset = offset;

      while ((token = ts_next(stream)) != NULL) {
        position += (token->pos_inc - 1);

        tvoi = fi->store_offset
          ? tvoi_create(offset + token->start, offset + token->end)
          : NULL;
        dw_add_position(dw, field_name, token->text, position, tvoi);
        position++;

        next_offset = offset + token->end + 1;

        length++;
        // stop if we reach the max field length
        if (length > dw->max_field_length)
          break;
      }

      offset = next_offset;
    }

    dw->field_lengths[field_number] = length;
    dw->field_offsets[field_number] = offset;
    dw->field_positions[field_number] = position;
    dw->field_boosts[field_number] *= field->boost;
  }
}
|
242
|
+
|
243
|
+
/*
 * Flatten the writer's posting table into a newly allocated array sorted
 * with p_cmp (by field, then by term text).
 *
 * The number of postings is stored in dw->pcnt. The raw entry array of the
 * hash table is scanned and every entry with a non-NULL value is collected
 * until `used` postings have been found. The caller owns (and must free)
 * the returned array; the postings themselves stay owned by the table.
 */
Posting **dw_sort_posting_table(DocumentWriter *dw)
{
  HshTable *table = dw->postingtable;
  int remaining = table->used;
  Posting **sorted;
  HshEntry *entry;

  dw->pcnt = remaining;
  sorted = ALLOC_N(Posting *, remaining);

  // fill from the back; the order is irrelevant since we sort afterwards
  for (entry = table->table; remaining > 0; entry++) {
    if (entry->value == NULL) {
      continue;
    }
    remaining--;
    sorted[remaining] = (Posting *)entry->value;
  }

  qsort(sorted, dw->pcnt, sizeof(Posting *), &p_cmp);
  return sorted;
}
|
260
|
+
|
261
|
+
/*
 * Write the sorted postings of a single document to the segment files.
 *
 * Creates "<segment>.frq" and "<segment>.prx", opens a term-infos writer
 * and, for every posting in order:
 *   - adds a dictionary entry pointing at the current positions of the
 *     freq and prox streams,
 *   - writes the doc/frequency record (a single vint 1 when freq == 1,
 *     otherwise vint 0 followed by the frequency),
 *   - writes the positions delta-encoded,
 *   - feeds the term to the term-vectors writer while the current field
 *     stores term vectors.
 *
 * postings - dw->pcnt postings sorted with p_cmp (field, then text)
 * segment  - segment base name; file names are truncated to
 *            SEGMENT_NAME_MAX_LENGTH - 1 characters
 */
void dw_write_postings(DocumentWriter *dw, Posting **postings, char *segment)
{
  OutStream *freq_out, *prox_out;
  TermInfosWriter *tiw;
  TermVectorsWriter *tvw = NULL;
  FieldInfo *fi;
  Store *store = dw->store;
  TermInfo *ti;
  Posting *posting;
  int i, j, posting_freq, position, last_position;
  char fname[SEGMENT_NAME_MAX_LENGTH], *curr_field = NULL, *term_field;

  // open files for inverse index storage (bounded formatting: a long
  // segment name must not overrun fname)
  snprintf(fname, sizeof(fname), "%s.frq", segment);
  freq_out = store->create_output(store, fname);
  snprintf(fname, sizeof(fname), "%s.prx", segment);
  prox_out = store->create_output(store, fname);
  tiw = tiw_open(store, segment, dw->fis, dw->term_index_interval);
  ti = ti_create(0, 0, 0, 0);

  for (i = 0; i < dw->pcnt; i++) {
    posting = postings[i];

    // add an entry to the dictionary with pointers to prox and freq_out files
    ti_set(ti, 1, os_pos(freq_out), os_pos(prox_out), -1);
    tiw_add(tiw, posting->term, ti);

    // add an entry to the freq_out file
    posting_freq = posting->freq;
    if (posting_freq == 1) {                  // optimize freq=1
      os_write_vint(freq_out, 1);             // set low bit of doc num.
    } else {
      os_write_vint(freq_out, 0);             // the doc number
      os_write_vint(freq_out, posting_freq);  // frequency in doc
    }

    // write positions as deltas from the previous position
    last_position = 0;
    for (j = 0; j < posting_freq; j++) {
      position = posting->positions[j];
      os_write_vint(prox_out, position - last_position);
      last_position = position;
    }

    // check to see if we switched to a new field; compare by content, as
    // two terms of the same field need not share the same name pointer
    term_field = posting->term->field;
    if (curr_field == NULL || strcmp(curr_field, term_field) != 0) {
      // changing field - see if there is something to save
      curr_field = term_field;
      fi = (FieldInfo *)ht_get(dw->fis->by_name, curr_field);
      if (fi->store_tv) {
        if (tvw == NULL) {
          // opened lazily: only documents with a term-vector field need it
          tvw = tvw_open(store, segment, dw->fis);
          tvw_open_doc(tvw);
        }
        tvw_open_field(tvw, curr_field);
      } else if (tvw != NULL) {
        tvw_close_field(tvw);
      }
    }

    // tvw->curr_field != NULL implies field is still open
    if (tvw != NULL && tvw->curr_field != NULL) {
      tvw_add_term(tvw, posting->term->text, posting_freq,
                   posting->positions, posting->offsets);
    }
  }

  if (tvw != NULL) {
    tvw_close_doc(tvw);
    tvw_close(tvw);
  }

  os_close(freq_out);
  os_close(prox_out);
  tiw_close(tiw);
  ti_destroy(ti);
}
|
339
|
+
|
340
|
+
/*
 * Write one norm file per indexed field that does not omit norms.
 *
 * For field number i the norm is the accumulated field boost multiplied by
 * the similarity's length norm for the field's length; it is encoded to a
 * single byte and written to "<segment>.f<i>".
 *
 * segment - segment base name; file names are truncated to
 *           SEGMENT_NAME_MAX_LENGTH - 1 characters
 */
void dw_write_norms(DocumentWriter *dw, char *segment)
{
  int i;
  float norm;
  OutStream *norms_out;
  char fname[SEGMENT_NAME_MAX_LENGTH];
  FieldInfos *fis = dw->fis;
  FieldInfo *fi;

  for (i = 0; i < fis->fcnt; i++) {
    fi = fis->by_number[i];
    if (!fi->is_indexed || fi->omit_norms) {
      continue;
    }

    norm = dw->field_boosts[i]
      * sim_length_norm(dw->similarity, fi->name, dw->field_lengths[i]);

    // bounded formatting: never write past the end of fname
    snprintf(fname, sizeof(fname), "%s.f%d", segment, i);
    norms_out = dw->store->create_output(dw->store, fname);
    os_write_byte(norms_out, sim_encode_norm(dw->similarity, norm));
    os_close(norms_out);
  }
}
|
361
|
+
|
362
|
+
/*
 * Write `doc` as a complete single-document segment named `segment`.
 *
 * Steps: build and write the field infos (".fnm"), write the stored field
 * values, invert the document into the posting table, write the sorted
 * postings and finally the norms. The per-field scratch arrays
 * (boosts/lengths/offsets/positions) only live for the duration of the
 * call; boosts start at the document boost, the counters at zero.
 *
 * The field infos of the most recent document stay attached to the writer
 * and are released by the next dw_add_doc() or by dw_close().
 */
void dw_add_doc(DocumentWriter *dw, char *segment, Document *doc)
{
  int i;
  FieldsWriter *fw;
  Posting **postings;

  // a reused writer still holds the previous document's field infos;
  // release them instead of leaking them when the pointer is overwritten
  if (dw->fis) fis_destroy(dw->fis);

  // write field names
  dw->fis = fis_create();
  fis_add_doc(dw->fis, doc);
  fis_write(dw->fis, dw->store, segment, ".fnm");

  // write field values
  fw = fw_open(dw->store, segment, dw->fis);
  fw_add_doc(fw, doc);
  fw_close(fw);

  // invert doc into posting_table
  h_clear(dw->postingtable);      // clear posting_table

  dw->field_boosts = ALLOC_N(float, dw->fis->fcnt);
  dw->field_lengths = ALLOC_AND_ZERO_N(int, dw->fis->fcnt);
  dw->field_offsets = ALLOC_AND_ZERO_N(int, dw->fis->fcnt);
  dw->field_positions = ALLOC_AND_ZERO_N(int, dw->fis->fcnt);

  for (i = 0; i < dw->fis->fcnt; i++)
    dw->field_boosts[i] = doc->boost;

  dw_invert_doc(dw, doc);

  // sort posting_table into an array
  postings = dw_sort_posting_table(dw);

  // write postings; only the array is ours, the postings belong to the
  // table
  dw_write_postings(dw, postings, segment);
  free(postings);

  // write norms of indexed fields
  dw_write_norms(dw, segment);

  free(dw->field_boosts);
  free(dw->field_lengths);
  free(dw->field_offsets);
  free(dw->field_positions);
}
|
403
|
+
|
404
|
+
/****************************************************************************
|
405
|
+
*
|
406
|
+
* SegmentInfo
|
407
|
+
*
|
408
|
+
****************************************************************************/
|
409
|
+
|
410
|
+
/*
 * Allocate a SegmentInfo describing segment `name` holding `doc_cnt`
 * documents in `store`.
 *
 * `name` is stored by pointer and later freed by si_destroy(), so
 * ownership of the string passes to the SegmentInfo.
 */
SegmentInfo *si_create(char *name, int doc_cnt, Store *store)
{
  SegmentInfo *info = ALLOC(SegmentInfo);

  info->store = store;
  info->doc_cnt = doc_cnt;
  info->name = name;

  return info;
}
|
418
|
+
|
419
|
+
void si_destroy(void *p)
|
420
|
+
{
|
421
|
+
SegmentInfo *si = (SegmentInfo *)p;
|
422
|
+
free(si->name);
|
423
|
+
free(si);
|
424
|
+
}
|
425
|
+
|
426
|
+
/*
 * Report whether the segment has a deletions file, i.e. whether
 * "<name>.del" exists in the segment's store.
 */
bool si_has_deletions(SegmentInfo *si)
{
  char del_file_name[SEGMENT_NAME_MAX_LENGTH];

  // bounded formatting: never write past the end of the buffer
  snprintf(del_file_name, sizeof(del_file_name), "%s.del", si->name);
  return si->store->exists(si->store, del_file_name);
}
|
432
|
+
|
433
|
+
/*
 * Report whether the segment is stored as a compound file, i.e. whether
 * "<name>.cfs" exists in the segment's store.
 */
bool si_uses_compound_file(SegmentInfo *si)
{
  char compound_file_name[SEGMENT_NAME_MAX_LENGTH];

  // bounded formatting: never write past the end of the buffer
  snprintf(compound_file_name, sizeof(compound_file_name), "%s.cfs", si->name);
  return si->store->exists(si->store, compound_file_name);
}
|
439
|
+
|
440
|
+
struct NormTester {
|
441
|
+
bool has_norm_file;
|
442
|
+
char *segment_name;
|
443
|
+
};
|
444
|
+
void is_norm_file(char *fname, void *arg)
|
445
|
+
{
|
446
|
+
struct NormTester *nt = (struct NormTester *)arg;
|
447
|
+
char norm_file_pattern[SEGMENT_NAME_MAX_LENGTH];
|
448
|
+
sprintf(norm_file_pattern, "%s.s", nt->segment_name);
|
449
|
+
if (strncmp(fname, norm_file_pattern, strlen(norm_file_pattern)) == 0) {
|
450
|
+
nt->has_norm_file = true;
|
451
|
+
}
|
452
|
+
}
|
453
|
+
|
454
|
+
/*
 * Report whether the store contains any file whose name starts with
 * "<segment name>.s", as detected by is_norm_file() while iterating over
 * the store's files.
 */
bool si_has_separate_norms(SegmentInfo *si)
{
  struct NormTester tester;

  tester.has_norm_file = false;
  tester.segment_name = si->name;

  si->store->each(si->store, &is_norm_file, &tester);
  return tester.has_norm_file;
}
|
463
|
+
|
464
|
+
|
465
|
+
/****************************************************************************
|
466
|
+
*
|
467
|
+
* SegmentInfos
|
468
|
+
*
|
469
|
+
****************************************************************************/
|
470
|
+
|
471
|
+
#include <time.h>
|
472
|
+
#define FORMAT -1
|
473
|
+
#define SEGMENT_FILENAME "segments"
|
474
|
+
#define TEMPORARY_SEGMENT_FILENAME "segments.new"
|
475
|
+
|
476
|
+
SegmentInfos *sis_create()
|
477
|
+
{
|
478
|
+
SegmentInfos *sis = ALLOC(SegmentInfos);
|
479
|
+
sis->format = FORMAT;
|
480
|
+
sis->version = (unsigned int)time(NULL);
|
481
|
+
sis->scnt = 0;
|
482
|
+
sis->counter = 0;
|
483
|
+
sis->size = 4;
|
484
|
+
sis->segs = ALLOC_N(SegmentInfo *, sis->size);
|
485
|
+
return sis;
|
486
|
+
}
|
487
|
+
|
488
|
+
void sis_destroy_not_infos(void *p)
|
489
|
+
{
|
490
|
+
SegmentInfos *sis = (SegmentInfos *)p;
|
491
|
+
free(sis->segs);
|
492
|
+
free(p);
|
493
|
+
}
|
494
|
+
|
495
|
+
void sis_destroy(void *p)
|
496
|
+
{
|
497
|
+
int i;
|
498
|
+
SegmentInfos *sis = (SegmentInfos *)p;
|
499
|
+
for (i = 0; i < sis->scnt; i++)
|
500
|
+
si_destroy(sis->segs[i]);
|
501
|
+
free(sis->segs);
|
502
|
+
free(p);
|
503
|
+
}
|
504
|
+
|
505
|
+
/*
 * Append `si` to the list, growing the pointer array to twice the current
 * segment count when it is full. The list takes over the SegmentInfo (it
 * is destroyed by sis_destroy / sis_del_* / sis_clear).
 */
void sis_add_si(SegmentInfos *sis, SegmentInfo *si)
{
  int next = sis->scnt;

  if (next >= sis->size) {
    sis->size = next * 2;
    REALLOC_N(sis->segs, SegmentInfo *, sis->size);
  }

  sis->segs[next] = si;
  sis->scnt = next + 1;
}
|
514
|
+
|
515
|
+
/*
 * Destroy the SegmentInfo at index `at` and close the gap by shifting all
 * following entries one slot down. `at` is not range-checked.
 */
void sis_del_at(SegmentInfos *sis, int at)
{
  int idx = at;

  si_destroy(sis->segs[at]);
  sis->scnt -= 1;

  while (idx < sis->scnt) {
    sis->segs[idx] = sis->segs[idx + 1];
    idx++;
  }
}
|
523
|
+
|
524
|
+
/*
 * Destroy the SegmentInfos in the half-open index range [from, to) and
 * shift the remaining tail down to close the gap. The range is not
 * validated.
 */
void sis_del_from_to(SegmentInfos *sis, int from, int to)
{
  int removed = to - from;
  int idx;

  idx = from;
  while (idx < to) {
    si_destroy(sis->segs[idx]);
    idx++;
  }

  sis->scnt -= removed;

  idx = from;
  while (idx < sis->scnt) {
    sis->segs[idx] = sis->segs[idx + removed];
    idx++;
  }
}
|
535
|
+
|
536
|
+
/*
 * Destroy every SegmentInfo in the list and reset the count to zero. The
 * pointer array itself is kept for reuse.
 */
void sis_clear(SegmentInfos *sis)
{
  int idx = 0;

  while (idx < sis->scnt) {
    si_destroy(sis->segs[idx]);
    idx++;
  }
  sis->scnt = 0;
}
|
544
|
+
|
545
|
+
/*
 * Load the "segments" file of `store` into `sis`.
 *
 * Two layouts are understood:
 *   - explicit format (first int negative): format, version, counter,
 *     segment count, then (name, doc count) per segment;
 *   - old format (first int non-negative): that int is the counter,
 *     followed by the segment list and optionally a trailing version.
 *
 * Every segment read is appended to `sis`; existing entries are not
 * cleared. An unknown (more negative than FORMAT) format is reported
 * through eprintf.
 */
void sis_read(SegmentInfos *sis, Store *store)
{
  int doc_cnt;
  int seg_count;
  int i;
  char *name;
  InStream *is = store->open_input(store, SEGMENT_FILENAME);

  sis->format = is_read_int(is);

  if (sis->format < 0) { // file contains explicit format info
    // check that it is a format we can understand
    if (sis->format < FORMAT) {
      // cast so the argument is guaranteed to match the %ld conversion
      eprintf(ERROR, "Unknown format version: %ld", (long)sis->format);
    }
    sis->version = is_read_long(is);
    sis->counter = is_read_int(is);
  } else { // file is in old format without explicit format info
    sis->counter = sis->format;
  }

  seg_count = is_read_int(is);
  for (i = 0; i < seg_count; i++) {
    // the name string read here is handed over to the new SegmentInfo
    name = is_read_string(is);
    doc_cnt = is_read_int(is);
    sis_add_si(sis, si_create(name, doc_cnt, store));
  }

  if (sis->format >= 0) {
    // in old format the version number may be at the end of the file
    if (is_pos(is) >= is_length(is))
      sis->version = 0; // old file format without version number
    else
      sis->version = is_read_long(is); // read version
  }
  is_close(is);
}
|
579
|
+
|
580
|
+
/*
 * Persist `sis` as the store's "segments" file.
 *
 * The data is first written to "segments.new" and then renamed over
 * "segments". The version is incremented before it is written, since every
 * write changes the index. Layout: FORMAT, version, counter, segment
 * count, then (name, doc count) per segment.
 */
void sis_write(SegmentInfos *sis, Store *store)
{
  int seg_idx = 0;
  SegmentInfo *seg;
  OutStream *out = store->create_output(store, TEMPORARY_SEGMENT_FILENAME);

  os_write_int(out, FORMAT);
  sis->version += 1;
  os_write_long(out, sis->version);
  os_write_int(out, sis->counter);
  os_write_int(out, sis->scnt);

  while (seg_idx < sis->scnt) {
    seg = sis->segs[seg_idx];
    os_write_string(out, seg->name);
    os_write_int(out, seg->doc_cnt);
    seg_idx++;
  }

  os_close(out);

  // install new segment info
  store->rename(store, TEMPORARY_SEGMENT_FILENAME, SEGMENT_FILENAME);
}
|
600
|
+
|
601
|
+
/*
 * Return the version recorded in the store's "segments" file, or 0 when
 * the file does not exist.
 *
 * With an explicit (negative) format marker the version directly follows
 * the marker and is read on its own. For the old format the position of
 * the version is not fixed, so the whole file is parsed with sis_read().
 */
int sis_read_current_version(Store *store)
{
  InStream *is;
  SegmentInfos *sis;
  int format = 0;
  int version = 0;

  if (!store->exists(store, SEGMENT_FILENAME))
    return 0;

  is = store->open_input(store, SEGMENT_FILENAME);
  format = is_read_int(is);
  if (format < 0) {
    if (format < FORMAT) {
      // `format` is an int, so it must be printed with %d
      eprintf(ERROR, "Unknown format version: %d", format);
    }
    version = is_read_long(is);
  }
  is_close(is);

  if (format < 0)
    return version;

  // We cannot be sure about the format of the file.
  // Therefore we have to read the whole file and cannot simply
  // seek to the version entry.
  sis = sis_create();
  sis_read(sis, store);
  version = sis->version;
  sis_destroy(sis);
  return version;
}
|
629
|
+
|
630
|
+
/****************************************************************************
|
631
|
+
*
|
632
|
+
* IndexWriter
|
633
|
+
*
|
634
|
+
****************************************************************************/
|
635
|
+
|
636
|
+
/*
 * Open an IndexWriter on `store`.
 *
 * create         - when true the store is cleared and an empty segments
 *                  file is committed; otherwise the existing segments file
 *                  is read
 * close_store    - recorded on the writer
 * close_analyzer - recorded on the writer
 *
 * Merge parameters are taken from the global `config`; compound files are
 * enabled by default. The write lock obtained here is kept on the writer
 * (iw->write_lock); failure to obtain a lock is reported through eprintf.
 * The store mutex is held while locks are taken and the segments file is
 * written or read.
 */
IndexWriter *iw_open(Store *store, Analyzer *analyzer,
                     bool create, bool close_store, bool close_analyzer)
{
  IndexWriter *iw = ALLOC(IndexWriter);
  Lock *commit_lock;

  if (create) {
    store->clear_all(store);
  }
  mutex_init(&iw->mutex, NULL);

  // tuning knobs come from the process-wide defaults
  iw->merge_factor        = config.merge_factor;
  iw->min_merge_docs      = config.min_merge_docs;
  iw->max_merge_docs      = config.max_merge_docs;
  iw->max_field_length    = config.max_field_length;
  iw->term_index_interval = config.term_index_interval;
  iw->use_compound_file   = true;

  iw->store          = store;
  iw->close_store    = close_store;
  iw->close_analyzer = close_analyzer;
  iw->analyzer       = analyzer;

  iw->sis        = sis_create();
  iw->similarity = sim_create_default();
  iw->ram_store  = open_ram_store();

  mutex_lock(&store->mutex);

  // keep the write_lock obtained until the IndexWriter is closed.
  iw->write_lock = store->open_lock(store, WRITE_LOCK_NAME);
  if (!iw->write_lock->obtain(iw->write_lock)) {
    eprintf(STATE_ERROR,
            "Could not obtain write lock when trying to write index");
  }

  if (!create) {
    sis_read(iw->sis, store);
  } else {
    commit_lock = store->open_lock(store, COMMIT_LOCK_NAME);
    if (!commit_lock->obtain(commit_lock)) {
      eprintf(STATE_ERROR,
              "Could not obtain commit lock when trying to commit index");
    }

    // commit the (empty) index while holding the commit lock
    store->clear(store);
    sis_write(iw->sis, store);

    commit_lock->release(commit_lock);
    store->close_lock(commit_lock);
  }

  mutex_unlock(&store->mutex);
  return iw;
}
|
683
|
+
|
684
|
+
const char base36_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz";
|
685
|
+
|
686
|
+
char *new_segment_name(int counter)
|
687
|
+
{
|
688
|
+
char buf[SEGMENT_NAME_MAX_LENGTH];
|
689
|
+
buf[SEGMENT_NAME_MAX_LENGTH - 1] = '\0';
|
690
|
+
int i;
|
691
|
+
for (i = SEGMENT_NAME_MAX_LENGTH - 2; ; i--) {
|
692
|
+
buf[i] = base36_digitmap[counter%36];
|
693
|
+
counter /= 36;
|
694
|
+
if (counter == 0) break;
|
695
|
+
}
|
696
|
+
i--;
|
697
|
+
buf[i] = '_';
|
698
|
+
return estrdup(&buf[i]);
|
699
|
+
}
|
700
|
+
|
701
|
+
/*
 * Return the total number of documents over all segments known to the
 * writer. The writer's mutex is held while the segment list is summed.
 */
int iw_doc_count(IndexWriter *iw)
{
  int total = 0;
  int seg_idx = 0;

  mutex_lock(&iw->mutex);
  while (seg_idx < iw->sis->scnt) {
    total += iw->sis->segs[seg_idx]->doc_cnt;
    seg_idx++;
  }
  mutex_unlock(&iw->mutex);

  return total;
}
|
710
|
+
|
711
|
+
/*
 * Remove every file named in `file_names` from `store`, then destroy the
 * array. The array is consumed: callers must not use it afterwards.
 */
void delete_files(Array *file_names, Store *store)
{
  int idx = 0;

  while (idx < file_names->size) {
    char *file_name = (char *)file_names->elems[idx];
    store->remove(store, file_name);
    idx++;
  }
  ary_destroy(file_names);
}
|
719
|
+
|
720
|
+
|
721
|
+
Array *sr_file_names(IndexReader *ir);
|
722
|
+
void iw_delete_segments(IndexWriter *iw, IndexReader **segment_readers, int del_cnt)
|
723
|
+
{
|
724
|
+
// The java version keeps a record of files that it couldn't delete. This
|
725
|
+
// shouldn't be a problem on linux I hope.
|
726
|
+
IndexReader *ir;
|
727
|
+
int i;
|
728
|
+
for (i = 0; i < del_cnt; i++) {
|
729
|
+
ir = segment_readers[i];
|
730
|
+
delete_files(sr_file_names(ir), ir->store);
|
731
|
+
}
|
732
|
+
}
|
733
|
+
|
734
|
+
/*
 * Pack the files of the freshly merged segment into a compound file.
 *
 * The merger writes the compound data to "<merged_name>.tmp"; under the
 * commit lock that file is renamed to "<merged_name>.cfs" and the
 * now-redundant individual files (as returned by the merger) are deleted.
 * The store mutex is held for the whole operation. Failure to obtain the
 * commit lock is reported through eprintf.
 */
void make_compound_file(IndexWriter *iw, char *merged_name, SegmentMerger *merger)
{
  char merged_tmp[SEGMENT_NAME_MAX_LENGTH], merged_cfs[SEGMENT_NAME_MAX_LENGTH];
  Array *files_to_delete;
  Lock *commit_lock;

  mutex_lock(&iw->store->mutex);

  // bounded formatting: never write past the end of the name buffers
  snprintf(merged_tmp, sizeof(merged_tmp), "%s.tmp", merged_name);
  snprintf(merged_cfs, sizeof(merged_cfs), "%s.cfs", merged_name);

  files_to_delete = sm_create_compound_file(merger, merged_tmp);
  commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);

  if (!commit_lock->obtain(commit_lock)) {
    eprintf(STATE_ERROR,
            "Could not obtain commit lock when trying to commit index");
  }

  // make compound file visible for SegmentReaders
  iw->store->rename(iw->store, merged_tmp, merged_cfs);
  // delete now unused files of segment
  delete_files(files_to_delete, iw->store);

  commit_lock->release(commit_lock);
  iw->store->close_lock(commit_lock);
  mutex_unlock(&iw->store->mutex);
}
|
759
|
+
|
760
|
+
/*
 * Merge the segments with indices [min_segment, max_segment) into one new
 * segment.
 *
 * A new segment name is drawn from the segment counter, a reader is opened
 * for each source segment and added to the merger, and the merge is run.
 * The source entries are then replaced in iw->sis by the merged segment,
 * the new segment list is committed under the commit lock, and the files
 * of the source segments that live in the writer's own store or RAM store
 * are deleted. If compound files are enabled the merged segment is packed
 * afterwards.
 *
 * `merged_name` is handed over to the new SegmentInfo, which frees it.
 * NOTE(review): the readers are not closed in this function; presumably
 * sm_destroy() takes care of them - confirm.
 */
void iw_merge_segments_with_max(IndexWriter *iw, int min_segment, int max_segment)
{
  int i;
  int span = max_segment - min_segment;
  // a variable-length array must have a positive length, so clamp an
  // empty or inverted range to one (unused) slot
  IndexReader *segments_to_delete[span > 0 ? span : 1];
  int del_cnt = 0;
  int merged_doc_count;
  Lock *commit_lock;
  IndexReader *reader;

  char *merged_name = new_segment_name(iw->sis->counter++);

  SegmentMerger *merger = sm_create(iw->store, merged_name, iw->term_index_interval);

  for (i = min_segment; i < max_segment; i++) {
    reader = sr_open(iw->sis, i, false, false);
    sm_add(merger, reader);
    if ((reader->store == iw->store) ||       // if we own the directory
        (reader->store == iw->ram_store)) {
      segments_to_delete[del_cnt++] = reader; // queue segment for deletion
    }
  }

  merged_doc_count = sm_merge(merger);

  // swap the source segments for the merged one in the segment list
  sis_del_from_to(iw->sis, min_segment, max_segment);
  sis_add_si(iw->sis, si_create(merged_name, merged_doc_count, iw->store));

  mutex_lock(&iw->store->mutex);
  commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
  if (!commit_lock->obtain(commit_lock)) {
    eprintf(STATE_ERROR,
            "Could not obtain commit lock when trying to commit index");
  }
  // commit the index, then drop the now-obsolete segment files
  sis_write(iw->sis, iw->store);
  iw_delete_segments(iw, segments_to_delete, del_cnt);

  commit_lock->release(commit_lock);
  iw->store->close_lock(commit_lock);
  mutex_unlock(&iw->store->mutex);

  if (iw->use_compound_file) {
    make_compound_file(iw, merged_name, merger);
  }

  sm_destroy(merger);
}
|
809
|
+
|
810
|
+
/*
 * Merge every segment from min_segment up to the last segment currently
 * listed in iw->sis.
 */
void iw_merge_segments(IndexWriter *iw, int min_segment)
{
    int seg_end = iw->sis->scnt;

    iw_merge_segments_with_max(iw, min_segment, seg_end);
}
|
814
|
+
|
815
|
+
/*
 * Incrementally merge small segments.
 *
 * Starting with a target size of iw->min_merge_docs, walk backwards from the
 * newest segment collecting segments whose doc count is below the target.
 * If together they hold at least `target` documents they are merged and the
 * target is multiplied by iw->merge_factor; otherwise the function stops.
 * The loop also stops once the target exceeds iw->max_merge_docs.
 *
 * The target and the running total are kept in `long long`: multiplying an
 * `int` target by merge_factor can overflow (undefined behaviour) on a very
 * large index when max_merge_docs is close to INT_MAX.
 */
void iw_maybe_merge_segments(IndexWriter *iw)
{
    long long target_merge_docs = iw->min_merge_docs;
    long long merge_docs;
    int min_segment;
    SegmentInfo *si;

    while (target_merge_docs <= iw->max_merge_docs) {
        /* find segments smaller than current target size */
        min_segment = iw->sis->scnt - 1;
        merge_docs = 0;
        while (min_segment >= 0) {
            si = iw->sis->segs[min_segment];
            if (si->doc_cnt >= target_merge_docs) {
                break;
            }
            merge_docs += si->doc_cnt;
            min_segment -= 1;
        }

        if (merge_docs >= target_merge_docs) { /* found a merge to do */
            iw_merge_segments(iw, min_segment + 1);
        } else {
            break;
        }

        target_merge_docs *= iw->merge_factor; /* increase target size */
    }
}
|
841
|
+
|
842
|
+
/*
 * Merge the trailing run of segments held in iw->ram_store into a single
 * segment (via iw_merge_segments). Does nothing when there is nothing to
 * merge.
 */
void iw_flush_ram_segments(IndexWriter *iw)
{
    SegmentInfo **segs = iw->sis->segs;
    int first = iw->sis->scnt - 1;
    int ram_docs = 0;

    /* walk back over the trailing ram segments, summing their doc counts */
    for (; first >= 0 && segs[first]->store == iw->ram_store; first--) {
        ram_docs += segs[first]->doc_cnt;
    }

    /* The increment below happens for one of three reasons:
     *  - first < 0: we walked off the front of the array;
     *  - the segment we stopped at does not fit within the merge factor
     *    together with the ram docs, so it is left alone (if it does fit
     *    it is merged along with the ram segments);
     *  - the top segment is not a ram segment, so there is nothing to
     *    flush and the increment makes the check that follows return.
     * The order of the tests matters: segs[first] is only read when
     * first >= 0. */
    if (first < 0 ||
        (ram_docs + segs[first]->doc_cnt) > iw->merge_factor ||
        (segs[iw->sis->scnt - 1]->store != iw->ram_store)) {
        first++;
    }

    if (first < iw->sis->scnt) {
        iw_merge_segments(iw, first);
    }
}
|
868
|
+
|
869
|
+
/*
 * Add one document to the index.
 *
 * The document is written as a new single-document segment in
 * iw->ram_store, registered in iw->sis, and then iw_maybe_merge_segments()
 * is given the chance to merge. Everything runs under iw->mutex.
 */
void iw_add_doc(IndexWriter *iw, Document *doc)
{
    char *seg_name;
    DocumentWriter *writer;

    mutex_lock(&iw->mutex);

    writer = dw_open(iw->ram_store,
                     iw->analyzer,
                     iw->similarity,
                     iw->max_field_length,
                     iw->term_index_interval);
    seg_name = new_segment_name(iw->sis->counter++);
    dw_add_doc(writer, seg_name, doc);
    dw_close(writer);

    sis_add_si(iw->sis, si_create(seg_name, 1, iw->ram_store));
    iw_maybe_merge_segments(iw);

    mutex_unlock(&iw->mutex);
}
|
887
|
+
|
888
|
+
/*
 * Flush the ram segments, then keep merging the last merge_factor segments
 * until the index is "optimized": at most one segment, and if there is
 * exactly one, it has no deletions, lives in iw->store and (when compound
 * files are enabled) uses a compound file without separate norms.
 * The caller is expected to hold iw->mutex (iw_optimize does).
 */
static inline void iw_optimize_internal(IndexWriter *iw)
{
    iw_flush_ram_segments(iw);

    for (;;) {
        int first;

        if (iw->sis->scnt <= 1) {
            SegmentInfo *only;

            if (iw->sis->scnt != 1) {
                break;                      /* no segments at all */
            }
            only = iw->sis->segs[0];
            /* a single segment still needs work if any of these fail */
            if (!si_has_deletions(only) &&
                only->store == iw->store &&
                !(iw->use_compound_file &&
                  (!si_uses_compound_file(only) ||
                   si_has_separate_norms(only)))) {
                break;
            }
        }

        first = iw->sis->scnt - iw->merge_factor;
        iw_merge_segments(iw, first < 0 ? 0 : first);
    }
}
|
903
|
+
/*
 * Public entry point for optimizing the index: runs
 * iw_optimize_internal() while holding iw->mutex.
 */
void iw_optimize(IndexWriter *iw)
{
    mutex_lock(&iw->mutex);
    {
        iw_optimize_internal(iw);
    }
    mutex_unlock(&iw->mutex);
}
|
909
|
+
|
910
|
+
/*
 * Close the writer and free it.
 *
 * Flushes the ram segments, closes the ram store, destroys the segment
 * infos and the similarity, destroys the analyzer if iw->close_analyzer is
 * set, releases and closes the write lock, closes the store if
 * iw->close_store is set, and finally destroys the mutex and frees iw.
 * iw must not be used after this call.
 */
void iw_close(IndexWriter *iw)
{
    mutex_lock(&iw->mutex);
    iw_flush_ram_segments(iw);
    ram_close(iw->ram_store);
    sis_destroy(iw->sis);

    sim_destroy(iw->similarity);
    if (iw->close_analyzer) {
        a_destroy(iw->analyzer);
    }

    iw->write_lock->release(iw->write_lock);
    iw->store->close_lock(iw->write_lock);

    if (iw->close_store) {
        store_close(iw->store);
    }

    /* Unlock before destroying: the mutex used to be destroyed while still
     * held. With POSIX mutexes (presumably what mutex_* wraps - confirm)
     * destroying a locked mutex is undefined behaviour. */
    mutex_unlock(&iw->mutex);
    mutex_destroy(&iw->mutex);
    free(iw);
}
|
928
|
+
|
929
|
+
/*
 * Add all segments of the cnt indexes found in `stores` to this index.
 *
 * The index is optimized first, the segment infos read from each store are
 * appended to iw->sis, the newly added segments are merged in groups of up
 * to merge_factor, and the index is optimized once more. Runs under
 * iw->mutex.
 */
void iw_add_indexes(IndexWriter *iw, Store **stores, int cnt)
{
    int store_idx, seg, first_new;

    mutex_lock(&iw->mutex);
    iw_optimize_internal(iw);               /* start with zero or 1 seg */

    first_new = iw->sis->scnt;

    for (store_idx = 0; store_idx < cnt; store_idx++) {
        int k;
        SegmentInfos *other = sis_create(); /* read infos from dir */
        sis_read(other, stores[store_idx]);

        for (k = 0; k < other->scnt; k++) {
            sis_add_si(iw->sis, other->segs[k]);
        }
        /* the infos now belong to iw->sis, so only the container goes */
        sis_destroy_not_infos(other);
    }

    /* merge newly added segments in log(n) passes; scnt shrinks as merges
     * happen, so it is re-read on every test */
    while (iw->sis->scnt > first_new + iw->merge_factor) {
        for (seg = first_new + 1; seg < iw->sis->scnt; seg++) {
            int stop = MIN(iw->sis->scnt, seg + iw->merge_factor);
            if (stop - seg > 1) {
                iw_merge_segments_with_max(iw, seg, stop);
            }
        }
    }

    /* final cleanup */
    iw_optimize_internal(iw);
    mutex_unlock(&iw->mutex);
}
|
964
|
+
|
965
|
+
|
966
|
+
/**
|
967
|
+
* This adds an array of readers to the index leaving the added readers open.
|
968
|
+
*/
|
969
|
+
void iw_add_readers(IndexWriter *iw, IndexReader **irs, int cnt)
|
970
|
+
{
|
971
|
+
IndexReader *ir = NULL;
|
972
|
+
int i, del_cnt = 0;
|
973
|
+
|
974
|
+
mutex_lock(&iw->mutex);
|
975
|
+
iw_optimize_internal(iw); // start with zero or 1 seg
|
976
|
+
|
977
|
+
char *merged_name = new_segment_name(iw->sis->counter++);
|
978
|
+
|
979
|
+
SegmentMerger *merger = sm_create(iw->store, merged_name, iw->term_index_interval);
|
980
|
+
merger->readers->free_elem = NULL; // don't close readers
|
981
|
+
|
982
|
+
if (iw->sis->scnt == 1) {// add existing index, if any
|
983
|
+
ir = sr_open_si(iw->sis->segs[0]);
|
984
|
+
sm_add(merger, ir);
|
985
|
+
del_cnt = 1;
|
986
|
+
}
|
987
|
+
|
988
|
+
for (i = 0; i < cnt; i++) {
|
989
|
+
sm_add(merger, irs[i]);
|
990
|
+
}
|
991
|
+
|
992
|
+
int doc_count = sm_merge(merger); // merge 'em
|
993
|
+
|
994
|
+
// pop old infos and add new ones.
|
995
|
+
sis_clear(iw->sis);
|
996
|
+
sis_add_si(iw->sis, si_create(merged_name, doc_count, iw->store));
|
997
|
+
|
998
|
+
|
999
|
+
Lock *commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
|
1000
|
+
if (!commit_lock->obtain(commit_lock)) // obtain write lock
|
1001
|
+
eprintf(STATE_ERROR, "Index locked for commit: %s", COMMIT_LOCK_NAME);
|
1002
|
+
|
1003
|
+
sis_write(iw->sis, iw->store); // commit changes
|
1004
|
+
iw_delete_segments(iw, &ir, del_cnt);
|
1005
|
+
if (ir) ir_close(ir);
|
1006
|
+
|
1007
|
+
commit_lock->release(commit_lock);
|
1008
|
+
iw->store->close_lock(commit_lock);
|
1009
|
+
|
1010
|
+
if (iw->use_compound_file) {
|
1011
|
+
make_compound_file(iw, merged_name, merger);
|
1012
|
+
}
|
1013
|
+
|
1014
|
+
iw_optimize_internal(iw);
|
1015
|
+
sm_destroy(merger);
|
1016
|
+
|
1017
|
+
mutex_unlock(&iw->mutex);
|
1018
|
+
}
|
1019
|
+
|
1020
|
+
/****************************************************************************
|
1021
|
+
*
|
1022
|
+
* Norm
|
1023
|
+
*
|
1024
|
+
****************************************************************************/
|
1025
|
+
|
1026
|
+
/*
 * Allocate a Norm for field number field_num that reads from `is`.
 * The norm starts with no cached bytes and is not dirty.
 */
Norm *norm_create(InStream *is, int field_num)
{
    Norm *n = ALLOC(Norm);

    n->bytes = NULL;
    n->is_dirty = false;
    n->field_num = field_num;
    n->is = is;

    return n;
}
|
1035
|
+
|
1036
|
+
void norm_destroy(void *p)
|
1037
|
+
{
|
1038
|
+
Norm *norm = (Norm *)p;
|
1039
|
+
is_close(norm->is);
|
1040
|
+
if (norm->bytes != NULL) free(norm->bytes);
|
1041
|
+
free(norm);
|
1042
|
+
}
|
1043
|
+
|
1044
|
+
/*
 * Write the cached norm bytes back to `store`.
 *
 * Does nothing when no bytes are cached. Otherwise doc_count bytes are
 * written to "<segment>.tmp", which is then renamed to
 * "<segment>.s<field_num>" when cfs_store is non-NULL, or to
 * "<segment>.f<field_num>" otherwise. The norm is marked clean afterwards.
 */
void norm_rewrite(Norm *norm, Store *store, char *segment,
                  int doc_count, Store *cfs_store)
{
    char tmp_name[SEGMENT_NAME_MAX_LENGTH];
    char final_name[SEGMENT_NAME_MAX_LENGTH];
    OutStream *out;

    if (norm->bytes == NULL) {
        return; /* These norms do not need to be rewritten */
    }

    sprintf(tmp_name, "%s.tmp", segment);
    out = store->create_output(store, tmp_name);
    os_write_bytes(out, norm->bytes, doc_count);
    os_close(out);

    /* 's' marks a norm file kept beside a compound file, 'f' a plain one */
    sprintf(final_name, "%s.%c%d", segment, cfs_store ? 's' : 'f',
            norm->field_num);

    store->rename(store, tmp_name, final_name);
    norm->is_dirty = false;
}
|
1064
|
+
|
1065
|
+
/****************************************************************************
|
1066
|
+
*
|
1067
|
+
* SegmentReader
|
1068
|
+
*
|
1069
|
+
****************************************************************************/
|
1070
|
+
|
1071
|
+
/* Declares a local `sr` pointing at the SegmentReader stored in ir->data.
 * Requires an `ir` in scope. The expansion already ends in ';', so writing
 * `GET_SR;` leaves one extra empty statement. */
#define GET_SR SegmentReader *sr = (SegmentReader *)ir->data;
|
1072
|
+
|
1073
|
+
/*
 * Return the segment's maximum document number, taken from the length of
 * its fields reader (sr->fr->len).
 */
int sr_max_doc(IndexReader *ir)
{
    SegmentReader *sr = (SegmentReader *)ir->data;

    return sr->fr->len;
}
|
1077
|
+
|
1078
|
+
/*
 * Destroy the reader's norms hash (sr->norms).
 */
static inline void sr_close_norms(SegmentReader *sr)
{
    h_destroy(sr->norms);
}
|
1082
|
+
|
1083
|
+
/*
 * Return the calling thread's TermVectorsReader for this segment.
 *
 * The reader is looked up in the thread-specific slot sr->thread_tvr. On
 * first use by a thread, sr->orig_tvr is cloned, the clone is recorded in
 * sr->tvr_bucket and stored in the thread-specific slot.
 *
 * Returns NULL if cloning fails. In that case nothing is recorded; a NULL
 * clone used to be appended to the bucket and a debug message printed.
 */
static inline TermVectorsReader *sr_tvr(SegmentReader *sr)
{
    TermVectorsReader *tvr = thread_getspecific(sr->thread_tvr);

    if (tvr == NULL) {
        tvr = tvr_clone(sr->orig_tvr);
        if (tvr == NULL) {
            return NULL;
        }
        ary_append(sr->tvr_bucket, tvr);
        thread_setspecific(sr->thread_tvr, tvr);
    }
    return tvr;
}
|
1094
|
+
|
1095
|
+
/*
 * Release everything owned by the SegmentReader behind ir: fields reader,
 * term infos reader, freq/prox streams, field infos, norms, term vector
 * readers (original, thread key and per-thread clones), deleted-docs bit
 * vector, compound store, fake norms, the segment name and the
 * SegmentReader struct itself. The IndexReader struct is not freed here.
 */
void sr_close(IndexReader *ir)
{
    SegmentReader *sr = (SegmentReader *)ir->data;

    fr_close(sr->fr);
    tir_close(sr->tir);

    if (sr->freq_in) {
        is_close(sr->freq_in);
    }
    if (sr->prox_in) {
        is_close(sr->prox_in);
    }
    fis_destroy(sr->fis);

    sr_close_norms(sr);

    /* thread_tvr and tvr_bucket only exist when term vectors are stored */
    if (sr->orig_tvr) {
        tvr_close(sr->orig_tvr);
        thread_key_delete(sr->thread_tvr);
        ary_destroy(sr->tvr_bucket);
    }
    if (sr->deleted_docs) {
        bv_destroy(sr->deleted_docs);
    }
    if (sr->cfs_store) {
        sr->cfs_store->close(sr->cfs_store);
    }
    if (sr->fake_norms) {
        free(sr->fake_norms);
    }
    free(sr->segment);
    free(sr);
}
|
1118
|
+
|
1119
|
+
/*
 * Mark document doc_num as deleted in this segment. The deleted-docs bit
 * vector is created on first use; the deletion state is flagged dirty and
 * any pending "undelete all" is cancelled.
 */
void sr_delete_doc(IndexReader *ir, int doc_num)
{
    SegmentReader *sr = (SegmentReader *)ir->data;

    if (sr->deleted_docs == NULL) {
        sr->deleted_docs = bv_create();
    }
    bv_set(sr->deleted_docs, doc_num);

    sr->undelete_all = false;
    sr->deleted_docs_dirty = true;
}
|
1129
|
+
|
1130
|
+
/*
 * Lock-free check whether doc_num is marked deleted. Returns false when
 * the segment has no deleted-docs bit vector.
 */
static inline bool sr_is_deleted_internal(IndexReader *ir, int doc_num)
{
    SegmentReader *sr = (SegmentReader *)ir->data;

    if (sr->deleted_docs == NULL) {
        return false;
    }
    return (bv_get(sr->deleted_docs, doc_num));
}
|
1135
|
+
|
1136
|
+
/*
 * Report whether doc_num is marked deleted; the check is made while
 * holding ir->mutex.
 */
bool sr_is_deleted(IndexReader *ir, int doc_num)
{
    bool deleted;

    mutex_lock(&ir->mutex);
    deleted = sr_is_deleted_internal(ir, doc_num);
    mutex_unlock(&ir->mutex);

    return deleted;
}
|
1146
|
+
|
1147
|
+
/*
 * Report whether the norms hash has an entry for `field`; the lookup is
 * made while holding ir->mutex.
 */
bool sr_has_norms(IndexReader *ir, char *field)
{
    SegmentReader *sr = (SegmentReader *)ir->data;
    bool found;

    mutex_lock(&ir->mutex);
    found = h_has_key(sr->norms, field);
    mutex_unlock(&ir->mutex);

    return found;
}
|
1157
|
+
|
1158
|
+
/*
 * Return true when the segment has a deleted-docs bit vector.
 */
bool sr_has_deletions(IndexReader *ir)
{
    SegmentReader *sr = (SegmentReader *)ir->data;

    return (sr->deleted_docs != NULL);
}
|
1163
|
+
|
1164
|
+
/*
 * Undelete every document in the segment: the deleted-docs bit vector is
 * destroyed and the undelete_all flag is set (sr_commit acts on that flag).
 * The dirty flag for deletions is cleared since there is nothing left to
 * write.
 */
void sr_undelete_all(IndexReader *ir)
{
    SegmentReader *sr = (SegmentReader *)ir->data;

    if (sr->deleted_docs != NULL) {
        bv_destroy(sr->deleted_docs);
    }
    sr->deleted_docs = NULL;
    sr->deleted_docs_dirty = false;
    sr->undelete_all = true;
}
|
1172
|
+
|
1173
|
+
/*
 * Return a clone of the term infos reader's original term enum.
 */
TermEnum *sr_terms(IndexReader *ir)
{
    SegmentReader *sr = (SegmentReader *)ir->data;
    TermEnum *orig = sr->tir->orig_te;

    return orig->clone(orig);
}
|
1178
|
+
|
1179
|
+
/*
 * Return a clone of the original term enum after te_skip_to() has been
 * called on it with `term`.
 */
TermEnum *sr_terms_from(IndexReader *ir, Term *term)
{
    SegmentReader *sr = (SegmentReader *)ir->data;
    TermEnum *orig = sr->tir->orig_te;
    TermEnum *positioned = orig->clone(orig);

    te_skip_to(positioned, term);
    return positioned;
}
|
1186
|
+
|
1187
|
+
/*
 * Load document doc_num from the segment's fields reader.
 *
 * Raises STATE_ERROR through eprintf() when the document is marked
 * deleted; the mutex is released before doing so.
 * NOTE(review): this relies on eprintf() not returning for STATE_ERROR -
 * if it does return, the code below runs and unlocks a second time.
 */
Document *sr_get_doc(IndexReader *ir, int doc_num)
{
    Document *doc;
    mutex_lock(&ir->mutex);
    if (sr_is_deleted_internal(ir, doc_num)) {
        mutex_unlock(&ir->mutex);
        /* doc_num is an int, so it must be printed with %d (was %ld) */
        eprintf(STATE_ERROR,
                "Tried to get doc <%d> that has already been deleted", doc_num);
    }
    GET_SR;
    doc = fr_get_doc(sr->fr, doc_num);
    mutex_unlock(&ir->mutex);
    return doc;
}
|
1201
|
+
|
1202
|
+
static inline void
|
1203
|
+
sr_get_norms_into_internal(IndexReader *ir, char *field, uchar *buf, int offset)
|
1204
|
+
{
|
1205
|
+
GET_SR;
|
1206
|
+
Norm *norm = h_get(sr->norms, field);
|
1207
|
+
if (norm == NULL) {
|
1208
|
+
memset(buf + offset*sizeof(uchar), 0, sr_max_doc(ir)*sizeof(uchar));
|
1209
|
+
} else if (norm->bytes != NULL) { // can copy from cache
|
1210
|
+
memcpy(buf + offset*sizeof(uchar), norm->bytes, sr_max_doc(ir)*sizeof(uchar));
|
1211
|
+
} else {
|
1212
|
+
InStream *norm_in = is_clone(norm->is);
|
1213
|
+
// read from disk
|
1214
|
+
is_seek(norm_in, 0);
|
1215
|
+
is_read_bytes(norm_in, buf, offset, sr_max_doc(ir));
|
1216
|
+
is_close(norm_in);
|
1217
|
+
}
|
1218
|
+
}
|
1219
|
+
|
1220
|
+
/*
 * Locked wrapper around sr_get_norms_into_internal(): copies the norms of
 * `field` into buf at `offset` while holding ir->mutex.
 */
void sr_get_norms_into(IndexReader *ir, char *field, uchar *buf, int offset)
{
    mutex_lock(&ir->mutex);
    {
        sr_get_norms_into_internal(ir, field, buf, offset);
    }
    mutex_unlock(&ir->mutex);
}
|
1226
|
+
|
1227
|
+
/*
 * Return the cached norm bytes for `field`, loading and caching them on
 * first access. Returns NULL when the field has no entry in the norms
 * hash. Does not take the reader mutex. The returned buffer stays owned by
 * the Norm.
 */
static inline uchar *sr_get_norms_internal(IndexReader *ir, char *field)
{
    SegmentReader *sr = (SegmentReader *)ir->data;
    Norm *norm = h_get(sr->norms, field);

    if (norm == NULL) {                     /* not an indexed field */
        return NULL;
    }

    if (norm->bytes == NULL) {              /* value not yet read */
        uchar *loaded = ALLOC_N(uchar, ir->max_doc(ir));
        sr_get_norms_into_internal(ir, field, loaded, 0);
        norm->bytes = loaded;               /* cache it */
    }
    return norm->bytes;
}
|
1241
|
+
|
1242
|
+
/*
 * Locked wrapper around sr_get_norms_internal(). Returns the cached norm
 * bytes for `field`, or NULL when the field has no norms entry.
 */
uchar *sr_get_norms(IndexReader *ir, char *field)
{
    uchar *result;

    mutex_lock(&ir->mutex);
    result = sr_get_norms_internal(ir, field);
    mutex_unlock(&ir->mutex);

    return result;
}
|
1250
|
+
|
1251
|
+
/*
 * Like sr_get_norms(), but never returns NULL: when the field has no norms
 * a zero-filled array of max_doc bytes is returned instead. That array is
 * allocated once, kept in sr->fake_norms and shared by later calls.
 */
static inline uchar *sr_get_norms_always(IndexReader *ir, char *field)
{
    SegmentReader *sr = (SegmentReader *)ir->data;
    uchar *result;

    mutex_lock(&ir->mutex);

    result = sr_get_norms_internal(ir, field);
    if (result == NULL) {
        if (!sr->fake_norms) {
            int len = ir->max_doc(ir);
            sr->fake_norms = ALLOC_N(uchar, len);
            memset(sr->fake_norms, 0, len);
        }
        result = sr->fake_norms;
    }

    mutex_unlock(&ir->mutex);
    return result;
}
|
1270
|
+
|
1271
|
+
/*
 * Set the norm byte of document doc_num in `field` to val. Fields without
 * an entry in the norms hash are ignored. The norm and the reader are both
 * flagged dirty so that sr_commit() rewrites the norms.
 */
void sr_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
{
    SegmentReader *sr = (SegmentReader *)ir->data;
    Norm *norm = h_get(sr->norms, field);
    uchar *bytes;

    if (norm == NULL) {
        return;                             /* not an indexed field */
    }

    norm->is_dirty = true;                  /* mark it dirty */
    sr->norms_dirty = true;

    bytes = sr_get_norms_internal(ir, field);
    bytes[doc_num] = val;
}
|
1284
|
+
|
1285
|
+
/*
 * Return the document frequency recorded for term t in this segment, or 0
 * when the term infos reader has no entry for it. The TermInfo returned by
 * tir_get_ti() is destroyed before returning.
 */
int sr_doc_freq(IndexReader *ir, Term *t)
{
    SegmentReader *sr = (SegmentReader *)ir->data;
    TermInfo *info = tir_get_ti(sr->tir, t);
    int freq;

    if (info == NULL) {
        return 0;
    }
    freq = info->doc_freq;
    ti_destroy(info);
    return freq;
}
|
1295
|
+
|
1296
|
+
/*
 * Build the list of files that make up this segment and exist in
 * ir->store.
 *
 * Candidates are "<segment>.<ext>" for every entry of INDEX_EXTENSIONS,
 * plus one norm file per indexed field that does not omit norms
 * ("<segment>.s<N>" when the reader uses a compound store,
 * "<segment>.f<N>" otherwise). Names are duplicated with estrdup(); the
 * returned Array was created with efree as its element destructor.
 */
Array *sr_file_names(IndexReader *ir)
{
    SegmentReader *sr = (SegmentReader *)ir->data;
    Array *names = ary_create(0, &efree);
    char fname[SEGMENT_NAME_MAX_LENGTH];
    int i;

    for (i = 0; i < NELEMS(INDEX_EXTENSIONS); i++) {
        sprintf(fname, "%s.%s", sr->segment, INDEX_EXTENSIONS[i]);
        if (ir->store->exists(ir->store, fname)) {
            ary_append(names, estrdup(fname));
        }
    }

    for (i = 0; i < sr->fis->fcnt; i++) {
        FieldInfo *fi = sr->fis->by_number[i];

        if (!fi->is_indexed || fi->omit_norms) {
            continue;
        }
        sprintf(fname, "%s.%c%d", sr->segment, sr->cfs_store ? 's' : 'f', i);
        if (ir->store->exists(ir->store, fname)) {
            ary_append(names, estrdup(fname));
        }
    }
    return names;
}
|
1324
|
+
|
1325
|
+
/*
 * Collect the names of the segment's fields that match field_type and
 * return them in a new HashSet (created with hs_str_create(NULL)).
 *
 * field_type is one of the IR_* selectors handled below; any other value
 * is reported through eprintf(ARG_ERROR, ...).
 *
 * Fixes relative to the previous version:
 *  - IR_TERM_VECTOR_WITH_OFFSET fell through into the
 *    IR_TERM_VECTOR_WITH_POSITION_OFFSET case, so fields storing both
 *    positions and offsets were wrongly included; a break was added.
 *  - field_type is an int and is now printed with %d instead of %ld.
 */
HashSet *sr_get_field_names(IndexReader *ir, int field_type)
{
    int i;
    GET_SR;
    HashSet *field_set = hs_str_create(NULL);
    FieldInfo *fi;
    for (i = 0; i < sr->fis->fcnt; i++) {
        fi = sr->fis->by_number[i];
        switch(field_type) {
            case IR_ALL:
                hs_add(field_set, fi->name);
                break;
            case IR_UNINDEXED:
                if (!fi->is_indexed) hs_add(field_set, fi->name);
                break;
            case IR_INDEXED:
                if (fi->is_indexed) hs_add(field_set, fi->name);
                break;
            case IR_INDEXED_NO_TERM_VECTOR:
                if (fi->is_indexed && !fi->store_tv) hs_add(field_set, fi->name);
                break;
            case IR_TERM_VECTOR:
                /* term vector without positions or offsets */
                if (fi->store_tv && !fi->store_pos && !fi->store_offset)
                    hs_add(field_set, fi->name);
                break;
            case IR_INDEXED_WITH_TERM_VECTOR:
                if (fi->is_indexed && fi->store_tv) hs_add(field_set, fi->name);
                break;
            case IR_TERM_VECTOR_WITH_POSITION:
                if (fi->store_pos && !fi->store_offset) hs_add(field_set, fi->name);
                break;
            case IR_TERM_VECTOR_WITH_OFFSET:
                if (!fi->store_pos && fi->store_offset) hs_add(field_set, fi->name);
                break;
            case IR_TERM_VECTOR_WITH_POSITION_OFFSET:
                if (fi->store_pos && fi->store_offset) hs_add(field_set, fi->name);
                break;
            default:
                eprintf(ARG_ERROR, "Invalid field_type <%d>.", field_type);
        }
    }
    return field_set;
}
|
1367
|
+
|
1368
|
+
/*
 * Return the number of live documents: sr_max_doc() minus the count of the
 * deleted-docs bit vector, if there is one. Computed under ir->mutex.
 */
int sr_num_docs(IndexReader *ir)
{
    SegmentReader *sr = (SegmentReader *)ir->data;
    int live;

    mutex_lock(&ir->mutex);
    live = sr_max_doc(ir);
    if (sr->deleted_docs != NULL) {
        live -= sr->deleted_docs->count;
    }
    mutex_unlock(&ir->mutex);

    return live;
}
|
1379
|
+
|
1380
|
+
/*
 * Create a term-doc enumerator for this segment (see stde_create()).
 */
TermDocEnum *sr_term_docs(IndexReader *ir)
{
    TermDocEnum *tde = stde_create(ir);

    return tde;
}
|
1384
|
+
|
1385
|
+
/*
 * Create a term-position enumerator for this segment (see stpe_create()).
 */
TermDocEnum *sr_term_positions(IndexReader *ir)
{
    TermDocEnum *tpe = stpe_create(ir);

    return tpe;
}
|
1389
|
+
|
1390
|
+
/*
 * Open a Norm for every indexed field that does not omit norms and store
 * it in sr->norms under the field's name.
 *
 * For each such field "<segment>.s<N>" is opened from ir->store if it
 * exists there; otherwise "<segment>.f<N>" is opened from cfs_store.
 * The reader's norms_dirty flag is cleared at the end.
 */
void sr_open_norms(IndexReader *ir, Store *cfs_store)
{
    SegmentReader *sr = (SegmentReader *)ir->data;
    char fname[SEGMENT_NAME_MAX_LENGTH];
    int i;

    for (i = 0; i < sr->fis->fcnt; i++) {
        FieldInfo *fi = sr->fis->by_number[i];
        Store *norm_store = ir->store;

        if (!fi->is_indexed || fi->omit_norms) {
            continue;
        }

        sprintf(fname, "%s.s%d", sr->segment, fi->number);
        if (!norm_store->exists(norm_store, fname)) {
            /* no separate norm file: fall back to the per-field file */
            sprintf(fname, "%s.f%d", sr->segment, fi->number);
            norm_store = cfs_store;
        }

        h_set(sr->norms, fi->name,
              norm_create(norm_store->open_input(norm_store, fname),
                          fi->number));
    }
    sr->norms_dirty = false;
}
|
1412
|
+
|
1413
|
+
/*
 * Return the term vector of `field` in document doc_num, or NULL when the
 * field is unknown, does not store term vectors, the segment has no term
 * vectors reader, or no per-thread reader could be obtained.
 */
TermVector *sr_get_term_vector(IndexReader *ir, int doc_num, char *field)
{
    SegmentReader *sr = (SegmentReader *)ir->data;
    FieldInfo *fi = (FieldInfo *)ht_get(sr->fis->by_name, field);
    TermVectorsReader *tvr;

    if (fi == NULL || !fi->store_tv) {
        return NULL;
    }
    if (!sr->orig_tvr) {
        return NULL;
    }
    tvr = sr_tvr(sr);
    if (!tvr) {
        return NULL;
    }

    return tvr_get_field_tv(tvr, doc_num, field);
}
|
1424
|
+
|
1425
|
+
/*
 * Return all term vectors of document doc_num (see tvr_get_tv()), or NULL
 * when the segment has no term vectors reader or no per-thread reader
 * could be obtained.
 */
Array *sr_get_term_vectors(IndexReader *ir, int doc_num)
{
    SegmentReader *sr = (SegmentReader *)ir->data;
    TermVectorsReader *tvr;

    if (sr->orig_tvr == NULL) {
        return NULL;
    }
    tvr = sr_tvr(sr);
    if (tvr == NULL) {
        return NULL;
    }

    return tvr_get_tv(tvr, doc_num);
}
|
1434
|
+
|
1435
|
+
/*
 * Write pending changes of this segment to ir->store.
 *
 *  - If deletions are dirty, the bit vector is written to "<segment>.tmp"
 *    and renamed to "<segment>.del".
 *  - If an "undelete all" is pending and "<segment>.del" exists, it is
 *    removed.
 *  - If norms are dirty, every indexed field that has a Norm is passed to
 *    norm_rewrite().
 * All three dirty/pending flags are cleared afterwards.
 *
 * Indexed fields that omit norms have no entry in sr->norms (sr_open_norms
 * does not register them), so a NULL lookup result is skipped instead of
 * being handed to norm_rewrite(), which would dereference it.
 */
void sr_commit(IndexReader *ir)
{
    GET_SR;
    char tmp_fname[SEGMENT_NAME_MAX_LENGTH];
    char del_fname[SEGMENT_NAME_MAX_LENGTH];
    sprintf(del_fname, "%s.del", sr->segment);

    if (sr->deleted_docs_dirty) { /* re-write deleted */
        sprintf(tmp_fname, "%s.tmp", sr->segment);
        bv_write(sr->deleted_docs, ir->store, tmp_fname);
        ir->store->rename(ir->store, tmp_fname, del_fname);
    }
    if (sr->undelete_all && ir->store->exists(ir->store, del_fname)) {
        ir->store->remove(ir->store, del_fname);
    }
    if (sr->norms_dirty) { /* re-write norms */
        int i;
        for (i = 0; i < sr->fis->fcnt; i++) {
            FieldInfo *fi = sr->fis->by_number[i];
            Norm *norm;

            if (!fi->is_indexed) {
                continue;
            }
            norm = (Norm *)h_get(sr->norms, fi->name);
            if (norm == NULL) {
                continue;   /* field has no norms (e.g. omit_norms) */
            }
            norm_rewrite(norm, ir->store,
                         sr->segment, sr_max_doc(ir), sr->cfs_store);
        }
    }
    sr->deleted_docs_dirty = false;
    sr->norms_dirty = false;
    sr->undelete_all = false;
}
|
1464
|
+
|
1465
|
+
/*
 * Turn the freshly created IndexReader `ir` into a segment reader for `si`.
 *
 * Installs the sr_* implementations in ir's function table, allocates the
 * SegmentReader and stores it in ir->data, then opens the segment's parts.
 * If "<segment>.cfs" exists in si->store, a compound store is opened on it
 * and the remaining files (other than the ".del" file, which is always
 * read from si->store) are opened from that compound store.
 * Returns ir.
 */
IndexReader *sr_open_internal(IndexReader *ir, SegmentInfo *si)
{
    char fname[SEGMENT_NAME_MAX_LENGTH];
    Store *store = si->store;
    SegmentReader *sr = ALLOC(SegmentReader);

    /* documents and counts */
    ir->num_docs = sr_num_docs;
    ir->max_doc = sr_max_doc;
    ir->get_doc = sr_get_doc;
    /* term vectors */
    ir->get_term_vector = sr_get_term_vector;
    ir->get_term_vectors = sr_get_term_vectors;
    /* norms */
    ir->get_norms_into = sr_get_norms_into;
    ir->get_norms = sr_get_norms;
    ir->get_norms_always = sr_get_norms_always;
    ir->do_set_norm = sr_set_norm;
    ir->has_norms = sr_has_norms;
    /* terms */
    ir->terms = sr_terms;
    ir->terms_from = sr_terms_from;
    ir->doc_freq = sr_doc_freq;
    ir->term_docs = sr_term_docs;
    ir->term_positions = sr_term_positions;
    /* deletions */
    ir->do_delete_doc = sr_delete_doc;
    ir->is_deleted = sr_is_deleted;
    ir->has_deletions = sr_has_deletions;
    ir->do_undelete_all = sr_undelete_all;
    /* misc */
    ir->get_field_names = sr_get_field_names;
    ir->do_commit = sr_commit;
    ir->do_close = sr_close;
    ir->data = sr;

    sr->segment = estrdup(si->name);
    sr->cfs_store = NULL;
    sr->fake_norms = NULL;

    /* switch to the compound store when the segment has one */
    sprintf(fname, "%s.cfs", sr->segment);
    if (store->exists(store, fname)) {
        sr->cfs_store = open_cmpd_store(store, fname);
        store = sr->cfs_store;
    }

    sprintf(fname, "%s.fnm", sr->segment);
    sr->fis = fis_open(store, fname);
    sr->fr = fr_open(store, sr->segment, sr->fis);
    sr->tir = tir_open(store, sr->segment, sr->fis);

    sr->deleted_docs = NULL;
    sr->deleted_docs_dirty = false;
    sr->undelete_all = false;
    if (si_has_deletions(si)) {
        /* the deletions file is read from si->store, not the compound store */
        sprintf(fname, "%s.del", sr->segment);
        sr->deleted_docs = bv_read(si->store, fname);
    }

    sprintf(fname, "%s.frq", sr->segment);
    sr->freq_in = store->open_input(store, fname);
    sprintf(fname, "%s.prx", sr->segment);
    sr->prox_in = store->open_input(store, fname);

    sr->norms = h_new_str(NULL, &norm_destroy);
    sr_open_norms(ir, store);

    if (!fis_has_vectors(sr->fis)) {
        sr->orig_tvr = NULL;
    } else {
        sr->orig_tvr = tvr_open(store, sr->segment, sr->fis);
        thread_key_create(&sr->thread_tvr, NULL);
        /* per-thread clones are collected here and closed with the bucket */
        sr->tvr_bucket = ary_create(1, (destroy_func_t)&tvr_close);
    }
    return ir;
}
|
1531
|
+
|
1532
|
+
/*
 * Open a segment reader directly on a SegmentInfo. The IndexReader is
 * created on si->store with no SegmentInfos and with the is_owner and
 * close_store arguments of ir_create() both false.
 */
IndexReader *sr_open_si(SegmentInfo *si)
{
    IndexReader *reader = ir_create(si->store, NULL, false, false);

    sr_open_internal(reader, si);
    return reader;
}
|
1537
|
+
|
1538
|
+
/*
 * Open a segment reader on segment number si_num of `sis`. is_owner and
 * close_store are passed through to ir_create().
 */
IndexReader *sr_open(SegmentInfos *sis, int si_num, int is_owner, int close_store)
{
    SegmentInfo *seg_info = sis->segs[si_num];
    IndexReader *reader = ir_create(seg_info->store, sis, is_owner, close_store);

    sr_open_internal(reader, seg_info);
    return reader;
}
|
1544
|
+
/****************************************************************************
|
1545
|
+
*
|
1546
|
+
* MultiReader
|
1547
|
+
*
|
1548
|
+
****************************************************************************/
|
1549
|
+
|
1550
|
+
/* Declares a local `mr` pointing at the MultiReader stored in ir->data.
 * Requires an `ir` in scope; unlike GET_SR there is no trailing ';'. */
#define GET_MR MultiReader *mr = (MultiReader *)ir->data
|
1551
|
+
/* Declares three locals for a multi-reader method working on doc_num:
 *   mr     - the MultiReader stored in ir->data,
 *   i      - index of the sub-reader found by mr_reader_index(),
 *   reader - that sub-reader.
 * Requires an `ir` in scope. The expansion ends in ';'. */
#define GET_READER(doc_num) \
    MultiReader *mr = (MultiReader *)ir->data;\
    int i = mr_reader_index(mr, doc_num);\
    IndexReader *reader = mr->sub_readers[i];
|
1554
|
+
|
1555
|
+
|
1556
|
+
|
1557
|
+
/*
 * Find the index of the sub-reader that contains doc_num by binary search
 * over mr->starts.
 *
 * On an exact match with a start value, the last sub-reader sharing that
 * start is returned (earlier ones with the same start are empty).
 * Otherwise the index of the last start below doc_num is returned, which
 * is -1 if doc_num is smaller than every start.
 */
int mr_reader_index(MultiReader *mr, int doc_num)
{
    int lo = 0;
    int hi = mr->rcnt - 1;

    while (lo <= hi) {
        int mid = lo + (hi - lo) / 2;
        int start = mr->starts[mid];

        if (doc_num == start) {
            /* scan to last match in case we have empty segments */
            while (mid + 1 < mr->rcnt && mr->starts[mid + 1] == start) {
                mid++;
            }
            return mid;
        }

        if (doc_num < start) {
            hi = mid - 1;
        } else {
            lo = mid + 1;
        }
    }
    return hi;
}
|
1579
|
+
|
1580
|
+
/*
 * Fetch the term vector of `field` for doc_num from the sub-reader that
 * holds the document, translating doc_num into that reader's numbering.
 */
TermVector *mr_get_term_vector(IndexReader *ir, int doc_num, char *field)
{
    MultiReader *mr = (MultiReader *)ir->data;
    int idx = mr_reader_index(mr, doc_num);
    IndexReader *sub = mr->sub_readers[idx];

    return sub->get_term_vector(sub, doc_num - mr->starts[idx], field);
}
|
1585
|
+
|
1586
|
+
/*
 * Fetch all term vectors for doc_num from the sub-reader that holds the
 * document, translating doc_num into that reader's numbering.
 */
Array *mr_get_term_vectors(IndexReader *ir, int doc_num)
{
    MultiReader *mr = (MultiReader *)ir->data;
    int idx = mr_reader_index(mr, doc_num);
    IndexReader *sub = mr->sub_readers[idx];

    return sub->get_term_vectors(sub, doc_num - mr->starts[idx]);
}
|
1591
|
+
|
1592
|
+
/*
 * Return the number of live documents across all sub-readers. The sum is
 * cached in mr->num_docs_cache; a cache value of -1 means "recompute".
 * Runs under ir->mutex.
 */
int mr_num_docs(IndexReader *ir)
{
    MultiReader *mr = (MultiReader *)ir->data;
    int total;

    mutex_lock(&ir->mutex);
    if (mr->num_docs_cache == -1) {
        int r;

        mr->num_docs_cache = 0;
        for (r = 0; r < mr->rcnt; r++) {
            IndexReader *sub = mr->sub_readers[r];
            mr->num_docs_cache += sub->num_docs(sub);
        }
    }
    total = mr->num_docs_cache;
    mutex_unlock(&ir->mutex);

    return total;
}
|
1610
|
+
|
1611
|
+
/*
 * Return the multi-reader's max_doc field.
 */
int mr_max_doc(IndexReader *ir)
{
    MultiReader *multi = (MultiReader *)ir->data;

    return multi->max_doc;
}
|
1616
|
+
|
1617
|
+
/*
 * Load document doc_num from the sub-reader that holds it, translating
 * doc_num into that reader's numbering.
 */
Document *mr_get_doc(IndexReader *ir, int doc_num)
{
    MultiReader *mr = (MultiReader *)ir->data;
    int idx = mr_reader_index(mr, doc_num);
    IndexReader *sub = mr->sub_readers[idx];

    return sub->get_doc(sub, doc_num - mr->starts[idx]);
}
|
1622
|
+
|
1623
|
+
/*
 * Copy the norms of `field` for the whole multi-reader into buf starting
 * at `offset`. If the norms are in mr->norms_cache, mr->max_doc bytes are
 * copied from there; otherwise every sub-reader writes its own norms at
 * offset + its start. Runs under ir->mutex.
 */
void mr_get_norms_into(IndexReader *ir, char *field, uchar *buf, int offset)
{
    MultiReader *mr = (MultiReader *)ir->data;
    uchar *cached;

    mutex_lock(&ir->mutex);
    cached = h_get(mr->norms_cache, field);
    if (cached == NULL) {
        int r;
        for (r = 0; r < mr->rcnt; r++) {
            IndexReader *sub = mr->sub_readers[r];
            sub->get_norms_into(sub, field, buf, offset + mr->starts[r]);
        }
    } else {
        memcpy(buf + offset, cached, mr->max_doc);
    }
    mutex_unlock(&ir->mutex);
}
|
1641
|
+
|
1642
|
+
/*
 * Return the norms of `field` for the whole multi-reader. On a cache miss
 * a buffer of mr->max_doc bytes is allocated, filled by the sub-readers at
 * their start offsets and stored in mr->norms_cache under `field`.
 * Runs under ir->mutex.
 */
uchar *mr_get_norms(IndexReader *ir, char *field)
{
    MultiReader *mr = (MultiReader *)ir->data;
    uchar *norms;

    mutex_lock(&ir->mutex);
    norms = h_get(mr->norms_cache, field);
    if (norms == NULL) {
        int r;

        norms = ALLOC_N(uchar, mr->max_doc);
        for (r = 0; r < mr->rcnt; r++) {
            IndexReader *sub = mr->sub_readers[r];
            sub->get_norms_into(sub, field, norms, mr->starts[r]);
        }
        h_set(mr->norms_cache, field, norms);   /* update cache */
    }
    mutex_unlock(&ir->mutex);

    return norms;
}
|
1664
|
+
|
1665
|
+
/*
 * Set the norm of doc_num in `field` to val on the sub-reader that holds
 * the document. The cached norms for the field are dropped first so they
 * get rebuilt on the next read.
 */
void mr_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
{
    MultiReader *mr = (MultiReader *)ir->data;
    int idx = mr_reader_index(mr, doc_num);
    IndexReader *sub = mr->sub_readers[idx];

    h_del(mr->norms_cache, field);              /* clear cache */
    ir_set_norm(sub, doc_num - mr->starts[idx], field, val);
}
|
1671
|
+
|
1672
|
+
/*
 * Create a term enumerator over all sub-readers via mte_create(), passing
 * NULL as the term argument.
 */
TermEnum *mr_terms(IndexReader *ir)
{
    MultiReader *multi = (MultiReader *)ir->data;

    return mte_create(multi->sub_readers, multi->starts, multi->rcnt, NULL);
}
|
1677
|
+
|
1678
|
+
/*
 * Create a term enumerator over all sub-readers via mte_create(), passing
 * `term` as the term argument.
 */
TermEnum *mr_terms_from(IndexReader *ir, Term *term)
{
    MultiReader *multi = (MultiReader *)ir->data;

    return mte_create(multi->sub_readers, multi->starts, multi->rcnt, term);
}
|
1683
|
+
|
1684
|
+
/*
 * Return the document frequency of term t, summed over all sub-readers.
 */
int mr_doc_freq(IndexReader *ir, Term *t)
{
    MultiReader *mr = (MultiReader *)ir->data;
    int sum = 0;
    int r = 0;

    while (r < mr->rcnt) {
        IndexReader *sub = mr->sub_readers[r++];
        sum += sub->doc_freq(sub, t);
    }
    return sum;
}
|
1696
|
+
|
1697
|
+
/**
 * Create a term-document enumerator (mtde_create) spanning all sub-readers.
 **/
TermDocEnum *mr_term_docs(IndexReader *ir)
{
  GET_MR;
  TermDocEnum *tde = mtde_create(mr->sub_readers, mr->starts, mr->rcnt);

  return tde;
}
|
1702
|
+
|
1703
|
+
/**
 * Create a term-position enumerator (mtpe_create) spanning all sub-readers.
 **/
TermDocEnum *mr_term_positions(IndexReader *ir)
{
  GET_MR;
  TermDocEnum *tpe = mtpe_create(mr->sub_readers, mr->starts, mr->rcnt);

  return tpe;
}
|
1708
|
+
|
1709
|
+
/**
 * Delete document +doc_num+.
 *
 * Resets the cached document count to -1, forwards the delete to the
 * sub-reader with the document number rebased by mr->starts[i], and marks
 * the multi-reader as having deletions.
 *
 * NOTE(review): GET_READER is defined outside this section; it presumably
 * declares +mr+, +i+ and +reader+ for the sub-reader owning doc_num -- confirm.
 **/
void mr_delete_doc(IndexReader *ir, int doc_num)
{
  GET_READER(doc_num);
  int local_doc = doc_num - mr->starts[i];

  mr->num_docs_cache = -1; // cached count is no longer valid
  reader->do_delete_doc(reader, local_doc);
  mr->has_deletions = true;
}
|
1717
|
+
|
1718
|
+
/**
 * Ask the sub-reader selected by GET_READER whether document +doc_num+,
 * rebased by mr->starts[i], is deleted.
 **/
bool mr_is_deleted(IndexReader *ir, int doc_num)
{
  GET_READER(doc_num);
  int local_doc = doc_num - mr->starts[i];

  return reader->is_deleted(reader, local_doc);
}
|
1723
|
+
|
1724
|
+
/**
 * Return true as soon as any sub-reader reports norms for +field+,
 * false when none of them does.
 **/
bool mr_has_norms(IndexReader *ir, char *field)
{
  GET_MR;
  int r;

  for (r = 0; r < mr->rcnt; r++) {
    IndexReader *sub = mr->sub_readers[r];
    if (sub->has_norms(sub, field)) {
      return true;
    }
  }
  return false;
}
|
1741
|
+
|
1742
|
+
/**
 * Return the multi-reader's has_deletions flag.
 **/
bool mr_has_deletions(IndexReader *ir)
{
  GET_MR;
  bool deleted = mr->has_deletions;

  return deleted;
}
|
1747
|
+
|
1748
|
+
/**
 * Undelete every document: reset the cached document count to -1, call
 * do_undelete_all on each sub-reader, then clear the has_deletions flag.
 **/
void mr_undelete_all(IndexReader *ir)
{
  GET_MR;
  int r;

  mr->num_docs_cache = -1; // cached count is no longer valid
  for (r = 0; r < mr->rcnt; r++) {
    IndexReader *sub = mr->sub_readers[r];
    sub->do_undelete_all(sub);
  }
  mr->has_deletions = false;
}
|
1760
|
+
|
1761
|
+
/**
 * Build a new string hash set holding the field names of +field_type+
 * gathered from every sub-reader.
 *
 * NOTE(review): each sub-reader's set is handed straight to hs_merge and not
 * freed here; presumably hs_merge takes ownership of it -- confirm.
 **/
HashSet *mr_get_field_names(IndexReader *ir, int field_type)
{
  GET_MR;
  int r;
  HashSet *names = hs_str_create(NULL);

  for (r = 0; r < mr->rcnt; r++) {
    IndexReader *sub = mr->sub_readers[r];
    HashSet *sub_names = sub->get_field_names(sub, field_type);
    hs_merge(names, sub_names);
  }
  return names;
}
|
1773
|
+
|
1774
|
+
/**
 * Commit by calling do_commit on every sub-reader in order.
 **/
void mr_commit(IndexReader *ir)
{
  GET_MR;
  int r;

  for (r = 0; r < mr->rcnt; r++) {
    IndexReader *sub = mr->sub_readers[r];
    sub->do_commit(sub);
  }
}
|
1784
|
+
|
1785
|
+
/**
 * Release everything held by the MultiReader: close each sub-reader with
 * ir_close, then free the sub-reader array, the norms cache, the starts
 * array and the MultiReader struct itself.
 **/
void mr_close(IndexReader *ir)
{
  GET_MR;
  int r;

  for (r = 0; r < mr->rcnt; r++) {
    ir_close(mr->sub_readers[r]);
  }
  free(mr->sub_readers);
  h_destroy(mr->norms_cache);
  free(mr->starts);
  free(mr);
}
|
1799
|
+
|
1800
|
+
IndexReader *mr_open(Store *store,
|
1801
|
+
SegmentInfos *sis,
|
1802
|
+
IndexReader **sub_readers,
|
1803
|
+
int rcnt,
|
1804
|
+
int close_store)
|
1805
|
+
{
|
1806
|
+
int i;
|
1807
|
+
MultiReader *mr = ALLOC(MultiReader);
|
1808
|
+
IndexReader *sub_reader;
|
1809
|
+
mr->sub_readers = sub_readers;
|
1810
|
+
mr->rcnt = rcnt;
|
1811
|
+
|
1812
|
+
mr->max_doc = 0;
|
1813
|
+
mr->num_docs_cache = -1;
|
1814
|
+
mr->has_deletions = false;
|
1815
|
+
|
1816
|
+
mr->starts = ALLOC_N(int, (rcnt+1));
|
1817
|
+
for (i = 0; i < rcnt; i++) {
|
1818
|
+
sub_reader = sub_readers[i];
|
1819
|
+
mr->starts[i] = mr->max_doc;
|
1820
|
+
mr->max_doc += sub_reader->max_doc(sub_reader); // compute max_docs
|
1821
|
+
|
1822
|
+
if (sub_reader->has_deletions(sub_reader))
|
1823
|
+
mr->has_deletions = true;
|
1824
|
+
}
|
1825
|
+
mr->starts[rcnt] = mr->max_doc;
|
1826
|
+
mr->norms_cache = h_new_str(NULL, &efree);
|
1827
|
+
|
1828
|
+
IndexReader *ir = ir_create(store, sis, true, close_store);
|
1829
|
+
ir->get_term_vector = &mr_get_term_vector;
|
1830
|
+
ir->get_term_vectors = &mr_get_term_vectors;
|
1831
|
+
ir->num_docs = &mr_num_docs;
|
1832
|
+
ir->max_doc = &mr_max_doc;
|
1833
|
+
ir->get_doc = &mr_get_doc;
|
1834
|
+
ir->get_norms_into = &mr_get_norms_into;
|
1835
|
+
ir->get_norms = &mr_get_norms;
|
1836
|
+
ir->get_norms_always = &mr_get_norms;
|
1837
|
+
ir->do_set_norm = &mr_set_norm;
|
1838
|
+
ir->terms = &mr_terms;
|
1839
|
+
ir->terms_from = &mr_terms_from;
|
1840
|
+
ir->doc_freq = &mr_doc_freq;
|
1841
|
+
ir->term_docs = &mr_term_docs;
|
1842
|
+
ir->term_positions = &mr_term_positions;
|
1843
|
+
ir->do_delete_doc = &mr_delete_doc;
|
1844
|
+
ir->is_deleted = &mr_is_deleted;
|
1845
|
+
ir->has_norms = &mr_has_norms;
|
1846
|
+
ir->has_deletions = &mr_has_deletions;
|
1847
|
+
ir->do_undelete_all = &mr_undelete_all;
|
1848
|
+
ir->get_field_names = &mr_get_field_names;
|
1849
|
+
ir->do_commit = &mr_commit;
|
1850
|
+
ir->do_close = &mr_close;
|
1851
|
+
ir->data = mr;
|
1852
|
+
|
1853
|
+
return ir;
|
1854
|
+
}
|
1855
|
+
|
1856
|
+
/****************************************************************************
|
1857
|
+
*
|
1858
|
+
* SegmentMergeInfo
|
1859
|
+
*
|
1860
|
+
****************************************************************************/
|
1861
|
+
|
1862
|
+
bool smi_lt(void *p1, void *p2)
|
1863
|
+
{
|
1864
|
+
SegmentMergeInfo *smi1 = (SegmentMergeInfo *)p1;
|
1865
|
+
SegmentMergeInfo *smi2 = (SegmentMergeInfo *)p2;
|
1866
|
+
|
1867
|
+
int cmpres = tb_cmp(smi1->tb, smi2->tb);
|
1868
|
+
if (cmpres == 0) {
|
1869
|
+
return smi1->base < smi2->base;
|
1870
|
+
} else {
|
1871
|
+
return cmpres < 0;
|
1872
|
+
}
|
1873
|
+
}
|
1874
|
+
|
1875
|
+
/**
 * Return the segment's document map, building it on first use.
 *
 * A map is only built when the reader has deletions and no map exists yet.
 * Entry k is -1 for a deleted document, otherwise the count of undeleted
 * documents that precede k. Returns smi->doc_map, which stays NULL when the
 * reader has no deletions.
 **/
int *smi_load_doc_map(SegmentMergeInfo *smi)
{
  IndexReader *ir = smi->ir;
  int max_doc, doc, next_num;

  if (!ir->has_deletions(ir) || (smi->doc_map != NULL)) {
    return smi->doc_map;
  }

  max_doc = ir->max_doc(ir);
  smi->doc_map = ALLOC_N(int, max_doc);
  next_num = 0;
  for (doc = 0; doc < max_doc; doc++) {
    if (ir->is_deleted(ir, doc)) {
      smi->doc_map[doc] = -1;
    } else {
      smi->doc_map[doc] = next_num;
      next_num++;
    }
  }
  return smi->doc_map;
}
|
1892
|
+
|
1893
|
+
/**
 * Allocate a SegmentMergeInfo for reader +ir+ with document base +base+ and
 * term enumerator +te+. The term buffer starts at te->tb_curr, a position
 * enumerator is opened on the reader, and no document map is loaded yet.
 **/
SegmentMergeInfo *smi_create(int base, TermEnum *te, IndexReader *ir)
{
  SegmentMergeInfo *info = ALLOC(SegmentMergeInfo);

  info->base = base;
  info->ir = ir;
  info->te = te;
  info->tb = te->tb_curr;
  info->postings = ir->term_positions(ir);
  info->doc_map = NULL; // built on demand by smi_load_doc_map

  return info;
}
|
1904
|
+
|
1905
|
+
void smi_destroy(void *p)
|
1906
|
+
{
|
1907
|
+
SegmentMergeInfo *smi = (SegmentMergeInfo *)p;
|
1908
|
+
smi->postings->close(smi->postings);
|
1909
|
+
smi->te->close(smi->te);
|
1910
|
+
if (smi->doc_map != NULL)
|
1911
|
+
free(smi->doc_map);
|
1912
|
+
free(smi);
|
1913
|
+
}
|
1914
|
+
|
1915
|
+
/**
 * Advance the segment's term enumerator, store the result in smi->tb and
 * return it.
 **/
TermBuffer *smi_next(SegmentMergeInfo *smi)
{
  TermEnum *te = smi->te;

  smi->tb = te->next(te);
  return smi->tb;
}
|
1919
|
+
|
1920
|
+
/****************************************************************************
|
1921
|
+
*
|
1922
|
+
* SegmentMerger
|
1923
|
+
*
|
1924
|
+
****************************************************************************/
|
1925
|
+
|
1926
|
+
/**
 * Allocate a SegmentMerger that writes to +store+ under segment +name+
 * (the name is duplicated). The reader array is created with ir_destroy as
 * its element destructor; the output streams, term infos writer, queue and
 * field infos are left NULL, and skip_interval starts at -1.
 **/
SegmentMerger *sm_create(Store *store, char *name, int term_index_interval)
{
  SegmentMerger *merger = ALLOC(SegmentMerger);

  merger->store = store;
  merger->name = estrdup(name);
  merger->readers = ary_create(config.merge_factor, &ir_destroy);

  // nothing is open yet
  merger->fis = NULL;
  merger->freq_out = NULL;
  merger->prox_out = NULL;
  merger->tiw = NULL;
  merger->queue = NULL;

  merger->ti = ti_create(0, 0, 0, 0);
  merger->term_index_interval = term_index_interval;
  merger->skip_buffer = ram_create_buffer();
  merger->skip_interval = -1;

  return merger;
}
|
1943
|
+
|
1944
|
+
/**
 * Close whatever the merge has open: the freq and prox output streams, the
 * term buffer together with the term infos writer, and the merge queue.
 * Each pointer is reset to NULL, so a repeated call skips all four checks.
 **/
void sm_close(SegmentMerger *sm)
{
  if (sm->freq_out != NULL) {
    os_close(sm->freq_out);
  }
  if (sm->prox_out != NULL) {
    os_close(sm->prox_out);
  }
  if (sm->tiw != NULL) {
    // the term buffer is only released when the writer is open
    int t;
    for (t = 0; t < sm->terms_buf_size; t++) {
      free(sm->terms_buf[t].text);
    }
    free(sm->terms_buf);
    tiw_close(sm->tiw);
  }
  if (sm->queue != NULL) {
    pq_destroy(sm->queue);
  }

  sm->freq_out = NULL;
  sm->prox_out = NULL;
  sm->tiw = NULL;
  sm->queue = NULL;
}
|
1961
|
+
|
1962
|
+
void sm_destroy(void *p)
|
1963
|
+
{
|
1964
|
+
SegmentMerger *sm = (SegmentMerger *)p;
|
1965
|
+
if (sm->fis != NULL) fis_destroy(sm->fis);
|
1966
|
+
ary_destroy(sm->readers);
|
1967
|
+
sm_close(sm);
|
1968
|
+
free(sm->name);
|
1969
|
+
ti_destroy(sm->ti);
|
1970
|
+
ram_destroy_buffer(sm->skip_buffer);
|
1971
|
+
free(sm);
|
1972
|
+
}
|
1973
|
+
|
1974
|
+
/**
 * Append reader +ir+ to the merger's reader array.
 **/
void sm_add(SegmentMerger *sm, IndexReader *ir)
{
  Array *readers = sm->readers;

  ary_append(readers, ir);
}
|
1978
|
+
|
1979
|
+
static inline void sm_add_indexed(IndexReader *ir,
|
1980
|
+
FieldInfos *fis,
|
1981
|
+
HashSet *fields,
|
1982
|
+
bool store_tv,
|
1983
|
+
bool store_pos,
|
1984
|
+
bool store_offset)
|
1985
|
+
{
|
1986
|
+
int i;
|
1987
|
+
char *field;
|
1988
|
+
for (i = 0; i < fields->size; i++) {
|
1989
|
+
field = (char *)fields->elems[i];
|
1990
|
+
fis_add(fis, field, true, store_tv, store_pos, store_offset,
|
1991
|
+
!ir->has_norms(ir, field));
|
1992
|
+
}
|
1993
|
+
hs_destroy(fields);
|
1994
|
+
}
|
1995
|
+
|
1996
|
+
/**
 * Merge field infos and stored fields of all readers into the new segment.
 *
 * First collects the field names of every reader into a fresh FieldInfos
 * (stored in sm->fis), going from the richest term-vector setting down to
 * unindexed, and writes it out with the ".fnm" extension. Then copies each
 * undeleted document through a FieldsWriter.
 *
 * Returns the number of documents written.
 **/
int sm_merge_fields(SegmentMerger *sm)
{
  int r, doc_num, max_doc;
  int doc_count = 0;
  IndexReader *ir;
  Document *doc;
  FieldsWriter *fw;
  FieldInfos *fis = fis_create();

  sm->fis = fis;
  for (r = 0; r < sm->readers->size; r++) {
    ir = sm->readers->elems[r];

    sm_add_indexed(ir, fis,
                   ir->get_field_names(ir, IR_TERM_VECTOR_WITH_POSITION_OFFSET),
                   true, true, true);
    sm_add_indexed(ir, fis,
                   ir->get_field_names(ir, IR_TERM_VECTOR_WITH_POSITION),
                   true, true, false);
    sm_add_indexed(ir, fis,
                   ir->get_field_names(ir, IR_TERM_VECTOR_WITH_OFFSET),
                   true, false, true);
    sm_add_indexed(ir, fis,
                   ir->get_field_names(ir, IR_TERM_VECTOR),
                   true, false, false);
    sm_add_indexed(ir, fis,
                   ir->get_field_names(ir, IR_INDEXED),
                   false, false, false);
    fis_add_fields(fis,
                   ir->get_field_names(ir, IR_UNINDEXED),
                   false, false, false, false, false);
  }
  fis_write(fis, sm->store, sm->name, ".fnm");

  // copy the stored field values
  fw = fw_open(sm->store, sm->name, fis);
  for (r = 0; r < sm->readers->size; r++) {
    ir = sm->readers->elems[r];
    max_doc = ir->max_doc(ir);
    for (doc_num = 0; doc_num < max_doc; doc_num++) {
      if (ir->is_deleted(ir, doc_num)) {
        continue; // deleted documents are left out of the merge
      }
      doc = ir->get_doc(ir, doc_num);
      fw_add_doc(fw, doc);
      doc_destroy(doc);
      doc_count++;
    }
  }
  fw_close(fw);

  return doc_count;
}
|
2041
|
+
|
2042
|
+
/**
 * Start a fresh run of skip data: reset the skip buffer, zero the last
 * skipped document and record the current positions of the freq and prox
 * output streams as the last skip pointers.
 **/
void sm_reset_skip(SegmentMerger *sm)
{
  ramo_reset(sm->skip_buffer);

  sm->last_skip_doc = 0;
  sm->last_skip_freq_pointer = os_pos(sm->freq_out);
  sm->last_skip_prox_pointer = os_pos(sm->prox_out);
}
|
2049
|
+
|
2050
|
+
/**
 * Append one skip entry to the skip buffer. Three vints are written: the
 * document number and the current freq/prox stream positions, each as a
 * delta from the previous skip entry. The "last" values are then advanced.
 **/
inline void sm_buffer_skip(SegmentMerger *sm, int doc)
{
  int freq_pos = os_pos(sm->freq_out);
  int prox_pos = os_pos(sm->prox_out);
  int doc_delta = doc - sm->last_skip_doc;
  int freq_delta = freq_pos - sm->last_skip_freq_pointer;
  int prox_delta = prox_pos - sm->last_skip_prox_pointer;

  os_write_vint(sm->skip_buffer, doc_delta);
  os_write_vint(sm->skip_buffer, freq_delta);
  os_write_vint(sm->skip_buffer, prox_delta);

  sm->last_skip_doc = doc;
  sm->last_skip_freq_pointer = freq_pos;
  sm->last_skip_prox_pointer = prox_pos;
}
|
2063
|
+
|
2064
|
+
/**
 * Append the postings of one term from +cnt+ segments to the freq and prox
 * output streams.
 *
 * For every posting the document number is remapped through the segment's
 * document map (when it has one) and shifted by the segment base. The
 * document delta is written shifted left by one bit, with the low bit set
 * when the frequency is 1 (in which case no separate frequency is written).
 * Position deltas go to the prox stream. A skip entry is buffered each time
 * the running document count reaches a multiple of sm->skip_interval.
 *
 * Returns the number of documents containing the term.
 **/
int sm_append_postings(SegmentMerger *sm, SegmentMergeInfo **smis, int cnt)
{
  int i, j;
  int last_doc = 0, base, doc, doc_code, freq, last_position, position;
  int *doc_map = NULL;
  int df = 0; // number of docs w/ term
  TermDocEnum *postings;
  SegmentMergeInfo *smi;

  sm_reset_skip(sm);
  for (i = 0; i < cnt; i++) {
    smi = smis[i];
    postings = smi->postings;
    base = smi->base;
    doc_map = smi_load_doc_map(smi);

    stde_seek_ti(postings, smi->te->ti_curr);
    while (postings->next(postings)) {
      doc = postings->doc_num(postings);
      if (doc_map != NULL) {
        doc = doc_map[doc]; // work around deletions
      }
      doc += base; // convert to merged space

      if (doc < last_doc) {
        // doc and last_doc are ints, so they must be printed with %d
        eprintf(STATE_ERROR,
                "docs out of order curent doc = %d and previous doc = %d",
                doc, last_doc);
      }

      df++;

      if ((df % sm->skip_interval) == 0) {
        sm_buffer_skip(sm, last_doc);
      }

      doc_code = (doc - last_doc) << 1; // use low bit to flag freq=1
      last_doc = doc;

      freq = postings->freq(postings);
      if (freq == 1) {
        os_write_vint(sm->freq_out, doc_code | 1); // write doc & freq=1
      } else {
        os_write_vint(sm->freq_out, doc_code); // write doc
        os_write_vint(sm->freq_out, freq); // write frequency in doc
      }

      last_position = 0; // write position deltas
      for (j = 0; j < freq; j++) {
        position = postings->next_position(postings);
        os_write_vint(sm->prox_out, position - last_position);
        last_position = position;
      }
    }
  }
  return df;
}
|
2118
|
+
|
2119
|
+
/**
 * Write the buffered skip data to the freq output stream and return the
 * stream position at which it was written.
 **/
int sm_write_skip(SegmentMerger *sm)
{
  int start = os_pos(sm->freq_out);

  ramo_write_to(sm->skip_buffer, sm->freq_out);
  return start;
}
|
2125
|
+
|
2126
|
+
/**
 * Copy term buffer +tb+ into the next slot of the merger's term ring and
 * return a pointer to that slot. The slot index is terms_buf_pointer modulo
 * terms_buf_size; the pointer is incremented on every call. The field
 * pointer is shared with +tb+, the text is copied.
 *
 * NOTE(review): the text copy is an unbounded strcpy; it presumably relies
 * on tb->text fitting the slot's text buffer -- confirm.
 **/
Term *sm_tb_to_term(SegmentMerger *sm, TermBuffer *tb)
{
  Term *slot = &(sm->terms_buf[sm->terms_buf_pointer % sm->terms_buf_size]);

  sm->terms_buf_pointer++;
  slot->field = tb->field;
  strcpy(slot->text, tb->text);

  return slot;
}
|
2134
|
+
|
2135
|
+
/**
 * Merge one term found in +cnt+ segments: append its postings, write the
 * skip data, and, when at least one document contains the term, add a
 * TermInfo entry for it. The entry records the document frequency, the
 * freq/prox stream positions from before the postings were written, and
 * the skip position as an offset from that freq position.
 **/
void sm_merge_term_info(SegmentMerger *sm, SegmentMergeInfo **smis, int cnt)
{
  int freq_start = os_pos(sm->freq_out);
  int prox_start = os_pos(sm->prox_out);
  int doc_freq = sm_append_postings(sm, smis, cnt);
  int skip_start = sm_write_skip(sm);

  if (doc_freq <= 0) {
    return;
  }

  ti_set(sm->ti, doc_freq, freq_start, prox_start, (skip_start - freq_start));
  tiw_add(sm->tiw, sm_tb_to_term(sm, smis[0]->tb), sm->ti);
}
|
2150
|
+
|
2151
|
+
/**
 * Merge the term dictionaries of all readers.
 *
 * A SegmentMergeInfo is created per reader, with a base equal to the sum of
 * num_docs of the readers before it, and pushed on the queue if it has a
 * first term. The loop then repeatedly pops every queue entry that shares
 * the smallest term, merges them into one TermInfo, advances each and
 * pushes back the ones that still have terms; exhausted ones are destroyed.
 **/
void sm_merge_term_infos(SegmentMerger *sm)
{
  int r, match_size;
  int base = 0;
  IndexReader *ir;
  SegmentMergeInfo *smi, *top;
  SegmentMergeInfo **match;
  TermBuffer *tb;

  // seed the queue with one entry per reader
  for (r = 0; r < sm->readers->size; r++) {
    ir = sm->readers->elems[r];
    smi = smi_create(base, ir->terms(ir), ir);
    base += ir->num_docs(ir);
    if (smi_next(smi) == NULL) {
      smi_destroy(smi);
    } else {
      pq_push(sm->queue, smi);
    }
  }

  match = ALLOC_N(SegmentMergeInfo *, sm->readers->size);

  while (sm->queue->count > 0) {
    // gather every segment positioned on the smallest term
    match[0] = pq_pop(sm->queue);
    match_size = 1;
    tb = match[0]->tb;
    for (top = pq_top(sm->queue);
         (top != NULL) && (tb_cmp(tb, top->tb) == 0);
         top = pq_top(sm->queue)) {
      match[match_size] = pq_pop(sm->queue);
      match_size++;
    }

    sm_merge_term_info(sm, match, match_size);

    // step the gathered segments forward, last one first
    while (match_size > 0) {
      match_size--;
      smi = match[match_size];
      if (smi_next(smi) == NULL) {
        smi_destroy(smi);
      } else {
        pq_push(sm->queue, smi);
      }
    }
  }
  free(match);
}
|
2203
|
+
|
2204
|
+
/**
 * Merge the terms of all readers into the new segment.
 *
 * Opens the ".frq" and ".prx" output streams and the term infos writer,
 * allocates the ring of Term slots used by sm_tb_to_term, creates the merge
 * queue, runs sm_merge_term_infos and finally closes everything again with
 * sm_close.
 **/
void sm_merge_terms(SegmentMerger *sm)
{
  int i;
  char fname[SEGMENT_NAME_MAX_LENGTH];

  // snprintf keeps an over-long segment name from overrunning fname
  snprintf(fname, sizeof(fname), "%s.frq", sm->name);
  sm->freq_out = sm->store->create_output(sm->store, fname);
  snprintf(fname, sizeof(fname), "%s.prx", sm->name);
  sm->prox_out = sm->store->create_output(sm->store, fname);
  sm->tiw = tiw_open(sm->store, sm->name, sm->fis, sm->term_index_interval);

  // terms_buf holds a buffer of terms since the TermInfosWriter needs to
  // keep the last index_interval terms so that it can compare the last term
  // put in the index with the next one. So the size of the buffer must be
  // index_interval + 2.
  sm->terms_buf_pointer = 0;
  sm->terms_buf_size = sm->tiw->index_interval + 2;
  sm->terms_buf = ALLOC_N(Term, sm->terms_buf_size);
  for (i = 0; i < sm->terms_buf_size; i++) {
    sm->terms_buf[i].field = NULL;
    sm->terms_buf[i].text = ALLOC_N(char, MAX_WORD_SIZE);
  }
  sm->skip_interval = sm->tiw->skip_interval;
  sm->queue = pq_create(sm->readers->size, &smi_lt);

  sm_merge_term_infos(sm);

  sm_close(sm);
}
|
2231
|
+
|
2232
|
+
/**
 * Write the merged norms files.
 *
 * For every field that is indexed and does not omit norms, a file named
 * "<segment>.f<field number>" is created. Each reader's norms for the field
 * are read into a zero-filled buffer and the bytes of its undeleted
 * documents are appended to the file in reader order.
 **/
void sm_merge_norms(SegmentMerger *sm)
{
  int i, j, k, max_doc;
  uchar *norm_buf;
  FieldInfo *fi;
  OutStream *os;
  char fname[SEGMENT_NAME_MAX_LENGTH];
  IndexReader *ir;

  for (i = 0; i < sm->fis->fcnt; i++) {
    fi = sm->fis->by_number[i];
    if (fi->is_indexed && !fi->omit_norms) {
      // snprintf keeps an over-long segment name from overrunning fname
      snprintf(fname, sizeof(fname), "%s.f%d", sm->name, i);
      os = sm->store->create_output(sm->store, fname);
      for (j = 0; j < sm->readers->size; j++) {
        ir = sm->readers->elems[j];
        max_doc = ir->max_doc(ir);
        norm_buf = ALLOC_N(uchar, max_doc);
        // start from zeroes before the reader fills the buffer in
        memset(norm_buf, 0, sizeof(uchar) * max_doc);
        ir->get_norms_into(ir, fi->name, norm_buf, 0);
        for (k = 0; k < max_doc; k++) {
          if (!ir->is_deleted(ir, k)) {
            os_write_byte(os, norm_buf[k]);
          }
        }
        free(norm_buf);
      }
      os_close(os);
    }
  }
}
|
2262
|
+
|
2263
|
+
/**
 * Copy the term vectors of every undeleted document, reader by reader,
 * into a TermVectorsWriter opened for the new segment.
 **/
void sm_merge_vectors(SegmentMerger *sm)
{
  int r, doc_num, max_doc;
  IndexReader *ir;
  Array *vectors;
  TermVectorsWriter *tvw = tvw_open(sm->store, sm->name, sm->fis);

  for (r = 0; r < sm->readers->size; r++) {
    ir = sm->readers->elems[r];
    max_doc = ir->max_doc(ir);
    for (doc_num = 0; doc_num < max_doc; doc_num++) {
      if (ir->is_deleted(ir, doc_num)) {
        continue; // deleted documents are left out of the merge
      }
      vectors = ir->get_term_vectors(ir, doc_num);
      tvw_add_all_doc_vectors(tvw, vectors);
      ary_destroy(vectors);
    }
  }
  tvw_close(tvw);
}
|
2283
|
+
|
2284
|
+
/**
 * Run the whole merge: fields, terms, norms and, when the merged field
 * infos have vectors, term vectors. Returns the document count reported by
 * sm_merge_fields.
 **/
int sm_merge(SegmentMerger *sm)
{
  int merged_docs = sm_merge_fields(sm);

  sm_merge_terms(sm);
  sm_merge_norms(sm);
  if (fis_has_vectors(sm->fis)) {
    sm_merge_vectors(sm);
  }
  return merged_docs;
}
|
2293
|
+
|
2294
|
+
/**
 * Bundle the files of the merged segment into the compound file
 * +file_name+.
 *
 * The file list is made of one name per COMPOUND_EXTENSIONS entry, one
 * norms file per field that is indexed and keeps norms, and -- when the
 * field infos have vectors -- one name per VECTOR_EXTENSIONS entry. All of
 * them are added to a CompoundWriter, which is then closed.
 *
 * Returns the array of file names; the array was created with efree as its
 * element destructor.
 **/
Array *sm_create_compound_file(SegmentMerger *sm, char *file_name)
{
  Array *files = ary_create(0, &efree);
  CompoundWriter *cw = open_cw(sm->store, file_name);
  FieldInfo *fi;
  char fname[SEGMENT_NAME_MAX_LENGTH];
  int i;

  // snprintf keeps an over-long segment name from overrunning fname
  for (i = 0; i < (int)NELEMS(COMPOUND_EXTENSIONS); i++) {
    snprintf(fname, sizeof(fname), "%s.%s", sm->name, COMPOUND_EXTENSIONS[i]);
    ary_append(files, estrdup(fname));
  }

  // Field norm files
  for (i = 0; i < sm->fis->fcnt; i++) {
    fi = sm->fis->by_number[i];
    if (fi->is_indexed && !fi->omit_norms) {
      snprintf(fname, sizeof(fname), "%s.f%d", sm->name, i);
      ary_append(files, estrdup(fname));
    }
  }

  // Vector files
  if (fis_has_vectors(sm->fis)) {
    for (i = 0; i < (int)NELEMS(VECTOR_EXTENSIONS); i++) {
      snprintf(fname, sizeof(fname), "%s.%s", sm->name, VECTOR_EXTENSIONS[i]);
      ary_append(files, estrdup(fname));
    }
  }

  // Now merge all added files
  for (i = 0; i < files->size; i++) {
    cw_add_file(cw, (char *)files->elems[i]);
  }

  // Perform the merge
  cw_close(cw);

  return files;
}
|
2334
|
+
|
2335
|
+
/****************************************************************************
|
2336
|
+
*
|
2337
|
+
* IndexReader
|
2338
|
+
*
|
2339
|
+
****************************************************************************/
|
2340
|
+
|
2341
|
+
/**
 * No-op stand-in for acquire_write_lock; ir_create installs it on readers
 * that are not owners.
 **/
void ir_acquire_not_necessary(IndexReader *ir)
{
  (void)ir; // intentionally unused
}
|
2342
|
+
/**
 * Make sure this reader holds the index write lock.
 *
 * Raises STATE_ERROR when the reader is already stale, when the lock cannot
 * be obtained, or when the index version on disk is newer than the one the
 * reader was opened with. In the last case the reader is also marked stale.
 * On both failure paths inside the acquisition the lock handle is closed
 * and ir->write_lock is reset to NULL, so a later call starts from scratch
 * instead of treating an unheld lock as held.
 **/
void ir_acquire_write_lock(IndexReader *ir)
{
  if (ir->is_stale)
    eprintf(STATE_ERROR, "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations");

  if (ir->write_lock == NULL) {
    ir->write_lock = ir->store->open_lock(ir->store, WRITE_LOCK_NAME);
    if (!ir->write_lock->obtain(ir->write_lock)) { // obtain write lock
      // drop the handle: it was never obtained, and leaving it set would
      // make the next call skip the acquisition entirely
      ir->store->close_lock(ir->write_lock);
      ir->write_lock = NULL;
      eprintf(STATE_ERROR, "Index locked for write: %s", WRITE_LOCK_NAME);
      return; // only reached if eprintf returns
    }

    // we have to check whether index has changed since this reader was opened.
    // if so, this reader is no longer valid for deletion
    if (sis_read_current_version(ir->store) > ir->sis->version) {
      ir->is_stale = true;
      ir->write_lock->release(ir->write_lock);
      ir->store->close_lock(ir->write_lock);
      ir->write_lock = NULL;
      eprintf(STATE_ERROR, "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations");
    }
  }
}
|
2363
|
+
|
2364
|
+
/**
 * Allocate an IndexReader and fill in its common fields. The reader-specific
 * method pointers are not set here.
 *
 * Owners get ir_acquire_write_lock as their acquire_write_lock method,
 * non-owners get the no-op ir_acquire_not_necessary. The reader starts
 * with no changes, not stale, and without a write lock or caches.
 **/
IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner, int close_store)
{
  IndexReader *reader = ALLOC(IndexReader);

  mutex_init(&reader->mutex, NULL);

  reader->is_owner = is_owner;
  reader->acquire_write_lock = is_owner ? &ir_acquire_write_lock
                                        : &ir_acquire_not_necessary;

  reader->store = store;
  reader->close_store = close_store;
  reader->sis = sis;

  reader->has_changes = false;
  reader->is_stale = false;
  reader->write_lock = NULL;
  reader->cache = NULL;
  reader->sort_cache = NULL;

  return reader;
}
|
2386
|
+
|
2387
|
+
/**
 * Open a reader on the index in +store+.
 *
 * The segment infos are read while holding the store mutex. An index with
 * exactly one segment is opened directly as a segment reader; otherwise a
 * segment reader is opened for each segment and they are wrapped in a
 * multi-reader. +close_store+ is passed to the reader that is returned.
 **/
IndexReader *ir_open(Store *store, int close_store)
{
  IndexReader *ir;
  SegmentInfos *sis;

  mutex_lock(&store->mutex);
  sis = sis_create();
  sis_read(sis, store);
  if (sis->scnt != 1) {
    int seg;
    IndexReader **readers = ALLOC_N(IndexReader *, sis->scnt);

    for (seg = 0; seg < sis->scnt; seg++) {
      readers[seg] = sr_open(sis, seg, false, false);
    }
    ir = mr_open(store, sis, readers, sis->scnt, close_store);
  } else {
    ir = sr_open(sis, 0, true, close_store);
  }
  mutex_unlock(&store->mutex);

  return ir;
}
|
2408
|
+
|
2409
|
+
/**
 * Return whether +store+ contains a file called "segments".
 **/
bool ir_index_exists(Store *store)
{
  bool found = store->exists(store, "segments");

  return found;
}
|
2413
|
+
|
2414
|
+
/**
 * Set the norm of +field+ for document +doc_num+ to +val+.
 *
 * Holds the reader mutex while it calls the reader's acquire_write_lock
 * method, dispatches to do_set_norm and marks the reader as changed.
 **/
void ir_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
{
  mutex_lock(&ir->mutex);

  ir->acquire_write_lock(ir);
  ir->do_set_norm(ir, doc_num, field, val);
  ir->has_changes = true;

  mutex_unlock(&ir->mutex);
}
|
2422
|
+
|
2423
|
+
/**
 * Undelete all documents.
 *
 * Holds the reader mutex while it calls the reader's acquire_write_lock
 * method, dispatches to do_undelete_all and marks the reader as changed.
 **/
void ir_undelete_all(IndexReader *ir)
{
  mutex_lock(&ir->mutex);

  ir->acquire_write_lock(ir);
  ir->do_undelete_all(ir);
  ir->has_changes = true;

  mutex_unlock(&ir->mutex);
}
|
2431
|
+
|
2432
|
+
/**
 * Delete document +doc_num+.
 *
 * Holds the reader mutex while it calls the reader's acquire_write_lock
 * method, dispatches to do_delete_doc and marks the reader as changed.
 **/
void ir_delete_doc(IndexReader *ir, int doc_num)
{
  mutex_lock(&ir->mutex);

  ir->acquire_write_lock(ir);
  ir->do_delete_doc(ir, doc_num);
  ir->has_changes = true;

  mutex_unlock(&ir->mutex);
}
|
2440
|
+
|
2441
|
+
/**
 * Return the first document the term-doc enumerator yields for +term+, or
 * NULL when the enumerator is NULL or yields nothing. The enumerator is
 * closed before returning.
 **/
Document *ir_get_doc_with_term(IndexReader *ir, Term *term)
{
  Document *found = NULL;
  TermDocEnum *tde = ir_term_docs_for(ir, term);

  if (tde == NULL) {
    return NULL;
  }
  if (tde->next(tde)) {
    found = ir->get_doc(ir, tde->doc_num(tde));
  }
  tde->close(tde);

  return found;
}
|
2452
|
+
|
2453
|
+
/**
 * Open a term-doc enumerator on the reader and seek it to +term+.
 **/
TermDocEnum *ir_term_docs_for(IndexReader *ir, Term *term)
{
  TermDocEnum *docs = ir->term_docs(ir);

  docs->seek(docs, term);
  return docs;
}
|
2459
|
+
|
2460
|
+
/**
 * Open a term-position enumerator on the reader and seek it to +term+.
 **/
TermDocEnum *ir_term_positions_for(IndexReader *ir, Term *term)
{
  TermDocEnum *positions = ir->term_positions(ir);

  positions->seek(positions, term);
  return positions;
}
|
2466
|
+
|
2467
|
+
/**
 * Commit pending changes; does nothing when the reader has none.
 *
 * An owner takes the store mutex and the commit lock, runs do_commit,
 * writes the segment infos, releases both again and then gives up its
 * write lock if it holds one. A non-owner only runs do_commit. The
 * has_changes flag is cleared afterwards.
 *
 * If the commit lock cannot be obtained, the lock handle is closed and the
 * store mutex released before STATE_ERROR is raised, so the failure leaves
 * neither of them behind.
 *
 * This function does not lock ir->mutex itself.
 **/
void ir_commit_internal(IndexReader *ir)
{
  if (ir->has_changes) {
    if (ir->is_owner) {

      mutex_lock(&ir->store->mutex);
      Lock *commit_lock = ir->store->open_lock(ir->store, COMMIT_LOCK_NAME);
      if (!commit_lock->obtain(commit_lock)) { // obtain write lock
        // undo what was set up above before reporting the failure
        ir->store->close_lock(commit_lock);
        mutex_unlock(&ir->store->mutex);
        eprintf(STATE_ERROR, "Index locked for commit: %s", COMMIT_LOCK_NAME);
        return; // only reached if eprintf returns; changes stay pending
      }

      ir->do_commit(ir);
      sis_write(ir->sis, ir->store);

      commit_lock->release(commit_lock);
      ir->store->close_lock(commit_lock);
      mutex_unlock(&ir->store->mutex);

      if (ir->write_lock != NULL) {
        ir->write_lock->release(ir->write_lock); // release write lock
        ir->store->close_lock(ir->write_lock);
        ir->write_lock = NULL;
      }
    } else {
      ir->do_commit(ir);
    }
    ir->has_changes = false;
  }
}
|
2495
|
+
|
2496
|
+
/**
 * Commit pending changes while holding the reader mutex.
 **/
void ir_commit(IndexReader *ir)
{
  mutex_lock(&ir->mutex);

  ir_commit_internal(ir);

  mutex_unlock(&ir->mutex);
}
|
2502
|
+
|
2503
|
+
/**
 * Commit any pending changes and tear the reader down.
 *
 * After the commit, the reader-specific do_close runs, the store is closed
 * when close_store is set, the segment infos are destroyed when this reader
 * owns them, and the cache and sort cache are destroyed when present.
 * Finally the mutex is destroyed and the reader freed; +ir+ must not be
 * used afterwards.
 **/
void ir_close(IndexReader *ir)
{
  mutex_lock(&ir->mutex);
  ir_commit_internal(ir);
  ir->do_close(ir);
  if (ir->close_store) {
    ir->store->close(ir->store);
  }
  if (ir->is_owner) {
    sis_destroy(ir->sis);
  }
  if (ir->cache) {
    h_destroy(ir->cache);
  }
  if (ir->sort_cache) {
    h_destroy(ir->sort_cache);
  }

  // Release the mutex taken at the top before destroying it. Destroying a
  // mutex that is still locked is undefined under POSIX threads
  // (NOTE(review): assumes the mutex_* macros wrap pthread mutexes -- confirm).
  mutex_unlock(&ir->mutex);
  mutex_destroy(&ir->mutex);
  free(ir);
}
|
2524
|
+
|
2525
|
+
void ir_destroy(void *p)
|
2526
|
+
{
|
2527
|
+
IndexReader *ir = (IndexReader *)p;
|
2528
|
+
ir_close(ir);
|
2529
|
+
}
|
2530
|
+
|
2531
|
+
/**
|
2532
|
+
* Don't call this method if the cache already exists
|
2533
|
+
**/
|
2534
|
+
void ir_add_cache(IndexReader *ir)
|
2535
|
+
{
|
2536
|
+
ir->cache = co_hsh_create();
|
2537
|
+
}
|
2538
|
+
|
2539
|
+
/**
 * Return true when the version currently recorded in the store equals the
 * version of the segment infos this reader holds.
 **/
bool ir_is_latest(IndexReader *ir)
{
  if (sis_read_current_version(ir->store) == ir->sis->version) {
    return true;
  }
  return false;
}
|
2543
|
+
|