ferret 0.3.2 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +9 -0
- data/Rakefile +51 -25
- data/ext/analysis.c +553 -0
- data/ext/analysis.h +76 -0
- data/ext/array.c +83 -0
- data/ext/array.h +19 -0
- data/ext/bitvector.c +164 -0
- data/ext/bitvector.h +29 -0
- data/ext/compound_io.c +335 -0
- data/ext/document.c +336 -0
- data/ext/document.h +87 -0
- data/ext/ferret.c +88 -47
- data/ext/ferret.h +43 -109
- data/ext/field.c +395 -0
- data/ext/filter.c +103 -0
- data/ext/fs_store.c +352 -0
- data/ext/global.c +219 -0
- data/ext/global.h +73 -0
- data/ext/hash.c +446 -0
- data/ext/hash.h +80 -0
- data/ext/hashset.c +141 -0
- data/ext/hashset.h +37 -0
- data/ext/helper.c +11 -0
- data/ext/helper.h +5 -0
- data/ext/inc/lang.h +41 -0
- data/ext/ind.c +389 -0
- data/ext/index.h +884 -0
- data/ext/index_io.c +269 -415
- data/ext/index_rw.c +2543 -0
- data/ext/lang.c +31 -0
- data/ext/lang.h +41 -0
- data/ext/priorityqueue.c +228 -0
- data/ext/priorityqueue.h +44 -0
- data/ext/q_boolean.c +1331 -0
- data/ext/q_const_score.c +154 -0
- data/ext/q_fuzzy.c +287 -0
- data/ext/q_match_all.c +142 -0
- data/ext/q_multi_phrase.c +343 -0
- data/ext/q_parser.c +2180 -0
- data/ext/q_phrase.c +657 -0
- data/ext/q_prefix.c +75 -0
- data/ext/q_range.c +247 -0
- data/ext/q_span.c +1566 -0
- data/ext/q_term.c +308 -0
- data/ext/q_wildcard.c +146 -0
- data/ext/r_analysis.c +255 -0
- data/ext/r_doc.c +578 -0
- data/ext/r_index_io.c +996 -0
- data/ext/r_qparser.c +158 -0
- data/ext/r_search.c +2321 -0
- data/ext/r_store.c +263 -0
- data/ext/r_term.c +219 -0
- data/ext/ram_store.c +447 -0
- data/ext/search.c +524 -0
- data/ext/search.h +1065 -0
- data/ext/similarity.c +143 -39
- data/ext/sort.c +661 -0
- data/ext/store.c +35 -0
- data/ext/store.h +152 -0
- data/ext/term.c +704 -143
- data/ext/termdocs.c +599 -0
- data/ext/vector.c +594 -0
- data/lib/ferret.rb +9 -10
- data/lib/ferret/analysis/analyzers.rb +2 -2
- data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
- data/lib/ferret/analysis/token.rb +14 -14
- data/lib/ferret/analysis/token_filters.rb +3 -3
- data/lib/ferret/document/field.rb +16 -17
- data/lib/ferret/index/document_writer.rb +4 -4
- data/lib/ferret/index/index.rb +39 -23
- data/lib/ferret/index/index_writer.rb +2 -2
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
- data/lib/ferret/index/segment_term_vector.rb +4 -4
- data/lib/ferret/index/term.rb +5 -1
- data/lib/ferret/index/term_vector_offset_info.rb +6 -6
- data/lib/ferret/index/term_vectors_io.rb +5 -5
- data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
- data/lib/ferret/search.rb +1 -1
- data/lib/ferret/search/boolean_query.rb +2 -1
- data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
- data/lib/ferret/search/fuzzy_query.rb +2 -1
- data/lib/ferret/search/index_searcher.rb +3 -0
- data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
- data/lib/ferret/search/multi_phrase_query.rb +6 -5
- data/lib/ferret/search/phrase_query.rb +3 -6
- data/lib/ferret/search/prefix_query.rb +4 -4
- data/lib/ferret/search/sort.rb +3 -1
- data/lib/ferret/search/sort_field.rb +9 -9
- data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
- data/lib/ferret/search/spans/span_near_query.rb +1 -1
- data/lib/ferret/search/spans/span_weight.rb +1 -1
- data/lib/ferret/search/spans/spans_enum.rb +7 -7
- data/lib/ferret/store/fs_store.rb +10 -6
- data/lib/ferret/store/ram_store.rb +3 -3
- data/lib/rferret.rb +36 -0
- data/test/functional/thread_safety_index_test.rb +2 -2
- data/test/test_helper.rb +16 -2
- data/test/unit/analysis/c_token.rb +25 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
- data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
- data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
- data/test/unit/document/c_field.rb +98 -0
- data/test/unit/document/tc_field.rb +0 -66
- data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
- data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
- data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
- data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
- data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
- data/test/unit/index/tc_segment_term_vector.rb +2 -2
- data/test/unit/index/tc_term_vectors_io.rb +4 -4
- data/test/unit/query_parser/c_query_parser.rb +138 -0
- data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
- data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
- data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
- data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
- data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
- data/test/unit/search/c_sort_field.rb +27 -0
- data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +7 -20
- data/test/unit/store/c_fs_store.rb +76 -0
- data/test/unit/store/c_ram_store.rb +35 -0
- data/test/unit/store/m_store.rb +34 -0
- data/test/unit/store/m_store_lock.rb +68 -0
- data/test/unit/store/tc_fs_store.rb +0 -53
- data/test/unit/store/tc_ram_store.rb +0 -20
- data/test/unit/store/tm_store.rb +0 -30
- data/test/unit/store/tm_store_lock.rb +0 -66
- metadata +84 -31
- data/ext/Makefile +0 -140
- data/ext/ferret_ext.so +0 -0
- data/ext/priority_queue.c +0 -232
- data/ext/ram_directory.c +0 -321
- data/ext/segment_merge_queue.c +0 -37
- data/ext/segment_term_enum.c +0 -326
- data/ext/string_helper.c +0 -42
- data/ext/tags +0 -344
- data/ext/term_buffer.c +0 -230
- data/ext/term_infos_reader.c +0 -54
- data/ext/terminfo.c +0 -160
- data/ext/token.c +0 -93
- data/ext/util.c +0 -12
data/ext/termdocs.c
ADDED
@@ -0,0 +1,599 @@
|
|
1
|
+
#include <index.h>
|
2
|
+
#include <string.h>
|
3
|
+
|
4
|
+
/****************************************************************************
|
5
|
+
*
|
6
|
+
* SegmentTermDocEnum
|
7
|
+
*
|
8
|
+
****************************************************************************/
|
9
|
+
|
10
|
+
|
11
|
+
/*
 * Releases a segment term-doc enumerator.
 *
 * Calls is_close() on the frequency stream and, when one has been created,
 * on the skip stream. Then frees the SegmentTermDocEnum payload followed by
 * the TermDocEnum wrapper, so tde must not be used after this call.
 */
void stde_close(TermDocEnum *tde)
{
  SegmentTermDocEnum *seg = (SegmentTermDocEnum *)tde->data;

  is_close(seg->freq_in);
  if (seg->skip_in) {
    is_close(seg->skip_in);
  }

  free(seg);
  free(tde);
}
|
22
|
+
|
23
|
+
/*
 * Positions the enumerator on the postings described by ti.
 *
 * With ti == NULL only doc_freq is set to 0; no other field is touched.
 * Otherwise the counters and skip state are reset, the freq/prox/skip
 * pointers are copied from ti, and freq_in is moved to ti->freq_pointer.
 * num_skips is doc_freq divided by skip_interval (integer division).
 */
void stde_seek_ti(TermDocEnum *tde, TermInfo *ti)
{
  SegmentTermDocEnum *seg = (SegmentTermDocEnum *)tde->data;

  if (!ti) {
    seg->doc_freq = 0;
    return;
  }

  seg->count = 0;
  seg->doc_freq = ti->doc_freq;
  seg->doc_num = 0;
  seg->skip_doc = 0;
  seg->skip_count = 0;
  seg->num_skips = seg->doc_freq / seg->skip_interval;

  seg->freq_pointer = ti->freq_pointer;
  seg->prox_pointer = ti->prox_pointer;
  seg->skip_pointer = ti->freq_pointer + ti->skip_offset;

  is_seek(seg->freq_in, ti->freq_pointer);
  seg->have_skipped = false; // skip stream is re-seeked lazily in skip_to
}
|
42
|
+
|
43
|
+
/*
 * Seeks the enumerator to term.
 *
 * Looks the term up with tir_get_ti() on the parent reader's tir, hands the
 * result (possibly NULL) to stde_seek_ti(), then passes it to ti_destroy().
 * NOTE(review): ti_destroy() is called even when ti is NULL - confirm it
 * tolerates a NULL argument.
 */
void stde_seek(TermDocEnum *tde, Term *term)
{
  SegmentTermDocEnum *seg = (SegmentTermDocEnum *)tde->data;
  TermInfo *info = tir_get_ti(seg->parent->tir, term);

  stde_seek_ti(tde, info);
  ti_destroy(info);
}
|
50
|
+
|
51
|
+
/*
 * No-op skip_prox hook installed by stde_create(); stpe_create() replaces it
 * with stpe_skip_prox.
 */
void stde_skip_prox(SegmentTermDocEnum *stde)
{
  (void)stde; // intentionally unused
}
|
52
|
+
/*
 * No-op seek_prox hook installed by stde_create(); stpe_create() replaces it
 * with stpe_seek_prox.
 */
void stde_seek_prox(SegmentTermDocEnum *stde, int prox_pointer)
{
  (void)stde;         // intentionally unused
  (void)prox_pointer; // intentionally unused
}
|
53
|
+
|
54
|
+
/*
 * Advances to the next document that is not marked in deleted_docs.
 *
 * Each entry starts with a vint: its upper bits are added to doc_num and its
 * low bit, when set, means the frequency is 1; otherwise the frequency is
 * read as a second vint. For entries flagged in deleted_docs the skip_prox
 * hook is invoked and scanning continues.
 *
 * Returns true when positioned on a document, false once count has reached
 * doc_freq.
 */
bool stde_next(TermDocEnum *tde)
{
  SegmentTermDocEnum *seg = (SegmentTermDocEnum *)tde->data;

  while (seg->count < seg->doc_freq) {
    int code = is_read_vint(seg->freq_in);

    seg->doc_num += code >> 1; // shift off the low (flag) bit
    if (code & 1) {
      seg->freq = 1;
    } else {
      seg->freq = is_read_vint(seg->freq_in);
    }
    seg->count++;

    if (seg->deleted_docs == NULL ||
        bv_get(seg->deleted_docs, seg->doc_num) == 0) {
      return true; // found an undeleted doc
    }

    seg->skip_prox(seg);
  }

  return false;
}
|
81
|
+
|
82
|
+
/* Returns the doc_num field of the enumerator's SegmentTermDocEnum payload. */
int stde_doc_num(TermDocEnum *tde)
{
  SegmentTermDocEnum *seg = (SegmentTermDocEnum *)tde->data;

  return seg->doc_num;
}
|
84
|
+
|
85
|
+
/* Returns the freq field of the enumerator's SegmentTermDocEnum payload. */
int stde_freq(TermDocEnum *tde)
{
  SegmentTermDocEnum *seg = (SegmentTermDocEnum *)tde->data;

  return seg->freq;
}
|
87
|
+
|
88
|
+
/*
 * Advances to the first document whose number is >= target_doc_num.
 *
 * When doc_freq >= skip_interval the skip data is consulted first: skip_in
 * is cloned from freq_in on first use and seeked to skip_pointer once per
 * seek. Skip entries (doc delta, freq-pointer delta, prox-pointer delta) are
 * read while target_doc_num is beyond skip_doc; the state before the last
 * entry read is remembered. If that remembered freq pointer lies ahead of
 * freq_in's current position, freq_in and the prox stream (via the seek_prox
 * hook) jump there and doc_num/count are adjusted.
 *
 * Afterwards the enumerator is stepped with tde->next() - the installed
 * hook, so positional enumerators use their own next - until doc_num
 * reaches the target. At least one next() call is always made.
 *
 * Returns false if the postings are exhausted first, true otherwise.
 */
bool stde_skip_to(TermDocEnum *tde, int target_doc_num)
{
  SegmentTermDocEnum *seg = (SegmentTermDocEnum *)tde->data;

  if (seg->doc_freq >= seg->skip_interval) { // optimized case
    int resume_doc;
    int resume_freq_ptr;
    int resume_prox_ptr;
    int skipped;

    if (seg->skip_in == NULL) {
      seg->skip_in = is_clone(seg->freq_in); // lazily clone
    }

    if (!seg->have_skipped) { // lazily seek skip stream
      is_seek(seg->skip_in, seg->skip_pointer);
      seg->have_skipped = true;
    }

    // scan skip data
    resume_doc = seg->skip_doc;
    resume_freq_ptr = is_pos(seg->freq_in);
    resume_prox_ptr = -1;
    skipped = -1 - (seg->count % seg->skip_interval);

    while (seg->skip_doc < target_doc_num) {
      resume_doc = seg->skip_doc;
      resume_freq_ptr = seg->freq_pointer;
      resume_prox_ptr = seg->prox_pointer;

      if (seg->skip_doc != 0 && seg->skip_doc >= seg->doc_num) {
        skipped += seg->skip_interval;
      }

      if (seg->skip_count >= seg->num_skips) {
        break;
      }

      seg->skip_doc += is_read_vint(seg->skip_in);
      seg->freq_pointer += is_read_vint(seg->skip_in);
      seg->prox_pointer += is_read_vint(seg->skip_in);

      seg->skip_count++;
    }

    // if we found something to skip, then skip it
    if (resume_freq_ptr > is_pos(seg->freq_in)) {
      is_seek(seg->freq_in, resume_freq_ptr);
      seg->seek_prox(seg, resume_prox_ptr);

      seg->doc_num = resume_doc;
      seg->count += skipped;
    }
  }

  // done skipping, now just scan
  for (;;) {
    if (!tde->next(tde)) {
      return false;
    }
    if (((SegmentTermDocEnum *)tde->data)->doc_num >= target_doc_num) {
      return true;
    }
  }
}
|
143
|
+
|
144
|
+
/*
 * Bulk-reads up to req_num (doc, freq) pairs into docs[] and freqs[].
 *
 * Decodes entries the same way stde_next() does (inlined for speed) but does
 * not invoke the skip_prox hook. Entries flagged in deleted_docs are
 * consumed but not stored.
 *
 * Returns the number of pairs written, which is less than req_num when
 * count reaches doc_freq first.
 */
int stde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
{
  SegmentTermDocEnum *seg = (SegmentTermDocEnum *)tde->data;
  int filled = 0;
  int code;

  while (filled < req_num && seg->count < seg->doc_freq) {
    code = is_read_vint(seg->freq_in);
    seg->doc_num += code >> 1; // shift off the low (flag) bit

    if ((code & 1) != 0) {
      seg->freq = 1; // low bit set: freq is one
    } else {
      seg->freq = is_read_vint(seg->freq_in);
    }

    seg->count++;

    if (seg->deleted_docs != NULL &&
        bv_get(seg->deleted_docs, seg->doc_num) != 0) {
      continue; // deleted document: do not report it
    }

    docs[filled] = seg->doc_num;
    freqs[filled] = seg->freq;
    filled++;
  }

  return filled;
}
|
168
|
+
|
169
|
+
/*
 * Builds a TermDocEnum over the SegmentReader stored in ir->data.
 *
 * The payload is zeroed, then given its own clone of the reader's freq_in
 * stream and the reader's deleted_docs and tir->skip_interval. skip_in
 * starts as NULL (stde_skip_to clones it on demand). The prox hooks are the
 * no-op versions and next_position is NULL.
 *
 * The returned enumerator is released through its close hook (stde_close).
 */
TermDocEnum *stde_create(IndexReader *ir)
{
  SegmentReader *sr = (SegmentReader *)ir->data;
  TermDocEnum *tde = ALLOC(TermDocEnum);
  SegmentTermDocEnum *seg;

  // method table
  tde->close = &stde_close;
  tde->seek = &stde_seek;
  tde->next = &stde_next;
  tde->skip_to = &stde_skip_to;
  tde->read = &stde_read;
  tde->doc_num = &stde_doc_num;
  tde->freq = &stde_freq;
  tde->next_position = NULL;

  // per-segment state
  seg = ALLOC(SegmentTermDocEnum);
  ZEROSET(seg, SegmentTermDocEnum, 1); // set all values to 0
  tde->data = seg;

  seg->parent = sr;
  seg->freq_in = is_clone(sr->freq_in);
  seg->deleted_docs = sr->deleted_docs;
  seg->skip_interval = sr->tir->skip_interval;
  seg->skip_in = NULL;
  seg->have_skipped = false;
  seg->skip_prox = &stde_skip_prox;
  seg->seek_prox = &stde_seek_prox;

  return tde;
}
|
195
|
+
|
196
|
+
/****************************************************************************
|
197
|
+
*
|
198
|
+
* SegmentTermPosEnum
|
199
|
+
*
|
200
|
+
****************************************************************************/
|
201
|
+
|
202
|
+
/*
 * Seek for the positional enumerator.
 *
 * Does what stde_seek() does and additionally, when the term was found,
 * moves prox_in to ti->prox_pointer. prox_cnt is reset to 0 in either case.
 * NOTE(review): ti_destroy() is called even when ti is NULL - confirm it
 * tolerates a NULL argument.
 */
void stpe_seek(TermDocEnum *tde, Term *term)
{
  SegmentTermDocEnum *seg = (SegmentTermDocEnum *)tde->data;
  TermInfo *info = tir_get_ti(seg->parent->tir, term);

  stde_seek_ti(tde, info);
  if (info) {
    is_seek(seg->prox_in, info->prox_pointer);
  }
  seg->prox_cnt = 0;

  ti_destroy(info);
}
|
213
|
+
|
214
|
+
/*
 * Releases a positional enumerator: calls is_close() on prox_in, clears the
 * pointer, then delegates the rest of the teardown to stde_close(), which
 * frees tde.
 */
void stpe_close(TermDocEnum *tde)
{
  SegmentTermDocEnum *seg = (SegmentTermDocEnum *)tde->data;

  is_close(seg->prox_in);
  seg->prox_in = NULL;

  stde_close(tde); // "super" close
}
|
221
|
+
|
222
|
+
/*
 * skip_prox hook for positional enumerators: reads and discards stde->freq
 * vints from prox_in.
 */
void stpe_skip_prox(SegmentTermDocEnum *stde)
{
  int remaining = stde->freq;

  while (remaining-- > 0) {
    is_read_vint(stde->prox_in);
  }
}
|
228
|
+
|
229
|
+
/*
 * seek_prox hook for positional enumerators: moves prox_in to prox_pointer
 * and resets prox_cnt to 0.
 */
void stpe_seek_prox(SegmentTermDocEnum *stde, int prox_pointer)
{
  stde->prox_cnt = 0;
  is_seek(stde->prox_in, prox_pointer);
}
|
234
|
+
|
235
|
+
/*
 * next() for positional enumerators.
 *
 * First reads and discards prox_cnt vints from prox_in (positions of the
 * current document that were never fetched), then advances with stde_next().
 * On success prox_cnt is set to the new document's freq and position is
 * reset to 0.
 *
 * Returns what stde_next() returned.
 */
bool stpe_next(TermDocEnum *tde)
{
  SegmentTermDocEnum *seg = (SegmentTermDocEnum *)tde->data;
  int unread = seg->prox_cnt;

  while (unread-- > 0) {
    is_read_vint(seg->prox_in);
  }

  // "super" next
  if (!stde_next(tde)) {
    return false;
  }

  seg->prox_cnt = seg->freq;
  seg->position = 0;
  return true;
}
|
250
|
+
|
251
|
+
/*
 * Bulk read is not supported on positional enumerators: reports an ARG_ERROR
 * through eprintf() and returns -1. All arguments are ignored.
 */
int stpe_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
{
  (void)tde;
  (void)docs;
  (void)freqs;
  (void)req_num;

  eprintf(ARG_ERROR,
          "TermPosEnum does not handle processing multiple documents in one call. Use TermDocEnum instead.");
  return -1;
}
|
256
|
+
|
257
|
+
/*
 * Returns the next position of the term in the current document.
 *
 * Decrements prox_cnt, reads one vint from prox_in and adds it to the
 * running position, which is returned. There is no check that positions
 * remain; callers are presumably expected to call this at most freq times
 * per document - verify against callers.
 */
int stpe_next_position(TermDocEnum *tde)
{
  SegmentTermDocEnum *seg = (SegmentTermDocEnum *)tde->data;

  seg->prox_cnt--;
  seg->position += is_read_vint(seg->prox_in);
  return seg->position;
}
|
263
|
+
|
264
|
+
/*
 * Builds a positional enumerator over the SegmentReader in ir->data.
 *
 * Starts from stde_create() and then overrides the close, seek, next, read
 * and next_position hooks with the stpe_* versions, gives the payload its
 * own clone of the reader's prox_in stream, and installs the real
 * skip_prox/seek_prox hooks.
 */
TermDocEnum *stpe_create(IndexReader *ir)
{
  SegmentReader *sr = (SegmentReader *)ir->data;
  TermDocEnum *tde = stde_create(ir);
  SegmentTermDocEnum *seg = (SegmentTermDocEnum *)tde->data;

  // override the base enumerator's methods
  tde->close = &stpe_close;
  tde->seek = &stpe_seek;
  tde->next = &stpe_next;
  tde->read = &stpe_read;
  tde->next_position = &stpe_next_position;

  // positional state
  seg->prox_in = is_clone(sr->prox_in);
  seg->prox_cnt = 0;
  seg->position = 0;
  seg->skip_prox = &stpe_skip_prox;
  seg->seek_prox = &stpe_seek_prox;

  return tde;
}
|
283
|
+
|
284
|
+
/****************************************************************************
|
285
|
+
*
|
286
|
+
* MultiTermDocEnum
|
287
|
+
*
|
288
|
+
****************************************************************************/
|
289
|
+
|
290
|
+
/*
 * Releases a multi-reader enumerator.
 *
 * Closes every per-reader enumerator that was created (non-NULL slots of
 * irs_tde), destroys the stored term if there is one, and frees the irs_tde
 * array, the payload and tde itself. The irs and starts arrays are not
 * freed here.
 */
void mtde_close(TermDocEnum *tde)
{
  MultiTermDocEnum *multi = (MultiTermDocEnum *)tde->data;
  int idx;

  for (idx = 0; idx < multi->ir_cnt; idx++) {
    TermDocEnum *sub = multi->irs_tde[idx];
    if (sub != NULL) {
      sub->close(sub);
    }
  }

  if (multi->term != NULL) {
    term_destroy(multi->term);
  }

  free(multi->irs_tde);
  free(multi);
  free(tde);
}
|
304
|
+
|
305
|
+
/*
 * Seeks the multi-reader enumerator to term.
 *
 * Any previously stored term is destroyed and replaced by a new term built
 * with term_create() from term's field and text. base and pointer are reset
 * to 0 and curr_tde to NULL; per-reader enumerators are seeked later, when
 * mtde_term_docs() selects them.
 */
void mtde_seek(TermDocEnum *tde, Term *term)
{
  MultiTermDocEnum *multi = (MultiTermDocEnum *)tde->data;

  if (multi->term != NULL) {
    term_destroy(multi->term);
  }
  multi->term = term_create(term->field, term->text);

  multi->curr_tde = NULL;
  multi->pointer = 0;
  multi->base = 0;
}
|
314
|
+
|
315
|
+
/*
 * Default term_docs_from_reader hook: returns whatever the reader's
 * term_docs method produces for ir.
 */
TermDocEnum *mtde_term_docs_from_reader(IndexReader *ir)
{
  TermDocEnum *sub = ir->term_docs(ir);

  return sub;
}
|
319
|
+
|
320
|
+
/*
 * Returns the enumerator for sub-reader i, seeked to the stored term.
 *
 * Returns NULL when no term is stored. Otherwise the enumerator is created
 * on first use through the term_docs_from_reader hook and cached in
 * irs_tde[i]; every call re-seeks it to mtde->term.
 */
TermDocEnum *mtde_term_docs(MultiTermDocEnum *mtde, int i)
{
  TermDocEnum *sub;

  if (mtde->term == NULL) {
    return NULL;
  }

  sub = mtde->irs_tde[i];
  if (sub == NULL) {
    sub = mtde->term_docs_from_reader(mtde->irs[i]);
    mtde->irs_tde[i] = sub;
  }

  sub->seek(sub, mtde->term);
  return sub;
}
|
333
|
+
|
334
|
+
/*
 * Advances to the next document across all sub-readers.
 *
 * Tries the current sub-enumerator first. When there is none, or it is
 * exhausted, it moves on to the sub-reader at `pointer`: base becomes that
 * reader's start offset, its enumerator is obtained via mtde_term_docs(),
 * and the attempt repeats.
 *
 * Returns false once every sub-reader has been tried.
 */
bool mtde_next(TermDocEnum *tde)
{
  MultiTermDocEnum *multi = (MultiTermDocEnum *)tde->data;

  for (;;) {
    if (multi->curr_tde != NULL && multi->curr_tde->next(multi->curr_tde)) {
      return true;
    }

    if (multi->pointer >= multi->ir_cnt) {
      return false; // no more sub-readers
    }

    multi->base = multi->starts[multi->pointer];
    multi->curr_tde = mtde_term_docs(multi, multi->pointer);
    multi->pointer++;
  }
}
|
348
|
+
|
349
|
+
/*
 * Returns the current document number: the current sub-enumerator's doc_num
 * plus base. curr_tde is dereferenced without a NULL check.
 */
int mtde_doc_num(TermDocEnum *tde)
{
  MultiTermDocEnum *multi = (MultiTermDocEnum *)tde->data;
  TermDocEnum *sub = multi->curr_tde;

  return multi->base + sub->doc_num(sub);
}
|
354
|
+
|
355
|
+
/*
 * Returns the freq reported by the current sub-enumerator. curr_tde is
 * dereferenced without a NULL check.
 */
int mtde_freq(TermDocEnum *tde)
{
  MultiTermDocEnum *multi = (MultiTermDocEnum *)tde->data;
  TermDocEnum *sub = multi->curr_tde;

  return sub->freq(sub);
}
|
360
|
+
|
361
|
+
/*
 * Skips to the first document whose number is >= target_doc_num.
 *
 * While sub-readers remain: if the target lies before the start of the next
 * sub-reader, the current sub-enumerator is asked to skip to the target
 * (relative to base); success returns true. Otherwise the enumerator moves
 * on to the next sub-reader. Once all sub-readers have been selected, the
 * last one is asked to skip.
 *
 * The current sub-enumerator is only used when it is non-NULL. It is NULL
 * immediately after mtde_seek() and whenever mtde_term_docs() returns NULL
 * (no term stored), and the previous code dereferenced it unconditionally
 * inside the loop.
 *
 * Returns false when no sub-enumerator can reach the target.
 */
bool mtde_skip_to(TermDocEnum *tde, int target_doc_num)
{
  MultiTermDocEnum *mtde = (MultiTermDocEnum *)tde->data;
  TermDocEnum *curr;

  while (mtde->pointer < mtde->ir_cnt) {
    curr = mtde->curr_tde;
    if (curr != NULL &&
        (target_doc_num < mtde->starts[mtde->pointer]) &&
        curr->skip_to(curr, target_doc_num - mtde->base)) {
      return true;
    }

    // target is not in the current sub-reader: move on to the next one
    mtde->base = mtde->starts[mtde->pointer];
    mtde->curr_tde = mtde_term_docs(mtde, mtde->pointer);
    mtde->pointer++;
  }

  curr = mtde->curr_tde;
  if (curr == NULL) {
    return false;
  }
  return curr->skip_to(curr, target_doc_num - mtde->base);
}
|
380
|
+
|
381
|
+
/*
 * Bulk-reads up to req_num (doc, freq) pairs across all sub-readers.
 *
 * Each sub-enumerator's read hook fills the unused tail of docs[]/freqs[];
 * the doc numbers it wrote are then shifted by base. A sub-enumerator whose
 * read adds nothing is dropped and the next sub-reader is selected.
 *
 * Returns the number of pairs written: req_num, or fewer when all
 * sub-readers are exhausted.
 */
int mtde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
{
  MultiTermDocEnum *multi = (MultiTermDocEnum *)tde->data;
  int filled = 0;
  int got;
  int idx;

  for (;;) {
    // make sure there is a sub-enumerator to read from
    while (multi->curr_tde == NULL) {
      if (multi->pointer >= multi->ir_cnt) {
        return filled; // nothing left anywhere
      }
      multi->base = multi->starts[multi->pointer];
      multi->curr_tde = mtde_term_docs(multi, multi->pointer);
      multi->pointer++;
    }

    got = multi->curr_tde->read(multi->curr_tde,
                                &docs[filled], &freqs[filled],
                                req_num - filled);
    if (got == 0) { // none left in this segment
      multi->curr_tde = NULL;
      continue;
    }

    // adjust the newly read doc numbers
    for (idx = filled; idx < filled + got; idx++) {
      docs[idx] += multi->base;
    }
    filled += got;

    if (filled == req_num) {
      return filled;
    }
  }
}
|
409
|
+
|
410
|
+
/*
 * Builds an enumerator spanning ir_cnt sub-readers.
 *
 * irs and starts are stored as given (not copied). A zeroed array of ir_cnt
 * per-reader enumerator slots is allocated; the slots are filled on demand
 * by mtde_term_docs(). The payload is zeroed, so no term is stored until
 * mtde_seek() is called. next_position is NULL and the reader hook is the
 * term_docs one.
 */
TermDocEnum *mtde_create(IndexReader **irs, int *starts, int ir_cnt)
{
  TermDocEnum *tde = ALLOC(TermDocEnum);
  MultiTermDocEnum *multi;

  // method table
  tde->close = &mtde_close;
  tde->seek = &mtde_seek;
  tde->next = &mtde_next;
  tde->skip_to = &mtde_skip_to;
  tde->read = &mtde_read;
  tde->doc_num = &mtde_doc_num;
  tde->freq = &mtde_freq;
  tde->next_position = NULL;

  // multi-reader state
  multi = ALLOC(MultiTermDocEnum);
  ZEROSET(multi, MultiTermDocEnum, 1); // set all values to 0
  tde->data = multi;

  multi->irs = irs;
  multi->starts = starts;
  multi->ir_cnt = ir_cnt;
  multi->irs_tde = ALLOC_N(TermDocEnum *, ir_cnt);
  ZEROSET(multi->irs_tde, TermDocEnum *, ir_cnt);
  multi->term_docs_from_reader = &mtde_term_docs_from_reader;

  return tde;
}
|
434
|
+
|
435
|
+
/****************************************************************************
|
436
|
+
*
|
437
|
+
* MultiTermPosEnum
|
438
|
+
*
|
439
|
+
****************************************************************************/
|
440
|
+
|
441
|
+
/*
 * term_docs_from_reader hook for the positional variant: returns whatever
 * the reader's term_positions method produces for ir.
 */
TermDocEnum *mtpe_term_docs_from_reader(IndexReader *ir)
{
  TermDocEnum *sub = ir->term_positions(ir);

  return sub;
}
|
445
|
+
|
446
|
+
|
447
|
+
/*
 * Forwards next_position to the current sub-enumerator. curr_tde is
 * dereferenced without a NULL check.
 */
int mtpe_next_position(TermDocEnum *tde)
{
  MultiTermDocEnum *multi = (MultiTermDocEnum *)tde->data;
  TermDocEnum *sub = multi->curr_tde;

  return sub->next_position(sub);
}
|
452
|
+
|
453
|
+
/*
 * Builds the positional multi-reader enumerator: an mtde_create() enumerator
 * whose next_position hook is mtpe_next_position and whose per-reader
 * enumerators come from mtpe_term_docs_from_reader.
 */
TermDocEnum *mtpe_create(IndexReader **irs, int *starts, int ir_cnt)
{
  TermDocEnum *tde = mtde_create(irs, starts, ir_cnt);
  MultiTermDocEnum *multi = (MultiTermDocEnum *)tde->data;

  multi->term_docs_from_reader = &mtpe_term_docs_from_reader;
  tde->next_position = &mtpe_next_position;

  return tde;
}
|
460
|
+
|
461
|
+
/****************************************************************************
|
462
|
+
*
|
463
|
+
* MultipleTermDocPosEnum
|
464
|
+
*
|
465
|
+
****************************************************************************/
|
466
|
+
|
467
|
+
#define GET_MTDPE MultipleTermDocPosEnum *mtdpe = (MultipleTermDocPosEnum *)self->data
|
468
|
+
void tde_destroy(void *p) {
|
469
|
+
TermDocEnum *self = (TermDocEnum *)p;
|
470
|
+
self->close(self);
|
471
|
+
}
|
472
|
+
|
473
|
+
/*
 * Releases a multiple-term enumerator: clears and destroys its priority
 * queue, then frees the position buffer, the payload and self.
 */
void mtdpe_close(TermDocEnum *self)
{
  MultipleTermDocPosEnum *state = (MultipleTermDocPosEnum *)self->data;

  pq_clear(state->pq);
  pq_destroy(state->pq);

  free(state->pos_queue);
  free(state);
  free(self);
}
|
483
|
+
|
484
|
+
/*
 * seek is not supported on this enumerator: reports UNSUPPORTED_ERROR
 * through eprintf(). Both arguments are ignored.
 */
void mtdpe_seek(TermDocEnum *tde, Term *term)
{
  (void)tde;
  (void)term;

  eprintf(UNSUPPORTED_ERROR, "Unsupported op seek on MultipleTDPE");
}
|
486
|
+
|
487
|
+
/*
 * Advances to the next document that contains any of the terms.
 *
 * The document at the top of the priority queue is the next one. Every
 * queued enumerator positioned on that document contributes its freq and
 * all of its positions to pos_queue; each is then advanced and either
 * re-sifted in the queue or, if exhausted, popped and closed. The collected
 * positions are sorted with icmp_risky so mtdpe_next_position() can hand
 * them out in order.
 *
 * pos_queue is grown by doubling until it can hold every position collected
 * so far. The previous code doubled only once, which overflowed the buffer
 * whenever freq exceeded twice the old capacity.
 *
 * Returns false when the queue is empty, true otherwise.
 */
bool mtdpe_next(TermDocEnum *self)
{
  TermDocEnum *tde;
  int i = 0, freq = 0;
  int doc;
  GET_MTDPE;

  if (mtdpe->pq->count == 0) return false;

  tde = (TermDocEnum *)pq_top(mtdpe->pq);
  doc = tde->doc_num(tde);

  do {
    freq += tde->freq(tde);
    if (freq > mtdpe->pos_queue_capa) {
      // guard against a zero capacity, which doubling could never grow
      if (mtdpe->pos_queue_capa < 1) {
        mtdpe->pos_queue_capa = 1;
      }
      // keep doubling until all `freq` positions fit
      while (freq > mtdpe->pos_queue_capa) {
        mtdpe->pos_queue_capa *= 2;
      }
      REALLOC_N(mtdpe->pos_queue, int, mtdpe->pos_queue_capa);
    }

    // append this enumerator's positions after those already collected
    for (; i < freq; i++) {
      mtdpe->pos_queue[i] = tde->next_position(tde);
    }

    if (tde->next(tde)) {
      pq_down(mtdpe->pq); // restore heap order for the advanced enumerator
    } else {
      tde = pq_pop(mtdpe->pq); // exhausted: drop it from the queue
      tde->close(tde);
    }
    tde = (TermDocEnum *)pq_top(mtdpe->pq);
  } while ((mtdpe->pq->count > 0) && (tde->doc_num(tde) == doc));

  qsort(mtdpe->pos_queue, freq, sizeof(int), &icmp_risky);

  mtdpe->pos_queue_index = 0;
  mtdpe->freq = freq;
  mtdpe->doc_num = doc;

  return true;
}
|
527
|
+
|
528
|
+
/* Returns the doc_num recorded by the last successful mtdpe_next(). */
int mtdpe_doc_num(TermDocEnum *self)
{
  MultipleTermDocPosEnum *state = (MultipleTermDocPosEnum *)self->data;

  return state->doc_num;
}
|
530
|
+
|
531
|
+
/* Returns the combined freq recorded by the last successful mtdpe_next(). */
int mtdpe_freq(TermDocEnum *self)
{
  MultipleTermDocPosEnum *state = (MultipleTermDocPosEnum *)self->data;

  return state->freq;
}
|
533
|
+
|
534
|
+
|
535
|
+
bool tdpe_less_than(void *p1, void *p2)
|
536
|
+
{
|
537
|
+
return ((TermDocEnum *)p1)->doc_num((TermDocEnum *)p1) <
|
538
|
+
((TermDocEnum *)p2)->doc_num((TermDocEnum *)p2);
|
539
|
+
}
|
540
|
+
|
541
|
+
/*
 * Skips to the first document whose number is >= target_doc_num.
 *
 * While the enumerator at the top of the queue is positioned before the
 * target it is asked to skip to it; on success the queue is re-sifted, on
 * failure the enumerator is popped and closed. When the queue is empty or
 * its top has reached the target, the result of self->next() is returned.
 */
bool mtdpe_skip_to(TermDocEnum *self, int target_doc_num)
{
  MultipleTermDocPosEnum *state = (MultipleTermDocPosEnum *)self->data;
  TermDocEnum *top;

  for (;;) {
    top = pq_top(state->pq);
    if (top == NULL || top->doc_num(top) >= target_doc_num) {
      break;
    }

    if (top->skip_to(top, target_doc_num)) {
      pq_down(state->pq);
    } else {
      top = pq_pop(state->pq);
      top->close(top);
    }
  }

  return self->next(self);
}
|
556
|
+
|
557
|
+
/*
 * Bulk read is not supported on this enumerator: reports UNSUPPORTED_ERROR
 * through eprintf() and returns -1. All arguments are ignored.
 */
int mtdpe_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
{
  (void)tde;
  (void)docs;
  (void)freqs;
  (void)req_num;

  eprintf(UNSUPPORTED_ERROR, "Unsupported op read on MultipleTDPE");
  return -1;
}
|
562
|
+
|
563
|
+
/*
 * Returns the next entry of the sorted position buffer filled by
 * mtdpe_next() and advances pos_queue_index. No bounds check is made
 * against freq.
 */
int mtdpe_next_position(TermDocEnum *self)
{
  MultipleTermDocPosEnum *state = (MultipleTermDocPosEnum *)self->data;
  int position = state->pos_queue[state->pos_queue_index];

  state->pos_queue_index++;
  return position;
}
|
568
|
+
|
569
|
+
/*
 * Builds an enumerator over the union of t_cnt terms.
 *
 * For every term a positional enumerator is obtained with
 * ir_term_positions_for() and advanced once. Those positioned on a document
 * are pushed onto a priority queue ordered by document number
 * (tdpe_less_than). Those with no documents are closed immediately; the
 * previous code dropped them without closing, leaking the enumerator.
 *
 * The queue's free_elem callback is set to tde_destroy. The position buffer
 * starts with MTDPE_POS_QUEUE_INIT_CAPA entries and is grown by mtdpe_next().
 *
 * The returned enumerator is released through its close hook (mtdpe_close).
 */
TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt)
{
  TermDocEnum *self = ALLOC(TermDocEnum);
  MultipleTermDocPosEnum *mtdpe = ALLOC(MultipleTermDocPosEnum);
  PriorityQueue *pq;
  TermDocEnum *tpe;
  int i;

  self->close = &mtdpe_close;
  self->seek = &mtdpe_seek;
  self->next = &mtdpe_next;
  self->doc_num = &mtdpe_doc_num;
  self->freq = &mtdpe_freq;
  self->skip_to = &mtdpe_skip_to;
  self->read = &mtdpe_read;
  self->next_position = &mtdpe_next_position;

  ZEROSET(mtdpe, MultipleTermDocPosEnum, 1); // set all values to 0
  self->data = mtdpe;
  pq = mtdpe->pq = pq_create(t_cnt, &tdpe_less_than);
  mtdpe->pos_queue_capa = MTDPE_POS_QUEUE_INIT_CAPA;
  mtdpe->pos_queue = ALLOC_N(int, MTDPE_POS_QUEUE_INIT_CAPA);

  for (i = 0; i < t_cnt; i++) {
    tpe = ir_term_positions_for(ir, terms[i]);
    if (tpe->next(tpe)) {
      pq_push(pq, tpe);
    } else {
      tpe->close(tpe); // term has no documents: release it now
    }
  }
  pq->free_elem = &tde_destroy;

  return self;
}
|
599
|
+
|