ferret 0.3.2 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (141) hide show
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/termdocs.c ADDED
@@ -0,0 +1,599 @@
1
+ #include <index.h>
2
+ #include <string.h>
3
+
4
+ /****************************************************************************
5
+ *
6
+ * SegmentTermDocEnum
7
+ *
8
+ ****************************************************************************/
9
+
10
+
11
+ void stde_close(TermDocEnum *tde)
12
+ {
13
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
14
+
15
+ is_close(stde->freq_in);
16
+ if (stde->skip_in != NULL)
17
+ is_close(stde->skip_in);
18
+
19
+ free(stde);
20
+ free(tde);
21
+ }
22
+
23
+ void stde_seek_ti(TermDocEnum *tde, TermInfo *ti)
24
+ {
25
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
26
+ if (ti == NULL) {
27
+ stde->doc_freq = 0;
28
+ } else {
29
+ stde->count = 0;
30
+ stde->doc_freq = ti->doc_freq;
31
+ stde->doc_num = 0;
32
+ stde->skip_doc = 0;
33
+ stde->skip_count = 0;
34
+ stde->num_skips = stde->doc_freq / stde->skip_interval;
35
+ stde->freq_pointer = ti->freq_pointer;
36
+ stde->prox_pointer = ti->prox_pointer;
37
+ stde->skip_pointer = ti->freq_pointer + ti->skip_offset;
38
+ is_seek(stde->freq_in, ti->freq_pointer);
39
+ stde->have_skipped = false;
40
+ }
41
+ }
42
+
43
+ void stde_seek(TermDocEnum *tde, Term *term)
44
+ {
45
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
46
+ TermInfo *ti = tir_get_ti(stde->parent->tir, term);
47
+ stde_seek_ti(tde, ti);
48
+ ti_destroy(ti);
49
+ }
50
+
51
+ void stde_skip_prox(SegmentTermDocEnum *stde) { }
52
+ void stde_seek_prox(SegmentTermDocEnum *stde, int prox_pointer) { }
53
+
54
+ bool stde_next(TermDocEnum *tde)
55
+ {
56
+ int doc_code;
57
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
58
+ while (true) {
59
+
60
+ if (stde->count >= stde->doc_freq)
61
+ return false;
62
+
63
+ doc_code = is_read_vint(stde->freq_in);
64
+ stde->doc_num += doc_code >> 1; // shift off low bit
65
+ if ((doc_code & 1) != 0) { // if low bit is set
66
+ stde->freq = 1; // freq is one
67
+ } else {
68
+ stde->freq = is_read_vint(stde->freq_in); // else read freq
69
+ }
70
+
71
+ stde->count++;
72
+
73
+ if (stde->deleted_docs == NULL ||
74
+ bv_get(stde->deleted_docs, stde->doc_num) == 0)
75
+ break; // We found an undeleted doc so return
76
+
77
+ stde->skip_prox(stde);
78
+ }
79
+ return true;
80
+ }
81
+
82
+ int stde_doc_num(TermDocEnum *tde)
83
+ { return ((SegmentTermDocEnum *)tde->data)->doc_num; }
84
+
85
+ int stde_freq(TermDocEnum *tde)
86
+ { return ((SegmentTermDocEnum *)tde->data)->freq; }
87
+
88
+ bool stde_skip_to(TermDocEnum *tde, int target_doc_num)
89
+ {
90
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
91
+ if (stde->doc_freq >= stde->skip_interval) { // optimized case
92
+
93
+ if (stde->skip_in == NULL)
94
+ stde->skip_in = is_clone(stde->freq_in); // lazily clone
95
+
96
+ if (!stde->have_skipped) { // lazily seek skip stream
97
+ is_seek(stde->skip_in, stde->skip_pointer);
98
+ stde->have_skipped = true;
99
+ }
100
+
101
+ // scan skip data
102
+ int last_skip_doc = stde->skip_doc;
103
+ int last_freq_pointer = is_pos(stde->freq_in);
104
+ int last_prox_pointer = -1;
105
+ int num_skipped = -1 - (stde->count % stde->skip_interval);
106
+
107
+ while (target_doc_num > stde->skip_doc) {
108
+ last_skip_doc = stde->skip_doc;
109
+ last_freq_pointer = stde->freq_pointer;
110
+ last_prox_pointer = stde->prox_pointer;
111
+
112
+ if (stde->skip_doc != 0 && stde->skip_doc >= stde->doc_num)
113
+ num_skipped += stde->skip_interval;
114
+
115
+ if(stde->skip_count >= stde->num_skips)
116
+ break;
117
+
118
+ stde->skip_doc += is_read_vint(stde->skip_in);
119
+ stde->freq_pointer += is_read_vint(stde->skip_in);
120
+ stde->prox_pointer += is_read_vint(stde->skip_in);
121
+
122
+ stde->skip_count++;
123
+ }
124
+
125
+ // if we found something to skip, so skip it
126
+ if (last_freq_pointer > is_pos(stde->freq_in)) {
127
+ is_seek(stde->freq_in, last_freq_pointer);
128
+ stde->seek_prox(stde, last_prox_pointer);
129
+
130
+ stde->doc_num = last_skip_doc;
131
+ stde->count += num_skipped;
132
+ }
133
+ }
134
+
135
+ // done skipping, now just scan
136
+ do {
137
+ if (! tde->next(tde)) {
138
+ return false;
139
+ }
140
+ } while (target_doc_num > ((SegmentTermDocEnum *)tde->data)->doc_num);
141
+ return true;
142
+ }
143
+
144
+ int stde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
145
+ {
146
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
147
+ int i = 0, doc_code;
148
+ while (i < req_num && stde->count < stde->doc_freq) {
149
+ // manually inlined call to next() for speed
150
+ doc_code = is_read_vint(stde->freq_in);
151
+ stde->doc_num += doc_code >> 1; // shift off low bit
152
+ if ((doc_code & 1) != 0) // if low bit is set
153
+ stde->freq = 1; // freq is one
154
+ else
155
+ stde->freq = is_read_vint(stde->freq_in); // else read freq
156
+
157
+ stde->count++;
158
+
159
+ if (stde->deleted_docs == NULL ||
160
+ bv_get(stde->deleted_docs, stde->doc_num) == 0) {
161
+ docs[i] = stde->doc_num;
162
+ freqs[i] = stde->freq;
163
+ i++;
164
+ }
165
+ }
166
+ return i;
167
+ }
168
+
169
+ TermDocEnum *stde_create(IndexReader *ir)
170
+ {
171
+ SegmentReader *sr = (SegmentReader *)ir->data;
172
+ TermDocEnum *tde = ALLOC(TermDocEnum);
173
+ tde->seek = &stde_seek;
174
+ tde->doc_num = &stde_doc_num;
175
+ tde->freq = &stde_freq;
176
+ tde->next = &stde_next;
177
+ tde->read = &stde_read;
178
+ tde->skip_to = &stde_skip_to;
179
+ tde->next_position = NULL;
180
+ tde->close = &stde_close;
181
+
182
+ SegmentTermDocEnum *stde = ALLOC(SegmentTermDocEnum);
183
+ ZEROSET(stde, SegmentTermDocEnum, 1); // set all values to 0
184
+ tde->data = stde;
185
+ stde->parent = sr;
186
+ stde->freq_in = is_clone(sr->freq_in);
187
+ stde->deleted_docs = sr->deleted_docs;
188
+ stde->skip_interval = sr->tir->skip_interval;
189
+ stde->skip_in = NULL;
190
+ stde->have_skipped = false;
191
+ stde->skip_prox = &stde_skip_prox;
192
+ stde->seek_prox = &stde_seek_prox;
193
+ return tde;
194
+ }
195
+
196
+ /****************************************************************************
197
+ *
198
+ * SegmentTermPosEnum
199
+ *
200
+ ****************************************************************************/
201
+
202
+ void stpe_seek(TermDocEnum *tde, Term *term)
203
+ {
204
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
205
+ TermInfo *ti = tir_get_ti(stde->parent->tir, term);
206
+ stde_seek_ti(tde, ti);
207
+ if (ti != NULL) {
208
+ is_seek(stde->prox_in, ti->prox_pointer);
209
+ }
210
+ stde->prox_cnt = 0;
211
+ ti_destroy(ti);
212
+ }
213
+
214
+ void stpe_close(TermDocEnum *tde)
215
+ {
216
+ // super
217
+ is_close(((SegmentTermDocEnum *)tde->data)->prox_in);
218
+ ((SegmentTermDocEnum *)tde->data)->prox_in = NULL;
219
+ stde_close(tde);
220
+ }
221
+
222
+ void stpe_skip_prox(SegmentTermDocEnum *stde)
223
+ {
224
+ int i;
225
+ for (i = 0; i < stde->freq; i++)
226
+ is_read_vint(stde->prox_in);
227
+ }
228
+
229
+ void stpe_seek_prox(SegmentTermDocEnum *stde, int prox_pointer)
230
+ {
231
+ is_seek(stde->prox_in, prox_pointer);
232
+ stde->prox_cnt = 0;
233
+ }
234
+
235
+ bool stpe_next(TermDocEnum *tde)
236
+ {
237
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
238
+ int i;
239
+ for (i = 0; i < stde->prox_cnt; i++)
240
+ is_read_vint(stde->prox_in);
241
+
242
+ // if super
243
+ if (stde_next(tde)) {
244
+ stde->prox_cnt = stde->freq;
245
+ stde->position = 0;
246
+ return true;
247
+ }
248
+ return false;
249
+ }
250
+
251
+ int stpe_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
252
+ {
253
+ eprintf(ARG_ERROR, "TermPosEnum does not handle processing multiple documents in one call. Use TermDocEnum instead.");
254
+ return -1;
255
+ }
256
+
257
+ int stpe_next_position(TermDocEnum *tde)
258
+ {
259
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
260
+ stde->prox_cnt--;
261
+ return stde->position += is_read_vint(stde->prox_in);
262
+ }
263
+
264
+ TermDocEnum *stpe_create(IndexReader *ir)
265
+ {
266
+ SegmentReader *sr = (SegmentReader *)ir->data;
267
+ TermDocEnum *tde = stde_create(ir);
268
+ tde->close = &stpe_close;
269
+ tde->seek = &stpe_seek;
270
+ tde->next = &stpe_next;
271
+ tde->read = &stpe_read;
272
+ tde->next_position = &stpe_next_position;
273
+
274
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
275
+ stde->prox_in = is_clone(sr->prox_in);
276
+ stde->prox_cnt = 0;
277
+ stde->position = 0;
278
+ stde->skip_prox = &stpe_skip_prox;
279
+ stde->seek_prox = &stpe_seek_prox;
280
+
281
+ return tde;
282
+ }
283
+
284
+ /****************************************************************************
285
+ *
286
+ * MultiTermDocEnum
287
+ *
288
+ ****************************************************************************/
289
+
290
+ void mtde_close(TermDocEnum *tde)
291
+ {
292
+ MultiTermDocEnum *mtde = (MultiTermDocEnum *)tde->data;
293
+ TermDocEnum *tmp_tde;
294
+ int i;
295
+ for (i = 0; i < mtde->ir_cnt; i++) {
296
+ if ((tmp_tde = mtde->irs_tde[i]) != NULL)
297
+ tmp_tde->close(tmp_tde);
298
+ }
299
+ if (mtde->term != NULL) term_destroy(mtde->term);
300
+ free(mtde->irs_tde);
301
+ free(mtde);
302
+ free(tde);
303
+ }
304
+
305
+ void mtde_seek(TermDocEnum *tde, Term *term)
306
+ {
307
+ MultiTermDocEnum *mtde = (MultiTermDocEnum *)tde->data;
308
+ if (mtde->term != NULL) term_destroy(mtde->term);
309
+ mtde->term = term_create(term->field, term->text);
310
+ mtde->base = 0;
311
+ mtde->pointer = 0;
312
+ mtde->curr_tde = NULL;
313
+ }
314
+
315
+ TermDocEnum *mtde_term_docs_from_reader(IndexReader *ir)
316
+ {
317
+ return ir->term_docs(ir);
318
+ }
319
+
320
+ TermDocEnum *mtde_term_docs(MultiTermDocEnum *mtde, int i)
321
+ {
322
+ if (mtde->term == NULL)
323
+ return NULL;
324
+
325
+ TermDocEnum *tde = mtde->irs_tde[i];
326
+ if (tde == NULL) {
327
+ tde = mtde->irs_tde[i] = mtde->term_docs_from_reader(mtde->irs[i]);
328
+ }
329
+
330
+ tde->seek(tde, mtde->term);
331
+ return tde;
332
+ }
333
+
334
+ bool mtde_next(TermDocEnum *tde)
335
+ {
336
+ MultiTermDocEnum *mtde = (MultiTermDocEnum *)tde->data;
337
+ if (mtde->curr_tde != NULL && mtde->curr_tde->next(mtde->curr_tde)) {
338
+ return true;
339
+ } else if (mtde->pointer < mtde->ir_cnt) {
340
+ mtde->base = mtde->starts[mtde->pointer];
341
+ mtde->curr_tde = mtde_term_docs(mtde, mtde->pointer);
342
+ mtde->pointer++;
343
+ return mtde_next(tde);
344
+ } else {
345
+ return false;
346
+ }
347
+ }
348
+
349
+ int mtde_doc_num(TermDocEnum *tde)
350
+ {
351
+ MultiTermDocEnum *mtde = (MultiTermDocEnum *)tde->data;
352
+ return mtde->base + mtde->curr_tde->doc_num(mtde->curr_tde);
353
+ }
354
+
355
+ int mtde_freq(TermDocEnum *tde)
356
+ {
357
+ MultiTermDocEnum *mtde = (MultiTermDocEnum *)tde->data;
358
+ return mtde->curr_tde->freq(mtde->curr_tde);
359
+ }
360
+
361
+ bool mtde_skip_to(TermDocEnum *tde, int target_doc_num)
362
+ {
363
+ MultiTermDocEnum *mtde = (MultiTermDocEnum *)tde->data;
364
+ while (mtde->pointer < mtde->ir_cnt) {
365
+ if ((target_doc_num < mtde->starts[mtde->pointer]) &&
366
+ (mtde->curr_tde->skip_to(mtde->curr_tde, target_doc_num - mtde->base))) {
367
+ return true;
368
+ }
369
+
370
+ mtde->base = mtde->starts[mtde->pointer];
371
+ mtde->curr_tde = mtde_term_docs(mtde, mtde->pointer);
372
+ mtde->pointer++;
373
+ }
374
+ if (mtde->curr_tde) {
375
+ return mtde->curr_tde->skip_to(mtde->curr_tde, target_doc_num - mtde->base);
376
+ } else {
377
+ return false;
378
+ }
379
+ }
380
+
381
+ int mtde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
382
+ {
383
+ int i, end = 0, last_end = 0, b;
384
+ MultiTermDocEnum *mtde = (MultiTermDocEnum *)tde->data;
385
+ while (true) {
386
+ while (mtde->curr_tde == NULL) {
387
+ if (mtde->pointer < mtde->ir_cnt) { // try next segment
388
+ mtde->base = mtde->starts[mtde->pointer];
389
+ mtde->curr_tde = mtde_term_docs(mtde, mtde->pointer++);
390
+ } else {
391
+ return end;
392
+ }
393
+ }
394
+ end += mtde->curr_tde->read(mtde->curr_tde,
395
+ &docs[last_end], &freqs[last_end], req_num - last_end);
396
+ if (end == last_end) { // none left in segment
397
+ mtde->curr_tde = NULL;
398
+ } else { // got some
399
+ b = mtde->base; // adjust doc numbers
400
+ for (i = last_end; i < end; i++)
401
+ docs[i] += b;
402
+ if (end == req_num)
403
+ return end;
404
+ else
405
+ last_end = end;
406
+ }
407
+ }
408
+ }
409
+
410
+ TermDocEnum *mtde_create(IndexReader **irs, int *starts, int ir_cnt)
411
+ {
412
+ TermDocEnum *tde = ALLOC(TermDocEnum);
413
+ tde->close = &mtde_close;
414
+ tde->seek = &mtde_seek;
415
+ tde->next = &mtde_next;
416
+ tde->doc_num = &mtde_doc_num;
417
+ tde->freq = &mtde_freq;
418
+ tde->skip_to = &mtde_skip_to;
419
+ tde->read = &mtde_read;
420
+ tde->next_position = NULL;
421
+
422
+ MultiTermDocEnum *mtde = ALLOC(MultiTermDocEnum);
423
+ ZEROSET(mtde, MultiTermDocEnum, 1); // set all values to 0
424
+ tde->data = mtde;
425
+ mtde->irs = irs;
426
+ mtde->starts = starts;
427
+ mtde->ir_cnt = ir_cnt;
428
+ mtde->irs_tde = ALLOC_N(TermDocEnum *, ir_cnt);
429
+ ZEROSET(mtde->irs_tde, TermDocEnum *, ir_cnt);
430
+ mtde->term_docs_from_reader = &mtde_term_docs_from_reader;
431
+
432
+ return tde;
433
+ }
434
+
435
+ /****************************************************************************
436
+ *
437
+ * MultiTermPosEnum
438
+ *
439
+ ****************************************************************************/
440
+
441
+ TermDocEnum *mtpe_term_docs_from_reader(IndexReader *ir)
442
+ {
443
+ return ir->term_positions(ir);
444
+ }
445
+
446
+
447
+ int mtpe_next_position(TermDocEnum *tde)
448
+ {
449
+ TermDocEnum *curr_tde = ((MultiTermDocEnum *)tde->data)->curr_tde;
450
+ return curr_tde->next_position(curr_tde);
451
+ }
452
+
453
+ TermDocEnum *mtpe_create(IndexReader **irs, int *starts, int ir_cnt)
454
+ {
455
+ TermDocEnum *tde = mtde_create(irs, starts, ir_cnt);
456
+ tde->next_position = &mtpe_next_position;
457
+ ((MultiTermDocEnum *)tde->data)->term_docs_from_reader = &mtpe_term_docs_from_reader;
458
+ return tde;
459
+ }
460
+
461
+ /****************************************************************************
462
+ *
463
+ * MultipleTermDocPosEnum
464
+ *
465
+ ****************************************************************************/
466
+
467
+ #define GET_MTDPE MultipleTermDocPosEnum *mtdpe = (MultipleTermDocPosEnum *)self->data
468
+ void tde_destroy(void *p) {
469
+ TermDocEnum *self = (TermDocEnum *)p;
470
+ self->close(self);
471
+ }
472
+
473
+ void mtdpe_close(TermDocEnum *self)
474
+ {
475
+ GET_MTDPE;
476
+
477
+ pq_clear(mtdpe->pq);
478
+ pq_destroy(mtdpe->pq);
479
+ free(mtdpe->pos_queue);
480
+ free(mtdpe);
481
+ free(self);
482
+ }
483
+
484
+ void mtdpe_seek(TermDocEnum *tde, Term *term)
485
+ { eprintf(UNSUPPORTED_ERROR, "Unsupported op seek on MultipleTDPE");}
486
+
487
+ bool mtdpe_next(TermDocEnum *self)
488
+ {
489
+ TermDocEnum *tde;
490
+ int i = 0, freq = 0;
491
+ int doc;
492
+ GET_MTDPE;
493
+
494
+ if (mtdpe->pq->count == 0) return false;
495
+
496
+ tde = (TermDocEnum *)pq_top(mtdpe->pq);
497
+ doc = tde->doc_num(tde);
498
+
499
+ do {
500
+ freq += tde->freq(tde);
501
+ if (freq > mtdpe->pos_queue_capa) {
502
+ mtdpe->pos_queue_capa *= 2;
503
+ REALLOC_N(mtdpe->pos_queue, int, mtdpe->pos_queue_capa);
504
+ }
505
+
506
+ for (; i < freq; i++) {
507
+ mtdpe->pos_queue[i] = tde->next_position(tde);
508
+ }
509
+
510
+ if (tde->next(tde)) {
511
+ pq_down(mtdpe->pq);
512
+ } else {
513
+ tde = pq_pop(mtdpe->pq);
514
+ tde->close(tde);
515
+ }
516
+ tde = (TermDocEnum *)pq_top(mtdpe->pq);
517
+ } while ((mtdpe->pq->count > 0) && (tde->doc_num(tde) == doc));
518
+
519
+ qsort(mtdpe->pos_queue, freq, sizeof(int), &icmp_risky);
520
+
521
+ mtdpe->pos_queue_index = 0;
522
+ mtdpe->freq = freq;
523
+ mtdpe->doc_num = doc;
524
+
525
+ return true;
526
+ }
527
+
528
+ int mtdpe_doc_num(TermDocEnum *self)
529
+ { return ((MultipleTermDocPosEnum *)self->data)->doc_num; }
530
+
531
+ int mtdpe_freq(TermDocEnum *self)
532
+ { return ((MultipleTermDocPosEnum *)self->data)->freq; }
533
+
534
+
535
+ bool tdpe_less_than(void *p1, void *p2)
536
+ {
537
+ return ((TermDocEnum *)p1)->doc_num((TermDocEnum *)p1) <
538
+ ((TermDocEnum *)p2)->doc_num((TermDocEnum *)p2);
539
+ }
540
+
541
+ bool mtdpe_skip_to(TermDocEnum *self, int target_doc_num)
542
+ {
543
+ GET_MTDPE;
544
+ TermDocEnum *tde;
545
+ while ((tde = pq_top(mtdpe->pq)) != NULL &&
546
+ (target_doc_num > tde->doc_num(tde))) {
547
+ if (tde->skip_to(tde, target_doc_num)) {
548
+ pq_down(mtdpe->pq);
549
+ } else {
550
+ tde = pq_pop(mtdpe->pq);
551
+ tde->close(tde);
552
+ }
553
+ }
554
+ return self->next(self);
555
+ }
556
+
557
+ int mtdpe_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
558
+ {
559
+ eprintf(UNSUPPORTED_ERROR, "Unsupported op read on MultipleTDPE");
560
+ return -1;
561
+ }
562
+
563
+ int mtdpe_next_position(TermDocEnum *self)
564
+ {
565
+ GET_MTDPE;
566
+ return mtdpe->pos_queue[mtdpe->pos_queue_index++];
567
+ }
568
+
569
+ TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt)
570
+ {
571
+ TermDocEnum *self = ALLOC(TermDocEnum);
572
+ MultipleTermDocPosEnum *mtdpe = ALLOC(MultipleTermDocPosEnum);
573
+ PriorityQueue *pq;
574
+ TermDocEnum *tpe;
575
+ int i;
576
+
577
+ self->close = &mtdpe_close;
578
+ self->seek = &mtdpe_seek;
579
+ self->next = &mtdpe_next;
580
+ self->doc_num = &mtdpe_doc_num;
581
+ self->freq = &mtdpe_freq;
582
+ self->skip_to = &mtdpe_skip_to;
583
+ self->read = &mtdpe_read;
584
+ self->next_position = &mtdpe_next_position;
585
+
586
+ ZEROSET(mtdpe, MultipleTermDocPosEnum, 1); // set all values to 0
587
+ self->data = mtdpe;
588
+ pq = mtdpe->pq = pq_create(t_cnt, &tdpe_less_than);
589
+ mtdpe->pos_queue_capa = MTDPE_POS_QUEUE_INIT_CAPA;
590
+ mtdpe->pos_queue = ALLOC_N(int, MTDPE_POS_QUEUE_INIT_CAPA);
591
+ for (i = 0; i < t_cnt; i++) {
592
+ tpe = ir_term_positions_for(ir, terms[i]);
593
+ if (tpe->next(tpe)) pq_push(pq, tpe);
594
+ }
595
+ pq->free_elem = &tde_destroy;
596
+
597
+ return self;
598
+ }
599
+