ferret 0.3.2 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/termdocs.c ADDED
@@ -0,0 +1,599 @@
1
+ #include <index.h>
2
+ #include <string.h>
3
+
4
+ /****************************************************************************
5
+ *
6
+ * SegmentTermDocEnum
7
+ *
8
+ ****************************************************************************/
9
+
10
+
11
+ void stde_close(TermDocEnum *tde)
12
+ {
13
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
14
+
15
+ is_close(stde->freq_in);
16
+ if (stde->skip_in != NULL)
17
+ is_close(stde->skip_in);
18
+
19
+ free(stde);
20
+ free(tde);
21
+ }
22
+
23
+ void stde_seek_ti(TermDocEnum *tde, TermInfo *ti)
24
+ {
25
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
26
+ if (ti == NULL) {
27
+ stde->doc_freq = 0;
28
+ } else {
29
+ stde->count = 0;
30
+ stde->doc_freq = ti->doc_freq;
31
+ stde->doc_num = 0;
32
+ stde->skip_doc = 0;
33
+ stde->skip_count = 0;
34
+ stde->num_skips = stde->doc_freq / stde->skip_interval;
35
+ stde->freq_pointer = ti->freq_pointer;
36
+ stde->prox_pointer = ti->prox_pointer;
37
+ stde->skip_pointer = ti->freq_pointer + ti->skip_offset;
38
+ is_seek(stde->freq_in, ti->freq_pointer);
39
+ stde->have_skipped = false;
40
+ }
41
+ }
42
+
43
+ void stde_seek(TermDocEnum *tde, Term *term)
44
+ {
45
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
46
+ TermInfo *ti = tir_get_ti(stde->parent->tir, term);
47
+ stde_seek_ti(tde, ti);
48
+ ti_destroy(ti);
49
+ }
50
+
51
+ void stde_skip_prox(SegmentTermDocEnum *stde) { }
52
+ void stde_seek_prox(SegmentTermDocEnum *stde, int prox_pointer) { }
53
+
54
+ bool stde_next(TermDocEnum *tde)
55
+ {
56
+ int doc_code;
57
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
58
+ while (true) {
59
+
60
+ if (stde->count >= stde->doc_freq)
61
+ return false;
62
+
63
+ doc_code = is_read_vint(stde->freq_in);
64
+ stde->doc_num += doc_code >> 1; // shift off low bit
65
+ if ((doc_code & 1) != 0) { // if low bit is set
66
+ stde->freq = 1; // freq is one
67
+ } else {
68
+ stde->freq = is_read_vint(stde->freq_in); // else read freq
69
+ }
70
+
71
+ stde->count++;
72
+
73
+ if (stde->deleted_docs == NULL ||
74
+ bv_get(stde->deleted_docs, stde->doc_num) == 0)
75
+ break; // We found an undeleted doc so return
76
+
77
+ stde->skip_prox(stde);
78
+ }
79
+ return true;
80
+ }
81
+
82
+ int stde_doc_num(TermDocEnum *tde)
83
+ { return ((SegmentTermDocEnum *)tde->data)->doc_num; }
84
+
85
+ int stde_freq(TermDocEnum *tde)
86
+ { return ((SegmentTermDocEnum *)tde->data)->freq; }
87
+
88
+ bool stde_skip_to(TermDocEnum *tde, int target_doc_num)
89
+ {
90
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
91
+ if (stde->doc_freq >= stde->skip_interval) { // optimized case
92
+
93
+ if (stde->skip_in == NULL)
94
+ stde->skip_in = is_clone(stde->freq_in); // lazily clone
95
+
96
+ if (!stde->have_skipped) { // lazily seek skip stream
97
+ is_seek(stde->skip_in, stde->skip_pointer);
98
+ stde->have_skipped = true;
99
+ }
100
+
101
+ // scan skip data
102
+ int last_skip_doc = stde->skip_doc;
103
+ int last_freq_pointer = is_pos(stde->freq_in);
104
+ int last_prox_pointer = -1;
105
+ int num_skipped = -1 - (stde->count % stde->skip_interval);
106
+
107
+ while (target_doc_num > stde->skip_doc) {
108
+ last_skip_doc = stde->skip_doc;
109
+ last_freq_pointer = stde->freq_pointer;
110
+ last_prox_pointer = stde->prox_pointer;
111
+
112
+ if (stde->skip_doc != 0 && stde->skip_doc >= stde->doc_num)
113
+ num_skipped += stde->skip_interval;
114
+
115
+ if(stde->skip_count >= stde->num_skips)
116
+ break;
117
+
118
+ stde->skip_doc += is_read_vint(stde->skip_in);
119
+ stde->freq_pointer += is_read_vint(stde->skip_in);
120
+ stde->prox_pointer += is_read_vint(stde->skip_in);
121
+
122
+ stde->skip_count++;
123
+ }
124
+
125
+ // if we found something to skip, so skip it
126
+ if (last_freq_pointer > is_pos(stde->freq_in)) {
127
+ is_seek(stde->freq_in, last_freq_pointer);
128
+ stde->seek_prox(stde, last_prox_pointer);
129
+
130
+ stde->doc_num = last_skip_doc;
131
+ stde->count += num_skipped;
132
+ }
133
+ }
134
+
135
+ // done skipping, now just scan
136
+ do {
137
+ if (! tde->next(tde)) {
138
+ return false;
139
+ }
140
+ } while (target_doc_num > ((SegmentTermDocEnum *)tde->data)->doc_num);
141
+ return true;
142
+ }
143
+
144
+ int stde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
145
+ {
146
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
147
+ int i = 0, doc_code;
148
+ while (i < req_num && stde->count < stde->doc_freq) {
149
+ // manually inlined call to next() for speed
150
+ doc_code = is_read_vint(stde->freq_in);
151
+ stde->doc_num += doc_code >> 1; // shift off low bit
152
+ if ((doc_code & 1) != 0) // if low bit is set
153
+ stde->freq = 1; // freq is one
154
+ else
155
+ stde->freq = is_read_vint(stde->freq_in); // else read freq
156
+
157
+ stde->count++;
158
+
159
+ if (stde->deleted_docs == NULL ||
160
+ bv_get(stde->deleted_docs, stde->doc_num) == 0) {
161
+ docs[i] = stde->doc_num;
162
+ freqs[i] = stde->freq;
163
+ i++;
164
+ }
165
+ }
166
+ return i;
167
+ }
168
+
169
+ TermDocEnum *stde_create(IndexReader *ir)
170
+ {
171
+ SegmentReader *sr = (SegmentReader *)ir->data;
172
+ TermDocEnum *tde = ALLOC(TermDocEnum);
173
+ tde->seek = &stde_seek;
174
+ tde->doc_num = &stde_doc_num;
175
+ tde->freq = &stde_freq;
176
+ tde->next = &stde_next;
177
+ tde->read = &stde_read;
178
+ tde->skip_to = &stde_skip_to;
179
+ tde->next_position = NULL;
180
+ tde->close = &stde_close;
181
+
182
+ SegmentTermDocEnum *stde = ALLOC(SegmentTermDocEnum);
183
+ ZEROSET(stde, SegmentTermDocEnum, 1); // set all values to 0
184
+ tde->data = stde;
185
+ stde->parent = sr;
186
+ stde->freq_in = is_clone(sr->freq_in);
187
+ stde->deleted_docs = sr->deleted_docs;
188
+ stde->skip_interval = sr->tir->skip_interval;
189
+ stde->skip_in = NULL;
190
+ stde->have_skipped = false;
191
+ stde->skip_prox = &stde_skip_prox;
192
+ stde->seek_prox = &stde_seek_prox;
193
+ return tde;
194
+ }
195
+
196
+ /****************************************************************************
197
+ *
198
+ * SegmentTermPosEnum
199
+ *
200
+ ****************************************************************************/
201
+
202
+ void stpe_seek(TermDocEnum *tde, Term *term)
203
+ {
204
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
205
+ TermInfo *ti = tir_get_ti(stde->parent->tir, term);
206
+ stde_seek_ti(tde, ti);
207
+ if (ti != NULL) {
208
+ is_seek(stde->prox_in, ti->prox_pointer);
209
+ }
210
+ stde->prox_cnt = 0;
211
+ ti_destroy(ti);
212
+ }
213
+
214
+ void stpe_close(TermDocEnum *tde)
215
+ {
216
+ // super
217
+ is_close(((SegmentTermDocEnum *)tde->data)->prox_in);
218
+ ((SegmentTermDocEnum *)tde->data)->prox_in = NULL;
219
+ stde_close(tde);
220
+ }
221
+
222
+ void stpe_skip_prox(SegmentTermDocEnum *stde)
223
+ {
224
+ int i;
225
+ for (i = 0; i < stde->freq; i++)
226
+ is_read_vint(stde->prox_in);
227
+ }
228
+
229
+ void stpe_seek_prox(SegmentTermDocEnum *stde, int prox_pointer)
230
+ {
231
+ is_seek(stde->prox_in, prox_pointer);
232
+ stde->prox_cnt = 0;
233
+ }
234
+
235
+ bool stpe_next(TermDocEnum *tde)
236
+ {
237
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
238
+ int i;
239
+ for (i = 0; i < stde->prox_cnt; i++)
240
+ is_read_vint(stde->prox_in);
241
+
242
+ // if super
243
+ if (stde_next(tde)) {
244
+ stde->prox_cnt = stde->freq;
245
+ stde->position = 0;
246
+ return true;
247
+ }
248
+ return false;
249
+ }
250
+
251
+ int stpe_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
252
+ {
253
+ eprintf(ARG_ERROR, "TermPosEnum does not handle processing multiple documents in one call. Use TermDocEnum instead.");
254
+ return -1;
255
+ }
256
+
257
+ int stpe_next_position(TermDocEnum *tde)
258
+ {
259
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
260
+ stde->prox_cnt--;
261
+ return stde->position += is_read_vint(stde->prox_in);
262
+ }
263
+
264
+ TermDocEnum *stpe_create(IndexReader *ir)
265
+ {
266
+ SegmentReader *sr = (SegmentReader *)ir->data;
267
+ TermDocEnum *tde = stde_create(ir);
268
+ tde->close = &stpe_close;
269
+ tde->seek = &stpe_seek;
270
+ tde->next = &stpe_next;
271
+ tde->read = &stpe_read;
272
+ tde->next_position = &stpe_next_position;
273
+
274
+ SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
275
+ stde->prox_in = is_clone(sr->prox_in);
276
+ stde->prox_cnt = 0;
277
+ stde->position = 0;
278
+ stde->skip_prox = &stpe_skip_prox;
279
+ stde->seek_prox = &stpe_seek_prox;
280
+
281
+ return tde;
282
+ }
283
+
284
+ /****************************************************************************
285
+ *
286
+ * MultiTermDocEnum
287
+ *
288
+ ****************************************************************************/
289
+
290
+ void mtde_close(TermDocEnum *tde)
291
+ {
292
+ MultiTermDocEnum *mtde = (MultiTermDocEnum *)tde->data;
293
+ TermDocEnum *tmp_tde;
294
+ int i;
295
+ for (i = 0; i < mtde->ir_cnt; i++) {
296
+ if ((tmp_tde = mtde->irs_tde[i]) != NULL)
297
+ tmp_tde->close(tmp_tde);
298
+ }
299
+ if (mtde->term != NULL) term_destroy(mtde->term);
300
+ free(mtde->irs_tde);
301
+ free(mtde);
302
+ free(tde);
303
+ }
304
+
305
+ void mtde_seek(TermDocEnum *tde, Term *term)
306
+ {
307
+ MultiTermDocEnum *mtde = (MultiTermDocEnum *)tde->data;
308
+ if (mtde->term != NULL) term_destroy(mtde->term);
309
+ mtde->term = term_create(term->field, term->text);
310
+ mtde->base = 0;
311
+ mtde->pointer = 0;
312
+ mtde->curr_tde = NULL;
313
+ }
314
+
315
+ TermDocEnum *mtde_term_docs_from_reader(IndexReader *ir)
316
+ {
317
+ return ir->term_docs(ir);
318
+ }
319
+
320
+ TermDocEnum *mtde_term_docs(MultiTermDocEnum *mtde, int i)
321
+ {
322
+ if (mtde->term == NULL)
323
+ return NULL;
324
+
325
+ TermDocEnum *tde = mtde->irs_tde[i];
326
+ if (tde == NULL) {
327
+ tde = mtde->irs_tde[i] = mtde->term_docs_from_reader(mtde->irs[i]);
328
+ }
329
+
330
+ tde->seek(tde, mtde->term);
331
+ return tde;
332
+ }
333
+
334
+ bool mtde_next(TermDocEnum *tde)
335
+ {
336
+ MultiTermDocEnum *mtde = (MultiTermDocEnum *)tde->data;
337
+ if (mtde->curr_tde != NULL && mtde->curr_tde->next(mtde->curr_tde)) {
338
+ return true;
339
+ } else if (mtde->pointer < mtde->ir_cnt) {
340
+ mtde->base = mtde->starts[mtde->pointer];
341
+ mtde->curr_tde = mtde_term_docs(mtde, mtde->pointer);
342
+ mtde->pointer++;
343
+ return mtde_next(tde);
344
+ } else {
345
+ return false;
346
+ }
347
+ }
348
+
349
+ int mtde_doc_num(TermDocEnum *tde)
350
+ {
351
+ MultiTermDocEnum *mtde = (MultiTermDocEnum *)tde->data;
352
+ return mtde->base + mtde->curr_tde->doc_num(mtde->curr_tde);
353
+ }
354
+
355
+ int mtde_freq(TermDocEnum *tde)
356
+ {
357
+ MultiTermDocEnum *mtde = (MultiTermDocEnum *)tde->data;
358
+ return mtde->curr_tde->freq(mtde->curr_tde);
359
+ }
360
+
361
+ bool mtde_skip_to(TermDocEnum *tde, int target_doc_num)
362
+ {
363
+ MultiTermDocEnum *mtde = (MultiTermDocEnum *)tde->data;
364
+ while (mtde->pointer < mtde->ir_cnt) {
365
+ if ((target_doc_num < mtde->starts[mtde->pointer]) &&
366
+ (mtde->curr_tde->skip_to(mtde->curr_tde, target_doc_num - mtde->base))) {
367
+ return true;
368
+ }
369
+
370
+ mtde->base = mtde->starts[mtde->pointer];
371
+ mtde->curr_tde = mtde_term_docs(mtde, mtde->pointer);
372
+ mtde->pointer++;
373
+ }
374
+ if (mtde->curr_tde) {
375
+ return mtde->curr_tde->skip_to(mtde->curr_tde, target_doc_num - mtde->base);
376
+ } else {
377
+ return false;
378
+ }
379
+ }
380
+
381
+ int mtde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
382
+ {
383
+ int i, end = 0, last_end = 0, b;
384
+ MultiTermDocEnum *mtde = (MultiTermDocEnum *)tde->data;
385
+ while (true) {
386
+ while (mtde->curr_tde == NULL) {
387
+ if (mtde->pointer < mtde->ir_cnt) { // try next segment
388
+ mtde->base = mtde->starts[mtde->pointer];
389
+ mtde->curr_tde = mtde_term_docs(mtde, mtde->pointer++);
390
+ } else {
391
+ return end;
392
+ }
393
+ }
394
+ end += mtde->curr_tde->read(mtde->curr_tde,
395
+ &docs[last_end], &freqs[last_end], req_num - last_end);
396
+ if (end == last_end) { // none left in segment
397
+ mtde->curr_tde = NULL;
398
+ } else { // got some
399
+ b = mtde->base; // adjust doc numbers
400
+ for (i = last_end; i < end; i++)
401
+ docs[i] += b;
402
+ if (end == req_num)
403
+ return end;
404
+ else
405
+ last_end = end;
406
+ }
407
+ }
408
+ }
409
+
410
+ TermDocEnum *mtde_create(IndexReader **irs, int *starts, int ir_cnt)
411
+ {
412
+ TermDocEnum *tde = ALLOC(TermDocEnum);
413
+ tde->close = &mtde_close;
414
+ tde->seek = &mtde_seek;
415
+ tde->next = &mtde_next;
416
+ tde->doc_num = &mtde_doc_num;
417
+ tde->freq = &mtde_freq;
418
+ tde->skip_to = &mtde_skip_to;
419
+ tde->read = &mtde_read;
420
+ tde->next_position = NULL;
421
+
422
+ MultiTermDocEnum *mtde = ALLOC(MultiTermDocEnum);
423
+ ZEROSET(mtde, MultiTermDocEnum, 1); // set all values to 0
424
+ tde->data = mtde;
425
+ mtde->irs = irs;
426
+ mtde->starts = starts;
427
+ mtde->ir_cnt = ir_cnt;
428
+ mtde->irs_tde = ALLOC_N(TermDocEnum *, ir_cnt);
429
+ ZEROSET(mtde->irs_tde, TermDocEnum *, ir_cnt);
430
+ mtde->term_docs_from_reader = &mtde_term_docs_from_reader;
431
+
432
+ return tde;
433
+ }
434
+
435
+ /****************************************************************************
436
+ *
437
+ * MultiTermPosEnum
438
+ *
439
+ ****************************************************************************/
440
+
441
+ TermDocEnum *mtpe_term_docs_from_reader(IndexReader *ir)
442
+ {
443
+ return ir->term_positions(ir);
444
+ }
445
+
446
+
447
+ int mtpe_next_position(TermDocEnum *tde)
448
+ {
449
+ TermDocEnum *curr_tde = ((MultiTermDocEnum *)tde->data)->curr_tde;
450
+ return curr_tde->next_position(curr_tde);
451
+ }
452
+
453
+ TermDocEnum *mtpe_create(IndexReader **irs, int *starts, int ir_cnt)
454
+ {
455
+ TermDocEnum *tde = mtde_create(irs, starts, ir_cnt);
456
+ tde->next_position = &mtpe_next_position;
457
+ ((MultiTermDocEnum *)tde->data)->term_docs_from_reader = &mtpe_term_docs_from_reader;
458
+ return tde;
459
+ }
460
+
461
+ /****************************************************************************
462
+ *
463
+ * MultipleTermDocPosEnum
464
+ *
465
+ ****************************************************************************/
466
+
467
+ #define GET_MTDPE MultipleTermDocPosEnum *mtdpe = (MultipleTermDocPosEnum *)self->data
468
+ void tde_destroy(void *p) {
469
+ TermDocEnum *self = (TermDocEnum *)p;
470
+ self->close(self);
471
+ }
472
+
473
+ void mtdpe_close(TermDocEnum *self)
474
+ {
475
+ GET_MTDPE;
476
+
477
+ pq_clear(mtdpe->pq);
478
+ pq_destroy(mtdpe->pq);
479
+ free(mtdpe->pos_queue);
480
+ free(mtdpe);
481
+ free(self);
482
+ }
483
+
484
+ void mtdpe_seek(TermDocEnum *tde, Term *term)
485
+ { eprintf(UNSUPPORTED_ERROR, "Unsupported op seek on MultipleTDPE");}
486
+
487
+ bool mtdpe_next(TermDocEnum *self)
488
+ {
489
+ TermDocEnum *tde;
490
+ int i = 0, freq = 0;
491
+ int doc;
492
+ GET_MTDPE;
493
+
494
+ if (mtdpe->pq->count == 0) return false;
495
+
496
+ tde = (TermDocEnum *)pq_top(mtdpe->pq);
497
+ doc = tde->doc_num(tde);
498
+
499
+ do {
500
+ freq += tde->freq(tde);
501
+ if (freq > mtdpe->pos_queue_capa) {
502
+ mtdpe->pos_queue_capa *= 2;
503
+ REALLOC_N(mtdpe->pos_queue, int, mtdpe->pos_queue_capa);
504
+ }
505
+
506
+ for (; i < freq; i++) {
507
+ mtdpe->pos_queue[i] = tde->next_position(tde);
508
+ }
509
+
510
+ if (tde->next(tde)) {
511
+ pq_down(mtdpe->pq);
512
+ } else {
513
+ tde = pq_pop(mtdpe->pq);
514
+ tde->close(tde);
515
+ }
516
+ tde = (TermDocEnum *)pq_top(mtdpe->pq);
517
+ } while ((mtdpe->pq->count > 0) && (tde->doc_num(tde) == doc));
518
+
519
+ qsort(mtdpe->pos_queue, freq, sizeof(int), &icmp_risky);
520
+
521
+ mtdpe->pos_queue_index = 0;
522
+ mtdpe->freq = freq;
523
+ mtdpe->doc_num = doc;
524
+
525
+ return true;
526
+ }
527
+
528
+ int mtdpe_doc_num(TermDocEnum *self)
529
+ { return ((MultipleTermDocPosEnum *)self->data)->doc_num; }
530
+
531
+ int mtdpe_freq(TermDocEnum *self)
532
+ { return ((MultipleTermDocPosEnum *)self->data)->freq; }
533
+
534
+
535
+ bool tdpe_less_than(void *p1, void *p2)
536
+ {
537
+ return ((TermDocEnum *)p1)->doc_num((TermDocEnum *)p1) <
538
+ ((TermDocEnum *)p2)->doc_num((TermDocEnum *)p2);
539
+ }
540
+
541
+ bool mtdpe_skip_to(TermDocEnum *self, int target_doc_num)
542
+ {
543
+ GET_MTDPE;
544
+ TermDocEnum *tde;
545
+ while ((tde = pq_top(mtdpe->pq)) != NULL &&
546
+ (target_doc_num > tde->doc_num(tde))) {
547
+ if (tde->skip_to(tde, target_doc_num)) {
548
+ pq_down(mtdpe->pq);
549
+ } else {
550
+ tde = pq_pop(mtdpe->pq);
551
+ tde->close(tde);
552
+ }
553
+ }
554
+ return self->next(self);
555
+ }
556
+
557
+ int mtdpe_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
558
+ {
559
+ eprintf(UNSUPPORTED_ERROR, "Unsupported op read on MultipleTDPE");
560
+ return -1;
561
+ }
562
+
563
+ int mtdpe_next_position(TermDocEnum *self)
564
+ {
565
+ GET_MTDPE;
566
+ return mtdpe->pos_queue[mtdpe->pos_queue_index++];
567
+ }
568
+
569
+ TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt)
570
+ {
571
+ TermDocEnum *self = ALLOC(TermDocEnum);
572
+ MultipleTermDocPosEnum *mtdpe = ALLOC(MultipleTermDocPosEnum);
573
+ PriorityQueue *pq;
574
+ TermDocEnum *tpe;
575
+ int i;
576
+
577
+ self->close = &mtdpe_close;
578
+ self->seek = &mtdpe_seek;
579
+ self->next = &mtdpe_next;
580
+ self->doc_num = &mtdpe_doc_num;
581
+ self->freq = &mtdpe_freq;
582
+ self->skip_to = &mtdpe_skip_to;
583
+ self->read = &mtdpe_read;
584
+ self->next_position = &mtdpe_next_position;
585
+
586
+ ZEROSET(mtdpe, MultipleTermDocPosEnum, 1); // set all values to 0
587
+ self->data = mtdpe;
588
+ pq = mtdpe->pq = pq_create(t_cnt, &tdpe_less_than);
589
+ mtdpe->pos_queue_capa = MTDPE_POS_QUEUE_INIT_CAPA;
590
+ mtdpe->pos_queue = ALLOC_N(int, MTDPE_POS_QUEUE_INIT_CAPA);
591
+ for (i = 0; i < t_cnt; i++) {
592
+ tpe = ir_term_positions_for(ir, terms[i]);
593
+ if (tpe->next(tpe)) pq_push(pq, tpe);
594
+ }
595
+ pq->free_elem = &tde_destroy;
596
+
597
+ return self;
598
+ }
599
+