jk-ferret 0.11.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (228) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +90 -0
  4. data/RELEASE_CHANGES +137 -0
  5. data/RELEASE_NOTES +60 -0
  6. data/Rakefile +443 -0
  7. data/TODO +109 -0
  8. data/TUTORIAL +231 -0
  9. data/bin/ferret-browser +79 -0
  10. data/ext/BZLIB_blocksort.c +1094 -0
  11. data/ext/BZLIB_bzlib.c +1578 -0
  12. data/ext/BZLIB_compress.c +672 -0
  13. data/ext/BZLIB_crctable.c +104 -0
  14. data/ext/BZLIB_decompress.c +626 -0
  15. data/ext/BZLIB_huffman.c +205 -0
  16. data/ext/BZLIB_randtable.c +84 -0
  17. data/ext/STEMMER_api.c +66 -0
  18. data/ext/STEMMER_libstemmer.c +93 -0
  19. data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
  20. data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
  21. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  22. data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
  25. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  26. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  27. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
  29. data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
  30. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  31. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  32. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  33. data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
  34. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  35. data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
  36. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  37. data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
  38. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  39. data/ext/STEMMER_stem_UTF_8_german.c +509 -0
  40. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  41. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  42. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  43. data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
  44. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  45. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  46. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  47. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  48. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  49. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  50. data/ext/STEMMER_utilities.c +478 -0
  51. data/ext/analysis.c +1710 -0
  52. data/ext/analysis.h +266 -0
  53. data/ext/api.h +26 -0
  54. data/ext/array.c +125 -0
  55. data/ext/array.h +62 -0
  56. data/ext/bitvector.c +96 -0
  57. data/ext/bitvector.h +594 -0
  58. data/ext/bzlib.h +282 -0
  59. data/ext/bzlib_private.h +503 -0
  60. data/ext/compound_io.c +384 -0
  61. data/ext/config.h +52 -0
  62. data/ext/document.c +159 -0
  63. data/ext/document.h +63 -0
  64. data/ext/except.c +102 -0
  65. data/ext/except.h +176 -0
  66. data/ext/extconf.rb +15 -0
  67. data/ext/ferret.c +416 -0
  68. data/ext/ferret.h +94 -0
  69. data/ext/field_index.c +262 -0
  70. data/ext/field_index.h +52 -0
  71. data/ext/filter.c +157 -0
  72. data/ext/fs_store.c +493 -0
  73. data/ext/global.c +458 -0
  74. data/ext/global.h +302 -0
  75. data/ext/hash.c +524 -0
  76. data/ext/hash.h +515 -0
  77. data/ext/hashset.c +192 -0
  78. data/ext/hashset.h +215 -0
  79. data/ext/header.h +58 -0
  80. data/ext/helper.c +63 -0
  81. data/ext/helper.h +21 -0
  82. data/ext/index.c +6804 -0
  83. data/ext/index.h +935 -0
  84. data/ext/internal.h +1019 -0
  85. data/ext/lang.c +10 -0
  86. data/ext/lang.h +68 -0
  87. data/ext/libstemmer.h +79 -0
  88. data/ext/mempool.c +88 -0
  89. data/ext/mempool.h +43 -0
  90. data/ext/modules.h +190 -0
  91. data/ext/multimapper.c +351 -0
  92. data/ext/multimapper.h +60 -0
  93. data/ext/posh.c +1006 -0
  94. data/ext/posh.h +973 -0
  95. data/ext/priorityqueue.c +149 -0
  96. data/ext/priorityqueue.h +155 -0
  97. data/ext/q_boolean.c +1621 -0
  98. data/ext/q_const_score.c +162 -0
  99. data/ext/q_filtered_query.c +212 -0
  100. data/ext/q_fuzzy.c +280 -0
  101. data/ext/q_match_all.c +149 -0
  102. data/ext/q_multi_term.c +673 -0
  103. data/ext/q_parser.c +3103 -0
  104. data/ext/q_phrase.c +1206 -0
  105. data/ext/q_prefix.c +98 -0
  106. data/ext/q_range.c +682 -0
  107. data/ext/q_span.c +2390 -0
  108. data/ext/q_term.c +337 -0
  109. data/ext/q_wildcard.c +167 -0
  110. data/ext/r_analysis.c +2626 -0
  111. data/ext/r_index.c +3468 -0
  112. data/ext/r_qparser.c +635 -0
  113. data/ext/r_search.c +4490 -0
  114. data/ext/r_store.c +513 -0
  115. data/ext/r_utils.c +1131 -0
  116. data/ext/ram_store.c +476 -0
  117. data/ext/scanner.c +895 -0
  118. data/ext/scanner.h +36 -0
  119. data/ext/scanner_mb.c +6701 -0
  120. data/ext/scanner_utf8.c +4415 -0
  121. data/ext/search.c +1864 -0
  122. data/ext/search.h +953 -0
  123. data/ext/similarity.c +151 -0
  124. data/ext/similarity.h +89 -0
  125. data/ext/sort.c +786 -0
  126. data/ext/stem_ISO_8859_1_danish.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  128. data/ext/stem_ISO_8859_1_english.h +16 -0
  129. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  130. data/ext/stem_ISO_8859_1_french.h +16 -0
  131. data/ext/stem_ISO_8859_1_german.h +16 -0
  132. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  133. data/ext/stem_ISO_8859_1_italian.h +16 -0
  134. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  135. data/ext/stem_ISO_8859_1_porter.h +16 -0
  136. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  137. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  138. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  139. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  140. data/ext/stem_KOI8_R_russian.h +16 -0
  141. data/ext/stem_UTF_8_danish.h +16 -0
  142. data/ext/stem_UTF_8_dutch.h +16 -0
  143. data/ext/stem_UTF_8_english.h +16 -0
  144. data/ext/stem_UTF_8_finnish.h +16 -0
  145. data/ext/stem_UTF_8_french.h +16 -0
  146. data/ext/stem_UTF_8_german.h +16 -0
  147. data/ext/stem_UTF_8_hungarian.h +16 -0
  148. data/ext/stem_UTF_8_italian.h +16 -0
  149. data/ext/stem_UTF_8_norwegian.h +16 -0
  150. data/ext/stem_UTF_8_porter.h +16 -0
  151. data/ext/stem_UTF_8_portuguese.h +16 -0
  152. data/ext/stem_UTF_8_romanian.h +16 -0
  153. data/ext/stem_UTF_8_russian.h +16 -0
  154. data/ext/stem_UTF_8_spanish.h +16 -0
  155. data/ext/stem_UTF_8_swedish.h +16 -0
  156. data/ext/stem_UTF_8_turkish.h +16 -0
  157. data/ext/stopwords.c +410 -0
  158. data/ext/store.c +698 -0
  159. data/ext/store.h +799 -0
  160. data/ext/symbol.c +10 -0
  161. data/ext/symbol.h +23 -0
  162. data/ext/term_vectors.c +73 -0
  163. data/ext/threading.h +31 -0
  164. data/ext/win32.h +62 -0
  165. data/lib/ferret.rb +30 -0
  166. data/lib/ferret/browser.rb +246 -0
  167. data/lib/ferret/browser/s/global.js +192 -0
  168. data/lib/ferret/browser/s/style.css +148 -0
  169. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  170. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  171. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  172. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  173. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  174. data/lib/ferret/browser/views/layout.rhtml +22 -0
  175. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  176. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  177. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  178. data/lib/ferret/browser/webrick.rb +14 -0
  179. data/lib/ferret/document.rb +130 -0
  180. data/lib/ferret/field_infos.rb +44 -0
  181. data/lib/ferret/field_symbol.rb +87 -0
  182. data/lib/ferret/index.rb +973 -0
  183. data/lib/ferret/number_tools.rb +157 -0
  184. data/lib/ferret/version.rb +3 -0
  185. data/setup.rb +1555 -0
  186. data/test/long_running/largefile/tc_largefile.rb +46 -0
  187. data/test/test_all.rb +5 -0
  188. data/test/test_helper.rb +29 -0
  189. data/test/test_installed.rb +1 -0
  190. data/test/threading/number_to_spoken.rb +132 -0
  191. data/test/threading/thread_safety_index_test.rb +88 -0
  192. data/test/threading/thread_safety_read_write_test.rb +73 -0
  193. data/test/threading/thread_safety_test.rb +133 -0
  194. data/test/unit/analysis/tc_analyzer.rb +550 -0
  195. data/test/unit/analysis/tc_token_stream.rb +653 -0
  196. data/test/unit/index/tc_index.rb +867 -0
  197. data/test/unit/index/tc_index_reader.rb +699 -0
  198. data/test/unit/index/tc_index_writer.rb +447 -0
  199. data/test/unit/index/th_doc.rb +332 -0
  200. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  201. data/test/unit/search/tc_filter.rb +156 -0
  202. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  203. data/test/unit/search/tc_index_searcher.rb +67 -0
  204. data/test/unit/search/tc_multi_searcher.rb +128 -0
  205. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  206. data/test/unit/search/tc_search_and_sort.rb +179 -0
  207. data/test/unit/search/tc_sort.rb +49 -0
  208. data/test/unit/search/tc_sort_field.rb +27 -0
  209. data/test/unit/search/tc_spans.rb +190 -0
  210. data/test/unit/search/tm_searcher.rb +436 -0
  211. data/test/unit/store/tc_fs_store.rb +115 -0
  212. data/test/unit/store/tc_ram_store.rb +35 -0
  213. data/test/unit/store/tm_store.rb +34 -0
  214. data/test/unit/store/tm_store_lock.rb +68 -0
  215. data/test/unit/tc_document.rb +81 -0
  216. data/test/unit/tc_field_symbol.rb +26 -0
  217. data/test/unit/ts_analysis.rb +2 -0
  218. data/test/unit/ts_index.rb +2 -0
  219. data/test/unit/ts_largefile.rb +4 -0
  220. data/test/unit/ts_query_parser.rb +2 -0
  221. data/test/unit/ts_search.rb +2 -0
  222. data/test/unit/ts_store.rb +2 -0
  223. data/test/unit/ts_utils.rb +2 -0
  224. data/test/unit/utils/tc_bit_vector.rb +295 -0
  225. data/test/unit/utils/tc_number_tools.rb +117 -0
  226. data/test/unit/utils/tc_priority_queue.rb +106 -0
  227. data/test/utils/content_generator.rb +226 -0
  228. metadata +319 -0
data/ext/search.c ADDED
@@ -0,0 +1,1864 @@
1
+ #include <string.h>
2
+ #include <limits.h>
3
+ #include "search.h"
4
+ #include "array.h"
5
+ #include "internal.h"
6
+
7
+ /***************************************************************************
8
+ *
9
+ * Explanation - Used to give details for query scores
10
+ *
11
+ ***************************************************************************/
12
+
13
+ Explanation *expl_new(float value, const char *description, ...)
14
+ {
15
+ Explanation *expl = ALLOC(Explanation);
16
+
17
+ va_list args;
18
+ va_start(args, description);
19
+ expl->description = vstrfmt(description, args);
20
+ va_end(args);
21
+
22
+ expl->value = value;
23
+ expl->details = ary_new_type_capa(Explanation *,
24
+ EXPLANATION_DETAILS_START_SIZE);
25
+ return expl;
26
+ }
27
+
28
+ void expl_destroy(Explanation *expl)
29
+ {
30
+ ary_destroy((void **)expl->details, (free_ft)expl_destroy);
31
+ free(expl->description);
32
+ free(expl);
33
+ }
34
+
35
+ Explanation *expl_add_detail(Explanation *expl, Explanation *detail)
36
+ {
37
+ ary_push(expl->details, detail);
38
+ return expl;
39
+ }
40
+
41
+ char *expl_to_s_depth(Explanation *expl, int depth)
42
+ {
43
+ int i;
44
+ char *buffer = ALLOC_N(char, depth * 2 + 1);
45
+ const int num_details = ary_size(expl->details);
46
+
47
+ memset(buffer, ' ', sizeof(char) * depth * 2);
48
+ buffer[depth*2] = 0;
49
+
50
+ buffer = estrcat(buffer, strfmt("%f = %s\n",
51
+ expl->value, expl->description));
52
+ for (i = 0; i < num_details; i++) {
53
+ buffer = estrcat(buffer, expl_to_s_depth(expl->details[i], depth + 1));
54
+ }
55
+
56
+ return buffer;
57
+ }
58
+
59
+ char *expl_to_html(Explanation *expl)
60
+ {
61
+ int i;
62
+ char *buffer;
63
+ const int num_details = ary_size(expl->details);
64
+
65
+ buffer = strfmt("<ul>\n<li>%f = %s</li>\n", expl->value, expl->description);
66
+
67
+ for (i = 0; i < num_details; i++) {
68
+ estrcat(buffer, expl_to_html(expl->details[i]));
69
+ }
70
+
71
+ REALLOC_N(buffer, char, strlen(buffer) + 10);
72
+ return strcat(buffer, "</ul>\n");
73
+ }
74
+
75
+ /***************************************************************************
76
+ *
77
+ * Hit
78
+ *
79
+ ***************************************************************************/
80
+
81
+ static bool hit_less_than(const Hit *hit1, const Hit *hit2)
82
+ {
83
+ if (hit1->score == hit2->score) {
84
+ return hit1->doc > hit2->doc;
85
+ }
86
+ else {
87
+ return hit1->score < hit1->score;
88
+ }
89
+ }
90
+
91
+ static bool hit_lt(Hit *hit1, Hit *hit2)
92
+ {
93
+ if (hit1->score == hit2->score) {
94
+ return hit1->doc > hit2->doc;
95
+ }
96
+ else {
97
+ return hit1->score < hit2->score;
98
+ }
99
+ }
100
+
101
+ static void hit_pq_down(PriorityQueue *pq)
102
+ {
103
+ register int i = 1;
104
+ register int j = 2; /* i << 1; */
105
+ register int k = 3; /* j + 1; */
106
+ Hit **heap = (Hit **)pq->heap;
107
+ Hit *node = heap[i]; /* save top node */
108
+
109
+ if ((k <= pq->size) && hit_lt(heap[k], heap[j])) {
110
+ j = k;
111
+ }
112
+
113
+ while ((j <= pq->size) && hit_lt(heap[j], node)) {
114
+ heap[i] = heap[j]; /* shift up child */
115
+ i = j;
116
+ j = i << 1;
117
+ k = j + 1;
118
+ if ((k <= pq->size) && hit_lt(heap[k], heap[j])) {
119
+ j = k;
120
+ }
121
+ }
122
+ heap[i] = node;
123
+ }
124
+
125
+ static Hit *hit_pq_pop(PriorityQueue *pq)
126
+ {
127
+ if (pq->size > 0) {
128
+ Hit **heap = (Hit **)pq->heap;
129
+ Hit *result = heap[1]; /* save first value */
130
+ heap[1] = heap[pq->size]; /* move last to first */
131
+ heap[pq->size] = NULL;
132
+ pq->size--;
133
+ hit_pq_down(pq); /* adjust heap */
134
+ return result;
135
+ }
136
+ else {
137
+ return NULL;
138
+ }
139
+ }
140
+
141
+ static void hit_pq_up(PriorityQueue *pq)
142
+ {
143
+ Hit **heap = (Hit **)pq->heap;
144
+ Hit *node;
145
+ int i = pq->size;
146
+ int j = i >> 1;
147
+ node = heap[i];
148
+
149
+ while ((j > 0) && hit_lt(node, heap[j])) {
150
+ heap[i] = heap[j];
151
+ i = j;
152
+ j = j >> 1;
153
+ }
154
+ heap[i] = node;
155
+ }
156
+
157
+ static void hit_pq_insert(PriorityQueue *pq, Hit *hit)
158
+ {
159
+ if (pq->size < pq->capa) {
160
+ Hit *new_hit = ALLOC(Hit);
161
+ memcpy(new_hit, hit, sizeof(Hit));
162
+ pq->size++;
163
+ if (pq->size >= pq->mem_capa) {
164
+ pq->mem_capa <<= 1;
165
+ REALLOC_N(pq->heap, void *, pq->mem_capa);
166
+ }
167
+ pq->heap[pq->size] = new_hit;
168
+ hit_pq_up(pq);
169
+ }
170
+ else if (pq->size > 0 && hit_lt((Hit *)pq->heap[1], hit)) {
171
+ memcpy(pq->heap[1], hit, sizeof(Hit));
172
+ hit_pq_down(pq);
173
+ }
174
+ }
175
+
176
+ static void hit_pq_multi_insert(PriorityQueue *pq, Hit *hit)
177
+ {
178
+ hit_pq_insert(pq, hit);
179
+ free(hit);
180
+ }
181
+
182
+ /***************************************************************************
183
+ *
184
+ * TopDocs
185
+ *
186
+ ***************************************************************************/
187
+
188
+ TopDocs *td_new(int total_hits, int size, Hit **hits, float max_score)
189
+ {
190
+ TopDocs *td = ALLOC(TopDocs);
191
+ td->total_hits = total_hits;
192
+ td->size = size;
193
+ td->hits = hits;
194
+ td->max_score = max_score;
195
+ return td;
196
+ }
197
+
198
+ void td_destroy(TopDocs *td)
199
+ {
200
+ int i;
201
+
202
+ for (i = 0; i < td->size; i++) {
203
+ free(td->hits[i]);
204
+ }
205
+ free(td->hits);
206
+ free(td);
207
+ }
208
+
209
+ char *td_to_s(TopDocs *td)
210
+ {
211
+ int i;
212
+ Hit *hit;
213
+ char *buffer = strfmt("%d hits sorted by <score, doc_num>\n",
214
+ td->total_hits);
215
+ for (i = 0; i < td->size; i++) {
216
+ hit = td->hits[i];
217
+ estrcat(buffer, strfmt("\t%d:%f\n", hit->doc, hit->score));
218
+ }
219
+ return buffer;
220
+ }
221
+
222
+ /***************************************************************************
223
+ *
224
+ * Weight
225
+ *
226
+ ***************************************************************************/
227
+
228
+ Query *w_get_query(Weight *self)
229
+ {
230
+ return self->query;
231
+ }
232
+
233
+ float w_get_value(Weight *self)
234
+ {
235
+ return self->value;
236
+ }
237
+
238
+ float w_sum_of_squared_weights(Weight *self)
239
+ {
240
+ self->qweight = self->idf * self->query->boost;
241
+ return self->qweight * self->qweight; /* square it */
242
+ }
243
+
244
+ void w_normalize(Weight *self, float normalization_factor)
245
+ {
246
+ self->qnorm = normalization_factor;
247
+ self->qweight *= normalization_factor; /* normalize query weight */
248
+ self->value = self->qweight * self->idf;/* idf for document */
249
+ }
250
+
251
+ void w_destroy(Weight *self)
252
+ {
253
+ q_deref(self->query);
254
+ free(self);
255
+ }
256
+
257
+ Weight *w_create(size_t size, Query *query)
258
+ {
259
+ Weight *self = (Weight *)ecalloc(size);
260
+ #ifdef DEBUG
261
+ if (size < sizeof(Weight)) {
262
+ RAISE(FERRET_ERROR, "size of weight <%d> should be at least <%d>",
263
+ (int)size, (int)sizeof(Weight));
264
+ }
265
+ #endif
266
+ REF(query);
267
+ self->query = query;
268
+ self->get_query = &w_get_query;
269
+ self->get_value = &w_get_value;
270
+ self->normalize = &w_normalize;
271
+ self->destroy = &w_destroy;
272
+ self->sum_of_squared_weights = &w_sum_of_squared_weights;
273
+ return self;
274
+ }
275
+
276
+ /***************************************************************************
277
+ *
278
+ * Query
279
+ *
280
+ ***************************************************************************/
281
+
282
/*
 * Printable names of the query types. q_get_query_name() indexes this table
 * directly with a QueryType value, so the entries presumably have to stay in
 * the same order as the QueryType enum (declared elsewhere).
 */
static const char *QUERY_NAMES[] = {
    /* plain queries */
    "TermQuery",
    "MultiTermQuery",
    "BooleanQuery",
    "PhraseQuery",
    "ConstantScoreQuery",
    "FilteredQuery",
    "MatchAllQuery",
    "RangeQuery",
    "WildCardQuery",
    "FuzzyQuery",
    "PrefixQuery",
    /* span queries */
    "SpanTermQuery",
    "SpanMultiTermQuery",
    "SpanPrefixQuery",
    "SpanFirstQuery",
    "SpanOrQuery",
    "SpanNotQuery",
    "SpanNearQuery"
};
302
+
303
+ static const char *UNKNOWN_QUERY_NAME = "UnkownQuery";
304
+
305
+ const char *q_get_query_name(QueryType type) {
306
+ if (type >= NELEMS(QUERY_NAMES)) {
307
+ return UNKNOWN_QUERY_NAME;
308
+ }
309
+ else {
310
+ return QUERY_NAMES[type];
311
+ }
312
+ }
313
+
314
+ static Query *q_rewrite(Query *self, IndexReader *ir)
315
+ {
316
+ (void)ir;
317
+ self->ref_cnt++;
318
+ return self;
319
+ }
320
+
321
+ static void q_extract_terms(Query *self, HashSet *terms)
322
+ {
323
+ /* do nothing by default */
324
+ (void)self;
325
+ (void)terms;
326
+ }
327
+
328
+ Similarity *q_get_similarity_i(Query *self, Searcher *searcher)
329
+ {
330
+ (void)self;
331
+ return searcher->get_similarity(searcher);
332
+ }
333
+
334
+ void q_destroy_i(Query *self)
335
+ {
336
+ free(self);
337
+ }
338
+
339
+ void q_deref(Query *self)
340
+ {
341
+ if (--(self->ref_cnt) == 0) {
342
+ self->destroy_i(self);
343
+ }
344
+ }
345
+
346
+ Weight *q_create_weight_unsup(Query *self, Searcher *searcher)
347
+ {
348
+ (void)self;
349
+ (void)searcher;
350
+ RAISE(UNSUPPORTED_ERROR,
351
+ "Create weight is unsupported for this type of query");
352
+ return NULL;
353
+ }
354
+
355
+ Weight *q_weight(Query *self, Searcher *searcher)
356
+ {
357
+ Query *query = searcher->rewrite(searcher, self);
358
+ Weight *weight = query->create_weight_i(query, searcher);
359
+ float sum = weight->sum_of_squared_weights(weight);
360
+ Similarity *sim = query->get_similarity(query, searcher);
361
+ float norm = sim_query_norm(sim, sum);
362
+ q_deref(query);
363
+
364
+ weight->normalize(weight, norm);
365
+ return self->weight = weight;
366
+ }
367
+
368
+ #define BQ(query) ((BooleanQuery *)(query))
369
+ Query *q_combine(Query **queries, int q_cnt)
370
+ {
371
+ int i;
372
+ Query *q, *ret_q;
373
+ HashSet *uniques = hs_new((hash_ft)&q_hash, (eq_ft)&q_eq, NULL);
374
+
375
+ for (i = 0; i < q_cnt; i++) {
376
+ q = queries[i];
377
+ if (q->type == BOOLEAN_QUERY) {
378
+ int j;
379
+ bool splittable = true;
380
+ if (BQ(q)->coord_disabled == false) {
381
+ splittable = false;
382
+ }
383
+ else {
384
+ for (j = 0; j < BQ(q)->clause_cnt; j++) {
385
+ if (BQ(q)->clauses[j]->occur != BC_SHOULD) {
386
+ splittable = false;
387
+ break;
388
+ }
389
+ }
390
+ }
391
+ if (splittable) {
392
+ for (j = 0; j < BQ(q)->clause_cnt; j++) {
393
+ Query *sub_q = BQ(q)->clauses[j]->query;
394
+ hs_add(uniques, sub_q);
395
+ }
396
+ }
397
+ else {
398
+ hs_add(uniques, q);
399
+ }
400
+ }
401
+ else {
402
+ hs_add(uniques, q);
403
+ }
404
+ }
405
+ if (uniques->size == 1) {
406
+ ret_q = (Query *)uniques->first->elem;
407
+ REF(ret_q);
408
+ }
409
+ else {
410
+ HashSetEntry *hse;
411
+ ret_q = bq_new(true);
412
+ for (hse = uniques->first; hse; hse = hse->next) {
413
+ q = (Query *)hse->elem;
414
+ bq_add_query(ret_q, q, BC_SHOULD);
415
+ }
416
+ }
417
+ hs_destroy(uniques);
418
+
419
+ return ret_q;
420
+ }
421
+
422
+ unsigned long q_hash(Query *self)
423
+ {
424
+ return (self->hash(self) << 5) | self->type;
425
+ }
426
+
427
+ int q_eq(Query *self, Query *o)
428
+ {
429
+ return (self == o)
430
+ || ((self->type == o->type)
431
+ && (self->boost == o->boost)
432
+ && self->eq(self, o));
433
+ }
434
+
435
+ static MatchVector *q_get_matchv_i(Query *self, MatchVector *mv, TermVector *tv)
436
+ {
437
+ /* be default we don't add any matches */
438
+ (void)self; (void)tv;
439
+ return mv;
440
+ }
441
+
442
+ Query *q_create(size_t size)
443
+ {
444
+ Query *self = (Query *)ecalloc(size);
445
+ #ifdef DEBUG
446
+ if (size < sizeof(Query)) {
447
+ RAISE(FERRET_ERROR, "Size of a query <%d> should never be smaller than "
448
+ "the size of a Query struct <%d>", (int)size, (int)sizeof(Query));
449
+ }
450
+ #endif
451
+ self->boost = 1.0;
452
+ self->rewrite = &q_rewrite;
453
+ self->get_similarity = &q_get_similarity_i;
454
+ self->extract_terms = &q_extract_terms;
455
+ self->get_matchv_i = &q_get_matchv_i;
456
+ self->weight = NULL;
457
+ self->ref_cnt = 1;
458
+ return self;
459
+ }
460
+
461
+ /***************************************************************************
462
+ *
463
+ * Scorer
464
+ *
465
+ ***************************************************************************/
466
+
467
+ void scorer_destroy_i(Scorer *scorer)
468
+ {
469
+ free(scorer);
470
+ }
471
+
472
+ Scorer *scorer_create(size_t size, Similarity *similarity)
473
+ {
474
+ Scorer *self = (Scorer *)ecalloc(size);
475
+ #ifdef DEBUG
476
+ if (size < sizeof(Scorer)) {
477
+ RAISE(FERRET_ERROR, "size of scorer <%d> should be at least <%d>",
478
+ (int)size, (int)sizeof(Scorer));
479
+ }
480
+ #endif
481
+ self->destroy = &scorer_destroy_i;
482
+ self->similarity = similarity;
483
+ return self;
484
+ }
485
+
486
+ bool scorer_less_than(void *p1, void *p2)
487
+ {
488
+ Scorer *s1 = (Scorer *)p1;
489
+ Scorer *s2 = (Scorer *)p2;
490
+ return s1->score(s1) < s2->score(s2);
491
+ }
492
+
493
+ bool scorer_doc_less_than(const Scorer *s1, const Scorer *s2)
494
+ {
495
+ return s1->doc < s2->doc;
496
+ }
497
+
498
+ int scorer_doc_cmp(const void *p1, const void *p2)
499
+ {
500
+ return (*(Scorer **)p1)->doc - (*(Scorer **)p2)->doc;
501
+ }
502
+
503
+ /***************************************************************************
504
+ *
505
+ * Highlighter
506
+ *
507
+ ***************************************************************************/
508
+
509
+ /* ** MatchRange ** */
510
+ static int match_range_cmp(const void *p1, const void *p2)
511
+ {
512
+ int diff = ((MatchRange *)p1)->start - ((MatchRange *)p2)->start;
513
+ if (diff != 0) {
514
+ return diff;
515
+ }
516
+ else {
517
+ return ((MatchRange *)p2)->end - ((MatchRange *)p1)->end;
518
+ }
519
+ }
520
+
521
+
522
+
523
+ /* ** MatchVector ** */
524
+ MatchVector *matchv_new()
525
+ {
526
+ MatchVector *matchv = ALLOC(MatchVector);
527
+
528
+ matchv->size = 0;
529
+ matchv->capa = MATCH_VECTOR_INIT_CAPA;
530
+ matchv->matches = ALLOC_N(MatchRange, MATCH_VECTOR_INIT_CAPA);
531
+
532
+ return matchv;
533
+ }
534
+
535
+ MatchVector *matchv_add(MatchVector *self, int start, int end)
536
+ {
537
+ if (self->size >= self->capa) {
538
+ self->capa <<= 1;
539
+ REALLOC_N(self->matches, MatchRange, self->capa);
540
+ }
541
+ self->matches[self->size].start = start;
542
+ self->matches[self->size].end = end;
543
+ self->matches[self->size].score = 1.0;
544
+ self->size++;
545
+ return self;
546
+ }
547
+
548
+ MatchVector *matchv_sort(MatchVector *self)
549
+ {
550
+ qsort(self->matches, self->size, sizeof(MatchRange), &match_range_cmp);
551
+ return self;
552
+ }
553
+
554
+ MatchVector *matchv_compact(MatchVector *self)
555
+ {
556
+ int left, right;
557
+ matchv_sort(self);
558
+ for (right = left = 0; right < self->size; right++) {
559
+ /* Note the end + 1. This compacts a range 3:5 and 6:8 inleft 3:8 */
560
+ if (self->matches[right].start > self->matches[left].end + 1) {
561
+ left++;
562
+ self->matches[left].start = self->matches[right].start;
563
+ self->matches[left].end = self->matches[right].end;
564
+ self->matches[left].score = self->matches[right].score;
565
+ }
566
+ else if (self->matches[right].end > self->matches[left].end) {
567
+ self->matches[left].end = self->matches[right].end;
568
+ }
569
+ else {
570
+ self->matches[left].score += self->matches[right].score;
571
+ }
572
+ }
573
+ self->size = left + 1;
574
+ return self;
575
+ }
576
+
577
+ MatchVector *matchv_compact_with_breaks(MatchVector *self)
578
+ {
579
+ int left, right;
580
+ matchv_sort(self);
581
+ for (right = left = 0; right < self->size; right++) {
582
+ /* Note: no end + 1. Unlike above won't compact ranges 3:5 and 6:8 */
583
+ if (self->matches[right].start > self->matches[left].end) {
584
+ left++;
585
+ self->matches[left].start = self->matches[right].start;
586
+ self->matches[left].end = self->matches[right].end;
587
+ self->matches[left].score = self->matches[right].score;
588
+ }
589
+ else if (self->matches[right].end > self->matches[left].end) {
590
+ self->matches[left].end = self->matches[right].end;
591
+ self->matches[left].score += self->matches[right].score;
592
+ }
593
+ else if (right > left) {
594
+ self->matches[left].score += self->matches[right].score;
595
+ }
596
+ }
597
+ self->size = left + 1;
598
+ return self;
599
+ }
600
+
601
+
602
+ static MatchVector *matchv_set_offsets(MatchVector *mv, Offset *offsets)
603
+ {
604
+ int i;
605
+ for (i = 0; i < mv->size; i++) {
606
+ mv->matches[i].start_offset = offsets[mv->matches[i].start].start;
607
+ mv->matches[i].end_offset = offsets[mv->matches[i].end].end;
608
+ }
609
+ return mv;
610
+ }
611
+
612
+ void matchv_destroy(MatchVector *self)
613
+ {
614
+ free(self->matches);
615
+ free(self);
616
+ }
617
+
618
+ /***************************************************************************
619
+ *
620
+ * Searcher
621
+ *
622
+ ***************************************************************************/
623
+
624
+ MatchVector *searcher_get_match_vector(Searcher *self,
625
+ Query *query,
626
+ const int doc_num,
627
+ Symbol field)
628
+ {
629
+ MatchVector *mv = matchv_new();
630
+ bool rewrite = query->get_matchv_i == q_get_matchv_i;
631
+ TermVector *tv = self->get_term_vector(self, doc_num, field);
632
+ if (rewrite) {
633
+ query = self->rewrite(self, query);
634
+ }
635
+ if (tv && tv->term_cnt > 0 && tv->terms[0].positions != NULL) {
636
+ mv = query->get_matchv_i(query, mv, tv);
637
+ tv_destroy(tv);
638
+ }
639
+ if (rewrite) {
640
+ q_deref(query);
641
+ }
642
+ return mv;
643
+ }
644
+
645
/*
 * A candidate excerpt used while highlighting a field.
 */
typedef struct Excerpt
{
    int start;          /* index of the first match (into MatchVector matches) */
    int end;            /* index of the last match, inclusive */
    int start_pos;      /* index of the first entry in the term vector offsets */
    int end_pos;        /* index of the last entry in the term vector offsets */
    int start_offset;   /* offset in the field data where the excerpt begins */
    int end_offset;     /* offset in the field data where the excerpt ends */
    double score;       /* sum of the scores of the covered matches */
} Excerpt;
655
+
656
+ /*
657
+ static int excerpt_cmp(const void *p1, const void *p2)
658
+ {
659
+ double score1 = (*((Excerpt **)p1))->score;
660
+ double score2 = (*((Excerpt **)p2))->score;
661
+ if (score1 > score2) return 1;
662
+ if (score1 < score2) return -1;
663
+ return 0;
664
+ }
665
+ */
666
+
667
+ static int excerpt_start_cmp(const void *p1, const void *p2)
668
+ {
669
+ return (*((Excerpt **)p1))->start - (*((Excerpt **)p2))->start;
670
+ }
671
+
672
+ static int excerpt_lt(Excerpt *e1, Excerpt *e2)
673
+ {
674
+ return e1->score > e2->score; /* want the highest score at top */
675
+ }
676
+
677
+ static Excerpt *excerpt_new(int start, int end, double score)
678
+ {
679
+ Excerpt *excerpt = ALLOC_AND_ZERO(Excerpt);
680
+ excerpt->start = start;
681
+ excerpt->end = end;
682
+ excerpt->score = score;
683
+ return excerpt;
684
+ }
685
+
686
+ static Excerpt *excerpt_recalc_score(Excerpt *e, MatchVector *mv)
687
+ {
688
+ int i;
689
+ double score = 0.0;
690
+ for (i = e->start; i <= e->end; i++) {
691
+ score += mv->matches[i].score;
692
+ }
693
+ e->score = score;
694
+ return e;
695
+ }
696
+
697
+ /* expand an excerpt to its largest possible size */
698
+ static Excerpt *excerpt_expand(Excerpt *e, const int len, TermVector *tv)
699
+ {
700
+ Offset *offsets = tv->offsets;
701
+ int offset_cnt = tv->offset_cnt;
702
+ bool did_expansion = true;
703
+ int i;
704
+ /* fill in skipped offsets */
705
+ for (i = 1; i < offset_cnt; i++) {
706
+ if (offsets[i].start == 0) {
707
+ offsets[i].start = offsets[i-1].start;
708
+ }
709
+ if (offsets[i].end == 0) {
710
+ offsets[i].end = offsets[i-1].end;
711
+ }
712
+ }
713
+
714
+ while (did_expansion) {
715
+ did_expansion = false;
716
+ if (e->start_pos > 0
717
+ && (e->end_offset - offsets[e->start_pos - 1].start) < len) {
718
+ e->start_pos--;
719
+ e->start_offset = offsets[e->start_pos].start;
720
+ did_expansion = true;
721
+ }
722
+ if (e->end_pos < (offset_cnt - 1)
723
+ && (offsets[e->end_pos + 1].end - e->start_offset) < len) {
724
+ e->end_pos++;
725
+ e->end_offset = offsets[e->end_pos].end;
726
+ did_expansion = true;
727
+ }
728
+ }
729
+ return e;
730
+ }
731
+
732
+ static char *excerpt_get_str(Excerpt *e, MatchVector *mv,
733
+ LazyDocField *lazy_df,
734
+ const char *pre_tag,
735
+ const char *post_tag,
736
+ const char *ellipsis)
737
+ {
738
+ int i, len;
739
+ int last_offset = e->start_offset;
740
+ const int num_matches = e->end - e->start + 1;
741
+ const int pre_tag_len = (int)strlen(pre_tag);
742
+ const int post_tag_len = (int)strlen(post_tag);
743
+ const int ellipsis_len = (int)strlen(ellipsis);
744
+ char *excerpt_str = ALLOC_N(char,
745
+ 10 + e->end_offset - e->start_offset
746
+ + (num_matches * (pre_tag_len + post_tag_len))
747
+ + (2 * ellipsis_len));
748
+ char *e_ptr = excerpt_str;
749
+ if (e->start_offset > 0) {
750
+ memcpy(e_ptr, ellipsis, ellipsis_len);
751
+ e_ptr += ellipsis_len;
752
+ }
753
+ for (i = e->start; i <= e->end; i++) {
754
+ MatchRange *mr = mv->matches + i;
755
+ len = mr->start_offset - last_offset;
756
+ if (len) {
757
+ lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
758
+ e_ptr += len;
759
+ }
760
+ memcpy(e_ptr, pre_tag, pre_tag_len);
761
+ e_ptr += pre_tag_len;
762
+ len = mr->end_offset - mr->start_offset;
763
+ if (len) {
764
+ lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
765
+ e_ptr += len;
766
+ }
767
+ memcpy(e_ptr, post_tag, post_tag_len);
768
+ e_ptr += post_tag_len;
769
+ last_offset = mr->end_offset;
770
+ }
771
+ if ((lazy_df->len - e->end_offset) <= ellipsis_len) {
772
+ /* no point using ellipsis if it takes up more space */
773
+ e->end_offset = lazy_df->len;
774
+ }
775
+ len = e->end_offset - last_offset;
776
+ if (len) {
777
+ lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
778
+ e_ptr += len;
779
+ }
780
+ if (e->end_offset < lazy_df->len) {
781
+ memcpy(e_ptr, ellipsis, ellipsis_len);
782
+ e_ptr += ellipsis_len;
783
+ }
784
+ *e_ptr = '\0';
785
+ return excerpt_str;
786
+ }
787
+
788
+ static char *highlight_field(MatchVector *mv,
789
+ LazyDocField *lazy_df,
790
+ TermVector *tv,
791
+ const char *pre_tag,
792
+ const char *post_tag)
793
+ {
794
+ const int pre_len = (int)strlen(pre_tag);
795
+ const int post_len = (int)strlen(post_tag);
796
+ char *excerpt_str =
797
+ ALLOC_N(char, 10 + lazy_df->len + (mv->size * (pre_len + post_len)));
798
+ if (mv->size > 0) {
799
+ int last_offset = 0;
800
+ int i, len;
801
+ char *e_ptr = excerpt_str;
802
+ matchv_compact_with_breaks(mv);
803
+ matchv_set_offsets(mv, tv->offsets);
804
+ for (i = 0; i < mv->size; i++) {
805
+ MatchRange *mr = mv->matches + i;
806
+ len = mr->start_offset - last_offset;
807
+ if (len) {
808
+ lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
809
+ e_ptr += len;
810
+ }
811
+ memcpy(e_ptr, pre_tag, pre_len);
812
+ e_ptr += pre_len;
813
+ len = mr->end_offset - mr->start_offset;
814
+ if (len) {
815
+ lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
816
+ e_ptr += len;
817
+ }
818
+ memcpy(e_ptr, post_tag, post_len);
819
+ e_ptr += post_len;
820
+ last_offset = mr->end_offset;
821
+ }
822
+ len = lazy_df->len - last_offset;
823
+ if (len) {
824
+ lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
825
+ e_ptr += len;
826
+ }
827
+ *e_ptr = '\0';
828
+ }
829
+ else {
830
+ lazy_df_get_bytes(lazy_df, excerpt_str, 0, lazy_df->len);
831
+ excerpt_str[lazy_df->len] = '\0';
832
+ }
833
+ return excerpt_str;
834
+ }
835
+
836
+ char **searcher_highlight(Searcher *self,
837
+ Query *query,
838
+ const int doc_num,
839
+ Symbol field,
840
+ const int excerpt_len,
841
+ const int num_excerpts,
842
+ const char *pre_tag,
843
+ const char *post_tag,
844
+ const char *ellipsis)
845
+ {
846
+ char **excerpt_strs = NULL;
847
+ TermVector *tv = self->get_term_vector(self, doc_num, field);
848
+ LazyDoc *lazy_doc = self->get_lazy_doc(self, doc_num);
849
+ LazyDocField *lazy_df = NULL;
850
+ if (lazy_doc) {
851
+ lazy_df = lazy_doc_get(lazy_doc, field);
852
+ }
853
+ if (tv && lazy_df && tv->term_cnt > 0 && tv->terms[0].positions != NULL
854
+ && tv->offsets != NULL) {
855
+ MatchVector *mv;
856
+ query = self->rewrite(self, query);
857
+ mv = query->get_matchv_i(query, matchv_new(), tv);
858
+ q_deref(query);
859
+ if (lazy_df->len < (excerpt_len * num_excerpts)) {
860
+ excerpt_strs = ary_new_type_capa(char *, 1);
861
+ ary_push(excerpt_strs,
862
+ highlight_field(mv, lazy_df, tv, pre_tag, post_tag));
863
+ }
864
+ else if (mv->size > 0) {
865
+ Excerpt **excerpts = ALLOC_AND_ZERO_N(Excerpt *, num_excerpts);
866
+ int e_start, e_end, i, j;
867
+ MatchRange *matches = mv->matches;
868
+ double running_score = 0.0;
869
+ Offset *offsets = tv->offsets;
870
+ PriorityQueue *excerpt_pq;
871
+
872
+ matchv_compact_with_breaks(mv);
873
+ matchv_set_offsets(mv, offsets);
874
+ excerpt_pq = pq_new(mv->size, (lt_ft)&excerpt_lt, &free);
875
+ /* add all possible excerpts to the priority queue */
876
+
877
+ for (e_start = e_end = 0; e_start < mv->size; e_start++) {
878
+ const int start_offset = matches[e_start].start_offset;
879
+ if (e_start > e_end) {
880
+ running_score = 0.0;
881
+ e_end = e_start;
882
+ }
883
+ while (e_end < mv->size && (matches[e_end].end_offset
884
+ <= start_offset + excerpt_len)) {
885
+ running_score += matches[e_end].score;
886
+ e_end++;
887
+ }
888
+ pq_push(excerpt_pq,
889
+ excerpt_new(e_start, e_end - 1, running_score));
890
+ /* - 0.1 so that earlier matches take priority */
891
+ running_score -= matches[e_start].score;
892
+ }
893
+
894
+ for (i = 0; i < num_excerpts && excerpt_pq->size > 0; i++) {
895
+ excerpts[i] = (Excerpt *)pq_pop(excerpt_pq);
896
+ if (i < num_excerpts - 1) {
897
+ /* set match ranges alread included to 0 */
898
+ Excerpt *e = excerpts[i];
899
+ for (j = e->start; j <= e->end; j++) {
900
+ matches[j].score = 0.0;
901
+ }
902
+ e = NULL;
903
+ while (e != (Excerpt *)pq_top(excerpt_pq)) {
904
+ e = (Excerpt *)pq_top(excerpt_pq);
905
+ excerpt_recalc_score(e, mv);
906
+ pq_down(excerpt_pq);
907
+ }
908
+ }
909
+ }
910
+
911
+ qsort(excerpts, i, sizeof(Excerpt *), &excerpt_start_cmp);
912
+ for (j = 0; j < i; j++) {
913
+ Excerpt *e = excerpts[j];
914
+ e->start_pos = matches[e->start].start;
915
+ e->end_pos = matches[e->end].end;
916
+ e->start_offset = offsets[e->start_pos].start;
917
+ e->end_offset = offsets[e->end_pos].end;
918
+ }
919
+
920
+ if (i < num_excerpts) {
921
+ const int diff = num_excerpts - i;
922
+ memmove(excerpts + (diff), excerpts,
923
+ i * sizeof(Excerpt *));
924
+ for (j = 0; j < diff; j++) {
925
+ /* these new excerpts will grow into one long excerpt at
926
+ * the start */
927
+ excerpts[j] = ALLOC_AND_ZERO(Excerpt);
928
+ excerpts[j]->end = -1;
929
+ }
930
+ }
931
+
932
+ excerpt_strs = ary_new_type_capa(char *, num_excerpts);
933
+ /* merge excerpts where possible */
934
+ for (i = 0; i < num_excerpts;) {
935
+ Excerpt *ei = excerpts[i];
936
+ int merged = 1; /* 1 means a single excerpt, ie no merges */
937
+ for (j = i + 1; j < num_excerpts; j++) {
938
+ Excerpt *ej = excerpts[j];
939
+ if ((ej->end_offset - ei->start_offset)
940
+ < (j - i + 1) * excerpt_len) {
941
+ ei->end = ej->end;
942
+ ei->end_pos = ej->end_pos;
943
+ ei->end_offset = ej->end_offset;
944
+ merged = j - i + 1;
945
+ }
946
+ }
947
+ excerpt_expand(ei, merged * excerpt_len, tv);
948
+ ary_push(excerpt_strs,
949
+ excerpt_get_str(ei, mv, lazy_df,
950
+ pre_tag, post_tag, ellipsis));
951
+ i += merged;
952
+ }
953
+ for (i = 0; i < num_excerpts; i++) {
954
+ free(excerpts[i]);
955
+ }
956
+ free(excerpts);
957
+ pq_destroy(excerpt_pq);
958
+ }
959
+ matchv_destroy(mv);
960
+ }
961
+ if (tv) tv_destroy(tv);
962
+ if (lazy_doc) lazy_doc_close(lazy_doc);
963
+ return excerpt_strs;
964
+ }
965
+
966
+ static Weight *sea_create_weight(Searcher *self, Query *query)
967
+ {
968
+ return q_weight(query, self);
969
+ }
970
+
971
+ static void sea_check_args(int num_docs, int first_doc)
972
+ {
973
+ if (num_docs <= 0) {
974
+ RAISE(ARG_ERROR, ":num_docs was set to %d but should be greater "
975
+ "than 0 : %d <= 0", num_docs, num_docs);
976
+ }
977
+
978
+ if (first_doc < 0) {
979
+ RAISE(ARG_ERROR, ":first_doc was set to %d but should be greater "
980
+ "than or equal to 0 : %d < 0", first_doc, first_doc);
981
+ }
982
+ }
983
+
984
+ static Similarity *sea_get_similarity(Searcher *self)
985
+ {
986
+ return self->similarity;
987
+ }
988
+
989
+ /***************************************************************************
990
+ *
991
+ * IndexSearcher
992
+ *
993
+ ***************************************************************************/
994
+
995
+ #define ISEA(searcher) ((IndexSearcher *)(searcher))
996
+
997
+ int isea_doc_freq(Searcher *self, Symbol field, const char *term)
998
+ {
999
+ return ir_doc_freq(ISEA(self)->ir, field, term);
1000
+ }
1001
+
1002
+ static Document *isea_get_doc(Searcher *self, int doc_num)
1003
+ {
1004
+ IndexReader *ir = ISEA(self)->ir;
1005
+ return ir->get_doc(ir, doc_num);
1006
+ }
1007
+
1008
+ static LazyDoc *isea_get_lazy_doc(Searcher *self, int doc_num)
1009
+ {
1010
+ IndexReader *ir = ISEA(self)->ir;
1011
+ return ir->get_lazy_doc(ir, doc_num);
1012
+ }
1013
+
1014
+ static int isea_max_doc(Searcher *self)
1015
+ {
1016
+ IndexReader *ir = ISEA(self)->ir;
1017
+ return ir->max_doc(ir);
1018
+ }
1019
+
1020
/*
 * True when the scorer's current document must be skipped: either the filter
 * bit vector excludes it, or the post filter returns a zero factor for it.
 *
 * NOTE: the expansion assigns to a variable named +filter_factor+, which
 * must be in scope at the point of use.  Every macro parameter is
 * parenthesised so that argument expressions keep their own precedence.
 */
#define IS_FILTERED(bits, post_filter, scorer, searcher) \
    (((bits) && !bv_get((bits), (scorer)->doc)) \
     || ((post_filter) \
         && !(filter_factor = \
              (post_filter)->filter_func((scorer)->doc, \
                                         (scorer)->score(scorer), \
                                         (searcher), \
                                         (post_filter)->arg))))
1026
+
1027
+ static TopDocs *isea_search_w(Searcher *self,
1028
+ Weight *weight,
1029
+ int first_doc,
1030
+ int num_docs,
1031
+ Filter *filter,
1032
+ Sort *sort,
1033
+ PostFilter *post_filter,
1034
+ bool load_fields)
1035
+ {
1036
+ int max_size = num_docs + (num_docs == INT_MAX ? 0 : first_doc);
1037
+ int i;
1038
+ Scorer *scorer;
1039
+ Hit **score_docs = NULL;
1040
+ Hit hit;
1041
+ int total_hits = 0;
1042
+ float score, max_score = 0.0;
1043
+ float filter_factor = 1.0;
1044
+ BitVector *bits = (filter
1045
+ ? filt_get_bv(filter, ISEA(self)->ir)
1046
+ : NULL);
1047
+ Hit *(*hq_pop)(PriorityQueue *pq);
1048
+ void (*hq_insert)(PriorityQueue *pq, Hit *hit);
1049
+ void (*hq_destroy)(PriorityQueue *self);
1050
+ PriorityQueue *hq;
1051
+
1052
+ sea_check_args(num_docs, first_doc);
1053
+
1054
+ scorer = weight->scorer(weight, ISEA(self)->ir);
1055
+ if (!scorer || 0 == ISEA(self)->ir->num_docs(ISEA(self)->ir)) {
1056
+ if (scorer) scorer->destroy(scorer);
1057
+ return td_new(0, 0, NULL, 0.0);
1058
+ }
1059
+
1060
+ if (sort) {
1061
+ hq = fshq_pq_new(max_size, sort, ISEA(self)->ir);
1062
+ hq_insert = &fshq_pq_insert;
1063
+ hq_destroy = &fshq_pq_destroy;
1064
+ if (load_fields) {
1065
+ hq_pop = &fshq_pq_pop_fd;
1066
+ }
1067
+ else {
1068
+ hq_pop = &fshq_pq_pop;
1069
+ }
1070
+ }
1071
+ else {
1072
+ hq = pq_new(max_size, (lt_ft)&hit_less_than, &free);
1073
+ hq_pop = &hit_pq_pop;
1074
+ hq_insert = &hit_pq_insert;
1075
+ hq_destroy = &pq_destroy;
1076
+ }
1077
+
1078
+ while (scorer->next(scorer)) {
1079
+ if (bits && !bv_get(bits, scorer->doc)) continue;
1080
+ score = scorer->score(scorer);
1081
+ if (post_filter &&
1082
+ !(filter_factor = post_filter->filter_func(scorer->doc,
1083
+ score,
1084
+ self,
1085
+ post_filter->arg))) {
1086
+ continue;
1087
+ }
1088
+ total_hits++;
1089
+ if (filter_factor < 1.0) score *= filter_factor;
1090
+ if (score > max_score) max_score = score;
1091
+ hit.doc = scorer->doc; hit.score = score;
1092
+ hq_insert(hq, &hit);
1093
+ }
1094
+ scorer->destroy(scorer);
1095
+
1096
+ if (hq->size > first_doc) {
1097
+ if ((hq->size - first_doc) < num_docs) {
1098
+ num_docs = hq->size - first_doc;
1099
+ }
1100
+ score_docs = ALLOC_N(Hit *, num_docs);
1101
+ for (i = num_docs - 1; i >= 0; i--) {
1102
+ score_docs[i] = hq_pop(hq);
1103
+ /*
1104
+ printf("score_docs[i][%d] = [%ld] => %d-->%f\n", i,
1105
+ score_docs[i], score_docs[i]->doc, score_docs[i]->score);
1106
+ */
1107
+ }
1108
+ }
1109
+ else {
1110
+ num_docs = 0;
1111
+ }
1112
+ pq_clear(hq);
1113
+ hq_destroy(hq);
1114
+
1115
+ return td_new(total_hits, num_docs, score_docs, max_score);
1116
+ }
1117
+
1118
+ static TopDocs *isea_search(Searcher *self,
1119
+ Query *query,
1120
+ int first_doc,
1121
+ int num_docs,
1122
+ Filter *filter,
1123
+ Sort *sort,
1124
+ PostFilter *post_filter,
1125
+ bool load_fields)
1126
+ {
1127
+ TopDocs *td;
1128
+ Weight *weight = q_weight(query, self);
1129
+ td = isea_search_w(self, weight, first_doc, num_docs, filter,
1130
+ sort, post_filter, load_fields);
1131
+ weight->destroy(weight);
1132
+ return td;
1133
+ }
1134
+
1135
+ static void isea_search_each_w(Searcher *self, Weight *weight, Filter *filter,
1136
+ PostFilter *post_filter,
1137
+ void (*fn)(Searcher *, int, float, void *),
1138
+ void *arg)
1139
+ {
1140
+ Scorer *scorer;
1141
+ float filter_factor = 1.0;
1142
+ BitVector *bits = (filter
1143
+ ? filt_get_bv(filter, ISEA(self)->ir)
1144
+ : NULL);
1145
+
1146
+ scorer = weight->scorer(weight, ISEA(self)->ir);
1147
+ if (!scorer) {
1148
+ return;
1149
+ }
1150
+
1151
+ while (scorer->next(scorer)) {
1152
+ if (bits && !bv_get(bits, scorer->doc)) continue;
1153
+ float score = scorer->score(scorer);
1154
+ if (post_filter &&
1155
+ !(filter_factor = post_filter->filter_func(scorer->doc,
1156
+ score,
1157
+ self,
1158
+ post_filter->arg))) {
1159
+ continue;
1160
+ }
1161
+ fn(self, scorer->doc, filter_factor * score, arg);
1162
+ }
1163
+ scorer->destroy(scorer);
1164
+ }
1165
+
1166
+ static void isea_search_each(Searcher *self, Query *query, Filter *filter,
1167
+ PostFilter *post_filter,
1168
+ void (*fn)(Searcher *, int, float, void *),
1169
+ void *arg)
1170
+ {
1171
+ Weight *weight = q_weight(query, self);
1172
+ isea_search_each_w(self, weight, filter, post_filter, fn, arg);
1173
+ weight->destroy(weight);
1174
+ }
1175
+
1176
+ /*
1177
+ * Scan the index for all documents that match a query and write the results
1178
+ * to a buffer. It will stop scanning once the limit is reached and it starts
1179
+ * scanning from offset_docnum.
1180
+ *
1181
+ * Note: Unlike the offset_docnum in other search methods, this offset_docnum
1182
+ * refers to document number and not hit.
1183
+ */
1184
+ static int isea_search_unscored_w(Searcher *self,
1185
+ Weight *weight,
1186
+ int *buf,
1187
+ int limit,
1188
+ int offset_docnum)
1189
+ {
1190
+ int count = 0;
1191
+ Scorer *scorer = weight->scorer(weight, ISEA(self)->ir);
1192
+ if (scorer) {
1193
+ if (scorer->skip_to(scorer, offset_docnum)) {
1194
+ do {
1195
+ buf[count++] = scorer->doc;
1196
+ } while (count < limit && scorer->next(scorer));
1197
+ }
1198
+ scorer->destroy(scorer);
1199
+ }
1200
+ return count;
1201
+ }
1202
+
1203
+ static int isea_search_unscored(Searcher *self,
1204
+ Query *query,
1205
+ int *buf,
1206
+ int limit,
1207
+ int offset_docnum)
1208
+ {
1209
+ int count;
1210
+ Weight *weight = q_weight(query, self);
1211
+ count = isea_search_unscored_w(self, weight, buf, limit, offset_docnum);
1212
+ weight->destroy(weight);
1213
+ return count;
1214
+ }
1215
+
1216
+ static Query *isea_rewrite(Searcher *self, Query *original)
1217
+ {
1218
+ int q_is_destroyed = false;
1219
+ Query *query = original;
1220
+ Query *rewritten_query = query->rewrite(query, ISEA(self)->ir);
1221
+ while (q_is_destroyed || (query != rewritten_query)) {
1222
+ query = rewritten_query;
1223
+ rewritten_query = query->rewrite(query, ISEA(self)->ir);
1224
+ q_is_destroyed = (query->ref_cnt <= 1);
1225
+ q_deref(query); /* destroy intermediate queries */
1226
+ }
1227
+ return query;
1228
+ }
1229
+
1230
+ static Explanation *isea_explain(Searcher *self,
1231
+ Query *query,
1232
+ int doc_num)
1233
+ {
1234
+ Weight *weight = q_weight(query, self);
1235
+ Explanation *e = weight->explain(weight, ISEA(self)->ir, doc_num);
1236
+ weight->destroy(weight);
1237
+ return e;
1238
+ }
1239
+
1240
+ static Explanation *isea_explain_w(Searcher *self, Weight *w, int doc_num)
1241
+ {
1242
+ return w->explain(w, ISEA(self)->ir, doc_num);
1243
+ }
1244
+
1245
+ static TermVector *isea_get_term_vector(Searcher *self,
1246
+ const int doc_num,
1247
+ Symbol field)
1248
+ {
1249
+ IndexReader *ir = ISEA(self)->ir;
1250
+ return ir->term_vector(ir, doc_num, field);
1251
+ }
1252
+
1253
+ static void isea_close(Searcher *self)
1254
+ {
1255
+ if (ISEA(self)->ir && ISEA(self)->close_ir) {
1256
+ ir_close(ISEA(self)->ir);
1257
+ }
1258
+ free(self);
1259
+ }
1260
+
1261
+ Searcher *isea_new(IndexReader *ir)
1262
+ {
1263
+ Searcher *self = (Searcher *)ALLOC(IndexSearcher);
1264
+
1265
+ ISEA(self)->ir = ir;
1266
+ ISEA(self)->close_ir = true;
1267
+
1268
+ self->similarity = sim_create_default();
1269
+ self->doc_freq = &isea_doc_freq;
1270
+ self->get_doc = &isea_get_doc;
1271
+ self->get_lazy_doc = &isea_get_lazy_doc;
1272
+ self->max_doc = &isea_max_doc;
1273
+ self->create_weight = &sea_create_weight;
1274
+ self->search = &isea_search;
1275
+ self->search_w = &isea_search_w;
1276
+ self->search_each = &isea_search_each;
1277
+ self->search_each_w = &isea_search_each_w;
1278
+ self->search_unscored = &isea_search_unscored;
1279
+ self->search_unscored_w = &isea_search_unscored_w;
1280
+ self->rewrite = &isea_rewrite;
1281
+ self->explain = &isea_explain;
1282
+ self->explain_w = &isea_explain_w;
1283
+ self->get_term_vector = &isea_get_term_vector;
1284
+ self->get_similarity = &sea_get_similarity;
1285
+ self->close = &isea_close;
1286
+
1287
+ return self;
1288
+ }
1289
+
1290
+ /***************************************************************************
1291
+ *
1292
+ * CachedDFSearcher
1293
+ *
1294
+ ***************************************************************************/
1295
+
1296
+ #define CDFSEA(searcher) ((CachedDFSearcher *)(searcher))
1297
+ typedef struct CachedDFSearcher
1298
+ {
1299
+ Searcher super;
1300
+ Hash *df_map;
1301
+ int max_doc;
1302
+ } CachedDFSearcher;
1303
+
1304
+ static int cdfsea_doc_freq(Searcher *self, Symbol field, const char *text)
1305
+ {
1306
+ Term term;
1307
+ int *df;
1308
+ term.field = field;
1309
+ term.text = (char *)text;
1310
+ df = (int *)h_get(CDFSEA(self)->df_map, &term);
1311
+ return df ? *df : 0;
1312
+ }
1313
+
1314
+ static Document *cdfsea_get_doc(Searcher *self, int doc_num)
1315
+ {
1316
+ (void)self; (void)doc_num;
1317
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1318
+ return NULL;
1319
+ }
1320
+
1321
+ static int cdfsea_max_doc(Searcher *self)
1322
+ {
1323
+ (void)self;
1324
+ return CDFSEA(self)->max_doc;
1325
+ }
1326
+
1327
+ static Weight *cdfsea_create_weight(Searcher *self, Query *query)
1328
+ {
1329
+ (void)self; (void)query;
1330
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1331
+ return NULL;
1332
+ }
1333
+
1334
+ static TopDocs *cdfsea_search_w(Searcher *self, Weight *w, int fd, int nd,
1335
+ Filter *f, Sort *s, PostFilter *pf, bool load)
1336
+ {
1337
+ (void)self; (void)w; (void)fd; (void)nd;
1338
+ (void)f; (void)s; (void)pf; (void)load;
1339
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1340
+ return NULL;
1341
+ }
1342
+
1343
+ static TopDocs *cdfsea_search(Searcher *self, Query *q, int fd, int nd,
1344
+ Filter *f, Sort *s, PostFilter *pf, bool load)
1345
+ {
1346
+ (void)self; (void)q; (void)fd; (void)nd;
1347
+ (void)f; (void)s; (void)pf; (void)load;
1348
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1349
+ return NULL;
1350
+ }
1351
+
1352
+ static void cdfsea_search_each(Searcher *self, Query *query, Filter *filter,
1353
+ PostFilter *pf,
1354
+ void (*fn)(Searcher *, int, float, void *),
1355
+ void *arg)
1356
+ {
1357
+ (void)self; (void)query; (void)filter; (void)pf; (void)fn; (void)arg;
1358
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1359
+ }
1360
+
1361
+ static void cdfsea_search_each_w(Searcher *self, Weight *w, Filter *filter,
1362
+ PostFilter *pf,
1363
+ void (*fn)(Searcher *, int, float, void *),
1364
+ void *arg)
1365
+ {
1366
+ (void)self; (void)w; (void)filter; (void)pf; (void)fn; (void)arg;
1367
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1368
+ }
1369
+
1370
+ static Query *cdfsea_rewrite(Searcher *self, Query *original)
1371
+ {
1372
+ (void)self;
1373
+ original->ref_cnt++;
1374
+ return original;
1375
+ }
1376
+
1377
+ static Explanation *cdfsea_explain(Searcher *self, Query *query, int doc_num)
1378
+ {
1379
+ (void)self; (void)query; (void)doc_num;
1380
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1381
+ return NULL;
1382
+ }
1383
+
1384
+ static Explanation *cdfsea_explain_w(Searcher *self, Weight *w, int doc_num)
1385
+ {
1386
+ (void)self; (void)w; (void)doc_num;
1387
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1388
+ return NULL;
1389
+ }
1390
+
1391
+ static TermVector *cdfsea_get_term_vector(Searcher *self, const int doc_num,
1392
+ Symbol field)
1393
+ {
1394
+ (void)self; (void)doc_num; (void)field;
1395
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1396
+ return NULL;
1397
+ }
1398
+
1399
+ static Similarity *cdfsea_get_similarity(Searcher *self)
1400
+ {
1401
+ return self->similarity;
1402
+ }
1403
+
1404
+ static void cdfsea_close(Searcher *self)
1405
+ {
1406
+ h_destroy(CDFSEA(self)->df_map);
1407
+ free(self);
1408
+ }
1409
+
1410
+ static Searcher *cdfsea_new(Hash *df_map, int max_doc)
1411
+ {
1412
+ Searcher *self = (Searcher *)ALLOC(CachedDFSearcher);
1413
+
1414
+ CDFSEA(self)->df_map = df_map;
1415
+ CDFSEA(self)->max_doc = max_doc;
1416
+
1417
+ self->similarity = sim_create_default();
1418
+ self->doc_freq = &cdfsea_doc_freq;
1419
+ self->get_doc = &cdfsea_get_doc;
1420
+ self->max_doc = &cdfsea_max_doc;
1421
+ self->create_weight = &cdfsea_create_weight;
1422
+ self->search = &cdfsea_search;
1423
+ self->search_w = &cdfsea_search_w;
1424
+ self->search_each = &cdfsea_search_each;
1425
+ self->search_each_w = &cdfsea_search_each_w;
1426
+ self->rewrite = &cdfsea_rewrite;
1427
+ self->explain = &cdfsea_explain;
1428
+ self->explain_w = &cdfsea_explain_w;
1429
+ self->get_term_vector = &cdfsea_get_term_vector;
1430
+ self->get_similarity = &cdfsea_get_similarity;
1431
+ self->close = &cdfsea_close;
1432
+ return self;
1433
+ }
1434
+
1435
+ /***************************************************************************
1436
+ *
1437
+ * MultiSearcher
1438
+ *
1439
+ ***************************************************************************/
1440
+
1441
+ #define MSEA(searcher) ((MultiSearcher *)(searcher))
1442
+ static INLINE int msea_get_searcher_index(Searcher *self, int n)
1443
+ {
1444
+ MultiSearcher *msea = MSEA(self);
1445
+ int lo = 0; /* search starts array */
1446
+ int hi = msea->s_cnt - 1; /* for 1st element < n, return its index */
1447
+ int mid, mid_val;
1448
+
1449
+ while (hi >= lo) {
1450
+ mid = (lo + hi) >> 1;
1451
+ mid_val = msea->starts[mid];
1452
+ if (n < mid_val) {
1453
+ hi = mid - 1;
1454
+ }
1455
+ else if (n > mid_val) {
1456
+ lo = mid + 1;
1457
+ }
1458
+ else { /* found a match */
1459
+ while (((mid+1) < msea->s_cnt)
1460
+ && (msea->starts[mid+1] == mid_val)) {
1461
+ mid++; /* scan to last match */
1462
+ }
1463
+ return mid;
1464
+ }
1465
+ }
1466
+ return hi;
1467
+ }
1468
+
1469
+ static int msea_doc_freq(Searcher *self, Symbol field, const char *term)
1470
+ {
1471
+ int i;
1472
+ int doc_freq = 0;
1473
+ MultiSearcher *msea = MSEA(self);
1474
+ for (i = 0; i < msea->s_cnt; i++) {
1475
+ Searcher *s = msea->searchers[i];
1476
+ doc_freq += s->doc_freq(s, field, term);
1477
+ }
1478
+
1479
+ return doc_freq;
1480
+ }
1481
+
1482
+ static Document *msea_get_doc(Searcher *self, int doc_num)
1483
+ {
1484
+ MultiSearcher *msea = MSEA(self);
1485
+ int i = msea_get_searcher_index(self, doc_num);
1486
+ Searcher *s = msea->searchers[i];
1487
+ return s->get_doc(s, doc_num - msea->starts[i]);
1488
+ }
1489
+
1490
+ static LazyDoc *msea_get_lazy_doc(Searcher *self, int doc_num)
1491
+ {
1492
+ MultiSearcher *msea = MSEA(self);
1493
+ int i = msea_get_searcher_index(self, doc_num);
1494
+ Searcher *s = msea->searchers[i];
1495
+ return s->get_lazy_doc(s, doc_num - msea->starts[i]);
1496
+ }
1497
+
1498
+ static int msea_max_doc(Searcher *self)
1499
+ {
1500
+ return MSEA(self)->max_doc;
1501
+ }
1502
+
1503
+ static int *msea_get_doc_freqs(Searcher *self, HashSet *terms)
1504
+ {
1505
+ int i;
1506
+ HashSetEntry *hse;
1507
+ int *doc_freqs = ALLOC_N(int, terms->size);
1508
+ for (i = 0, hse = terms->first; hse; ++i, hse = hse->next) {
1509
+ Term *t = (Term *)hse->elem;
1510
+ doc_freqs[i] = msea_doc_freq(self, t->field, t->text);
1511
+ }
1512
+ return doc_freqs;
1513
+ }
1514
+
1515
+ static Weight *msea_create_weight(Searcher *self, Query *query)
1516
+ {
1517
+ int i, *doc_freqs;
1518
+ Searcher *cdfsea;
1519
+ Weight *w;
1520
+ Hash *df_map = h_new((hash_ft)&term_hash,
1521
+ (eq_ft)&term_eq,
1522
+ (free_ft)term_destroy,
1523
+ free);
1524
+ Query *rewritten_query = self->rewrite(self, query);
1525
+ /* terms get copied directly to df_map so no need to free here */
1526
+ HashSet *terms = hs_new((hash_ft)&term_hash,
1527
+ (eq_ft)&term_eq,
1528
+ (free_ft)NULL);
1529
+ HashSetEntry *hse;
1530
+
1531
+ rewritten_query->extract_terms(rewritten_query, terms);
1532
+ doc_freqs = msea_get_doc_freqs(self, terms);
1533
+
1534
+ for (hse = terms->first, i = 0; hse; ++i, hse = hse->next) {
1535
+ h_set(df_map, hse->elem, imalloc(doc_freqs[i]));
1536
+ }
1537
+ hs_destroy(terms);
1538
+ free(doc_freqs);
1539
+
1540
+ cdfsea = cdfsea_new(df_map, MSEA(self)->max_doc);
1541
+
1542
+ w = q_weight(rewritten_query, cdfsea);
1543
+ q_deref(rewritten_query);
1544
+ cdfsea->close(cdfsea);
1545
+
1546
+ return w;
1547
+ }
1548
+
1549
+ struct MultiSearchEachArg {
1550
+ int start;
1551
+ void *arg;
1552
+ void (*fn)(Searcher *, int, float, void *);
1553
+ };
1554
+
1555
+ static void msea_search_each_i(Searcher *self, int doc_num, float score, void *arg)
1556
+ {
1557
+ struct MultiSearchEachArg *mse_arg = (struct MultiSearchEachArg *)arg;
1558
+
1559
+ mse_arg->fn(self, doc_num + mse_arg->start, score, mse_arg->arg);
1560
+ }
1561
+
1562
+ static void msea_search_each_w(Searcher *self, Weight *w, Filter *filter,
1563
+ PostFilter *post_filter,
1564
+ void (*fn)(Searcher *, int, float, void *),
1565
+ void *arg)
1566
+ {
1567
+ int i;
1568
+ struct MultiSearchEachArg mse_arg;
1569
+ MultiSearcher *msea = MSEA(self);
1570
+ Searcher *s;
1571
+
1572
+ mse_arg.fn = fn;
1573
+ mse_arg.arg = arg;
1574
+ for (i = 0; i < msea->s_cnt; i++) {
1575
+ s = msea->searchers[i];
1576
+ mse_arg.start = msea->starts[i];
1577
+ s->search_each_w(s, w, filter, post_filter,
1578
+ &msea_search_each_i, &mse_arg);
1579
+ }
1580
+ }
1581
+
1582
+ static void msea_search_each(Searcher *self, Query *query, Filter *filter,
1583
+ PostFilter *post_filter,
1584
+ void (*fn)(Searcher *, int, float, void *),
1585
+ void *arg)
1586
+ {
1587
+ Weight *weight = q_weight(query, self);
1588
+ msea_search_each_w(self, weight, filter, post_filter, fn, arg);
1589
+ weight->destroy(weight);
1590
+ }
1591
+
1592
+ static int msea_search_unscored_w(Searcher *self,
1593
+ Weight *w,
1594
+ int *buf,
1595
+ int limit,
1596
+ int offset_docnum)
1597
+ {
1598
+ int i, count = 0;
1599
+ MultiSearcher *msea = MSEA(self);
1600
+
1601
+ for (i = 0; count < limit && i < msea->s_cnt; i++) {
1602
+ /* if offset_docnum falls in this or previous indexes */
1603
+ if (offset_docnum < msea->starts[i+1]) {
1604
+ Searcher *searcher = msea->searchers[i];
1605
+ const int index_offset = msea->starts[i];
1606
+ int current_limit = limit - count;
1607
+ /* if offset_docnum occurs in the current index then adjust,
1608
+ * otherwise set it to zero as it occured in a previous index */
1609
+ int current_offset_docnum = offset_docnum > index_offset
1610
+ ? offset_docnum - index_offset
1611
+ : 0;
1612
+
1613
+ /* record current count as we'll need to update docnums by the
1614
+ * index's offset */
1615
+ int j = count;
1616
+ count += searcher->search_unscored_w(searcher, w, buf + count,
1617
+ current_limit,
1618
+ current_offset_docnum);
1619
+ /* update doc nums with the current index's offsets */
1620
+ for (; j < count; j++) {
1621
+ buf[j] += index_offset;
1622
+ }
1623
+ }
1624
+ }
1625
+ return count;
1626
+ }
1627
+
1628
+ static int msea_search_unscored(Searcher *self,
1629
+ Query *query,
1630
+ int *buf,
1631
+ int limit,
1632
+ int offset_docnum)
1633
+ {
1634
+ int count;
1635
+ Weight *weight = q_weight(query, self);
1636
+ count = msea_search_unscored_w(self, weight, buf, limit, offset_docnum);
1637
+ weight->destroy(weight);
1638
+ return count;
1639
+ }
1640
+
1641
+ struct MultiSearchArg {
1642
+ int total_hits, max_size;
1643
+ PriorityQueue *hq;
1644
+ void (*hq_insert)(PriorityQueue *pq, Hit *hit);
1645
+ };
1646
+
1647
+ /*
1648
+ * FIXME Not used anywhere. Is it needed?
1649
+ static void msea_search_i(Searcher *self, int doc_num, float score, void *arg)
1650
+ {
1651
+ struct MultiSearchArg *ms_arg = (struct MultiSearchArg *)arg;
1652
+ Hit hit;
1653
+ (void)self;
1654
+
1655
+ ms_arg->total_hits++;
1656
+ hit.doc = doc_num;
1657
+ hit.score = score;
1658
+ ms_arg->hq_insert(ms_arg->hq, &hit);
1659
+ }
1660
+ */
1661
+
1662
+ static TopDocs *msea_search_w(Searcher *self,
1663
+ Weight *weight,
1664
+ int first_doc,
1665
+ int num_docs,
1666
+ Filter *filter,
1667
+ Sort *sort,
1668
+ PostFilter *post_filter,
1669
+ bool load_fields)
1670
+ {
1671
+ int max_size = num_docs + (num_docs == INT_MAX ? 0 : first_doc);
1672
+ int i;
1673
+ int total_hits = 0;
1674
+ Hit **score_docs = NULL;
1675
+ Hit *(*hq_pop)(PriorityQueue *pq);
1676
+ void (*hq_insert)(PriorityQueue *pq, Hit *hit);
1677
+ PriorityQueue *hq;
1678
+ float max_score = 0.0;
1679
+ (void)load_fields; /* does it automatically */
1680
+
1681
+ sea_check_args(num_docs, first_doc);
1682
+
1683
+ if (sort) {
1684
+ hq = pq_new(max_size, (lt_ft)fdshq_lt, &free);
1685
+ hq_insert = (void (*)(PriorityQueue *pq, Hit *hit))&pq_insert;
1686
+ hq_pop = (Hit *(*)(PriorityQueue *pq))&pq_pop;
1687
+ }
1688
+ else {
1689
+ hq = pq_new(max_size, (lt_ft)&hit_less_than, &free);
1690
+ hq_insert = &hit_pq_multi_insert;
1691
+ hq_pop = &hit_pq_pop;
1692
+ }
1693
+
1694
+ /*if (sort) printf("sort = %s\n", sort_to_s(sort)); */
1695
+ for (i = 0; i < MSEA(self)->s_cnt; i++) {
1696
+ Searcher *s = MSEA(self)->searchers[i];
1697
+ TopDocs *td = s->search_w(s, weight, 0, max_size,
1698
+ filter, sort, post_filter, true);
1699
+ /*if (sort) printf("sort = %s\n", sort_to_s(sort)); */
1700
+ if (td->size > 0) {
1701
+ /*printf("td->size = %d %d\n", td->size, num_docs); */
1702
+ int j;
1703
+ int start = MSEA(self)->starts[i];
1704
+ for (j = 0; j < td->size; j++) {
1705
+ Hit *hit = td->hits[j];
1706
+ hit->doc += start;
1707
+ /*
1708
+ printf("adding hit = %d:%f\n", hit->doc, hit->score);
1709
+ */
1710
+ hq_insert(hq, hit);
1711
+ }
1712
+ td->size = 0;
1713
+ if (td->max_score > max_score) max_score = td->max_score;
1714
+ }
1715
+ total_hits += td->total_hits;
1716
+ td_destroy(td);
1717
+ }
1718
+
1719
+ if (hq->size > first_doc) {
1720
+ if ((hq->size - first_doc) < num_docs) {
1721
+ num_docs = hq->size - first_doc;
1722
+ }
1723
+ score_docs = ALLOC_N(Hit *, num_docs);
1724
+ for (i = num_docs - 1; i >= 0; i--) {
1725
+ score_docs[i] = hq_pop(hq);
1726
+ /*
1727
+ Hit *hit = score_docs[i] = hq_pop(hq);
1728
+ printf("popped hit = %d-->%f\n", hit->doc, hit->score);
1729
+ */
1730
+ }
1731
+ }
1732
+ else {
1733
+ num_docs = 0;
1734
+ }
1735
+ pq_clear(hq);
1736
+ pq_destroy(hq);
1737
+
1738
+ return td_new(total_hits, num_docs, score_docs, max_score);
1739
+ }
1740
+
1741
+ static TopDocs *msea_search(Searcher *self,
1742
+ Query *query,
1743
+ int first_doc,
1744
+ int num_docs,
1745
+ Filter *filter,
1746
+ Sort *sort,
1747
+ PostFilter *post_filter,
1748
+ bool load_fields)
1749
+ {
1750
+ TopDocs *td;
1751
+ Weight *weight = q_weight(query, self);
1752
+ td = msea_search_w(self, weight, first_doc, num_docs, filter,
1753
+ sort, post_filter, load_fields);
1754
+ weight->destroy(weight);
1755
+ return td;
1756
+ }
1757
+
1758
+ static Query *msea_rewrite(Searcher *self, Query *original)
1759
+ {
1760
+ int i;
1761
+ Searcher *s;
1762
+ MultiSearcher *msea = MSEA(self);
1763
+ Query **queries = ALLOC_N(Query *, msea->s_cnt), *rewritten;
1764
+
1765
+ for (i = 0; i < msea->s_cnt; i++) {
1766
+ s = msea->searchers[i];
1767
+ queries[i] = s->rewrite(s, original);
1768
+ }
1769
+ rewritten = q_combine(queries, msea->s_cnt);
1770
+
1771
+ for (i = 0; i < msea->s_cnt; i++) {
1772
+ q_deref(queries[i]);
1773
+ }
1774
+ free(queries);
1775
+ return rewritten;
1776
+ }
1777
+
1778
+ static Explanation *msea_explain(Searcher *self, Query *query, int doc_num)
1779
+ {
1780
+ MultiSearcher *msea = MSEA(self);
1781
+ int i = msea_get_searcher_index(self, doc_num);
1782
+ Weight *w = q_weight(query, self);
1783
+ Searcher *s = msea->searchers[i];
1784
+ Explanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
1785
+ w->destroy(w);
1786
+ return e;
1787
+ }
1788
+
1789
+ static Explanation *msea_explain_w(Searcher *self, Weight *w, int doc_num)
1790
+ {
1791
+ MultiSearcher *msea = MSEA(self);
1792
+ int i = msea_get_searcher_index(self, doc_num);
1793
+ Searcher *s = msea->searchers[i];
1794
+ Explanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
1795
+ return e;
1796
+ }
1797
+
1798
+ static TermVector *msea_get_term_vector(Searcher *self, const int doc_num,
1799
+ Symbol field)
1800
+ {
1801
+ MultiSearcher *msea = MSEA(self);
1802
+ int i = msea_get_searcher_index(self, doc_num);
1803
+ Searcher *s = msea->searchers[i];
1804
+ return s->get_term_vector(s, doc_num - msea->starts[i], field);
1805
+ }
1806
+
1807
+ static Similarity *msea_get_similarity(Searcher *self)
1808
+ {
1809
+ return self->similarity;
1810
+ }
1811
+
1812
+ static void msea_close(Searcher *self)
1813
+ {
1814
+ int i;
1815
+ Searcher *s;
1816
+ MultiSearcher *msea = MSEA(self);
1817
+ if (msea->close_subs) {
1818
+ for (i = 0; i < msea->s_cnt; i++) {
1819
+ s = msea->searchers[i];
1820
+ s->close(s);
1821
+ }
1822
+ }
1823
+ free(msea->searchers);
1824
+ free(msea->starts);
1825
+ free(self);
1826
+ }
1827
+
1828
+ Searcher *msea_new(Searcher **searchers, int s_cnt, bool close_subs)
1829
+ {
1830
+ int i, max_doc = 0;
1831
+ Searcher *self = (Searcher *)ALLOC(MultiSearcher);
1832
+ int *starts = ALLOC_N(int, s_cnt + 1);
1833
+ for (i = 0; i < s_cnt; i++) {
1834
+ starts[i] = max_doc;
1835
+ max_doc += searchers[i]->max_doc(searchers[i]);
1836
+ }
1837
+ starts[i] = max_doc;
1838
+
1839
+ MSEA(self)->s_cnt = s_cnt;
1840
+ MSEA(self)->searchers = searchers;
1841
+ MSEA(self)->starts = starts;
1842
+ MSEA(self)->max_doc = max_doc;
1843
+ MSEA(self)->close_subs = close_subs;
1844
+
1845
+ self->similarity = sim_create_default();
1846
+ self->doc_freq = &msea_doc_freq;
1847
+ self->get_doc = &msea_get_doc;
1848
+ self->get_lazy_doc = &msea_get_lazy_doc;
1849
+ self->max_doc = &msea_max_doc;
1850
+ self->create_weight = &msea_create_weight;
1851
+ self->search = &msea_search;
1852
+ self->search_w = &msea_search_w;
1853
+ self->search_each = &msea_search_each;
1854
+ self->search_each_w = &msea_search_each_w;
1855
+ self->search_unscored = &msea_search_unscored;
1856
+ self->search_unscored_w = &msea_search_unscored_w;
1857
+ self->rewrite = &msea_rewrite;
1858
+ self->explain = &msea_explain;
1859
+ self->explain_w = &msea_explain_w;
1860
+ self->get_term_vector = &msea_get_term_vector;
1861
+ self->get_similarity = &msea_get_similarity;
1862
+ self->close = &msea_close;
1863
+ return self;
1864
+ }