sdsykes-ferret 0.11.6.19
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +24 -0
- data/MIT-LICENSE +20 -0
- data/README +102 -0
- data/Rakefile +338 -0
- data/TODO +17 -0
- data/TUTORIAL +231 -0
- data/bin/ferret-browser +79 -0
- data/ext/analysis.c +1555 -0
- data/ext/analysis.h +219 -0
- data/ext/api.c +69 -0
- data/ext/api.h +27 -0
- data/ext/array.c +123 -0
- data/ext/array.h +53 -0
- data/ext/bitvector.c +540 -0
- data/ext/bitvector.h +272 -0
- data/ext/compound_io.c +383 -0
- data/ext/config.h +42 -0
- data/ext/document.c +156 -0
- data/ext/document.h +53 -0
- data/ext/except.c +120 -0
- data/ext/except.h +168 -0
- data/ext/extconf.rb +14 -0
- data/ext/ferret.c +402 -0
- data/ext/ferret.h +91 -0
- data/ext/filter.c +156 -0
- data/ext/fs_store.c +483 -0
- data/ext/global.c +418 -0
- data/ext/global.h +117 -0
- data/ext/hash.c +567 -0
- data/ext/hash.h +473 -0
- data/ext/hashset.c +170 -0
- data/ext/hashset.h +187 -0
- data/ext/header.h +58 -0
- data/ext/helper.c +62 -0
- data/ext/helper.h +13 -0
- data/ext/inc/lang.h +48 -0
- data/ext/inc/threading.h +31 -0
- data/ext/index.c +6425 -0
- data/ext/index.h +961 -0
- data/ext/lang.h +66 -0
- data/ext/libstemmer.c +92 -0
- data/ext/libstemmer.h +79 -0
- data/ext/mempool.c +87 -0
- data/ext/mempool.h +35 -0
- data/ext/modules.h +162 -0
- data/ext/multimapper.c +310 -0
- data/ext/multimapper.h +51 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +151 -0
- data/ext/priorityqueue.h +143 -0
- data/ext/q_boolean.c +1608 -0
- data/ext/q_const_score.c +161 -0
- data/ext/q_filtered_query.c +209 -0
- data/ext/q_fuzzy.c +268 -0
- data/ext/q_match_all.c +148 -0
- data/ext/q_multi_term.c +677 -0
- data/ext/q_parser.c +2825 -0
- data/ext/q_phrase.c +1126 -0
- data/ext/q_prefix.c +100 -0
- data/ext/q_range.c +350 -0
- data/ext/q_span.c +2402 -0
- data/ext/q_term.c +337 -0
- data/ext/q_wildcard.c +171 -0
- data/ext/r_analysis.c +2575 -0
- data/ext/r_index.c +3472 -0
- data/ext/r_qparser.c +585 -0
- data/ext/r_search.c +4105 -0
- data/ext/r_store.c +513 -0
- data/ext/r_utils.c +963 -0
- data/ext/ram_store.c +471 -0
- data/ext/search.c +1741 -0
- data/ext/search.h +885 -0
- data/ext/similarity.c +150 -0
- data/ext/similarity.h +82 -0
- data/ext/sort.c +983 -0
- data/ext/stem_ISO_8859_1_danish.c +338 -0
- data/ext/stem_ISO_8859_1_danish.h +16 -0
- data/ext/stem_ISO_8859_1_dutch.c +635 -0
- data/ext/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/stem_ISO_8859_1_english.c +1156 -0
- data/ext/stem_ISO_8859_1_english.h +16 -0
- data/ext/stem_ISO_8859_1_finnish.c +792 -0
- data/ext/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/stem_ISO_8859_1_french.c +1276 -0
- data/ext/stem_ISO_8859_1_french.h +16 -0
- data/ext/stem_ISO_8859_1_german.c +512 -0
- data/ext/stem_ISO_8859_1_german.h +16 -0
- data/ext/stem_ISO_8859_1_italian.c +1091 -0
- data/ext/stem_ISO_8859_1_italian.h +16 -0
- data/ext/stem_ISO_8859_1_norwegian.c +296 -0
- data/ext/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/stem_ISO_8859_1_porter.c +776 -0
- data/ext/stem_ISO_8859_1_porter.h +16 -0
- data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
- data/ext/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/stem_ISO_8859_1_spanish.c +1119 -0
- data/ext/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/stem_KOI8_R_russian.c +701 -0
- data/ext/stem_KOI8_R_russian.h +16 -0
- data/ext/stem_UTF_8_danish.c +344 -0
- data/ext/stem_UTF_8_danish.h +16 -0
- data/ext/stem_UTF_8_dutch.c +653 -0
- data/ext/stem_UTF_8_dutch.h +16 -0
- data/ext/stem_UTF_8_english.c +1176 -0
- data/ext/stem_UTF_8_english.h +16 -0
- data/ext/stem_UTF_8_finnish.c +808 -0
- data/ext/stem_UTF_8_finnish.h +16 -0
- data/ext/stem_UTF_8_french.c +1296 -0
- data/ext/stem_UTF_8_french.h +16 -0
- data/ext/stem_UTF_8_german.c +526 -0
- data/ext/stem_UTF_8_german.h +16 -0
- data/ext/stem_UTF_8_italian.c +1113 -0
- data/ext/stem_UTF_8_italian.h +16 -0
- data/ext/stem_UTF_8_norwegian.c +302 -0
- data/ext/stem_UTF_8_norwegian.h +16 -0
- data/ext/stem_UTF_8_porter.c +794 -0
- data/ext/stem_UTF_8_porter.h +16 -0
- data/ext/stem_UTF_8_portuguese.c +1055 -0
- data/ext/stem_UTF_8_portuguese.h +16 -0
- data/ext/stem_UTF_8_russian.c +709 -0
- data/ext/stem_UTF_8_russian.h +16 -0
- data/ext/stem_UTF_8_spanish.c +1137 -0
- data/ext/stem_UTF_8_spanish.h +16 -0
- data/ext/stem_UTF_8_swedish.c +313 -0
- data/ext/stem_UTF_8_swedish.h +16 -0
- data/ext/stopwords.c +401 -0
- data/ext/store.c +692 -0
- data/ext/store.h +777 -0
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/utilities.c +446 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +29 -0
- data/lib/ferret/browser.rb +246 -0
- data/lib/ferret/browser/s/global.js +192 -0
- data/lib/ferret/browser/s/style.css +148 -0
- data/lib/ferret/browser/views/document/list.rhtml +49 -0
- data/lib/ferret/browser/views/document/show.rhtml +27 -0
- data/lib/ferret/browser/views/error/index.rhtml +7 -0
- data/lib/ferret/browser/views/help/index.rhtml +8 -0
- data/lib/ferret/browser/views/home/index.rhtml +29 -0
- data/lib/ferret/browser/views/layout.rhtml +22 -0
- data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
- data/lib/ferret/browser/views/term/index.rhtml +199 -0
- data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
- data/lib/ferret/browser/webrick.rb +14 -0
- data/lib/ferret/document.rb +130 -0
- data/lib/ferret/field_infos.rb +44 -0
- data/lib/ferret/index.rb +786 -0
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/setup.rb +1555 -0
- data/test/test_all.rb +5 -0
- data/test/test_helper.rb +24 -0
- data/test/threading/number_to_spoken.rb +132 -0
- data/test/threading/thread_safety_index_test.rb +79 -0
- data/test/threading/thread_safety_read_write_test.rb +76 -0
- data/test/threading/thread_safety_test.rb +133 -0
- data/test/unit/analysis/tc_analyzer.rb +548 -0
- data/test/unit/analysis/tc_token_stream.rb +646 -0
- data/test/unit/index/tc_index.rb +762 -0
- data/test/unit/index/tc_index_reader.rb +699 -0
- data/test/unit/index/tc_index_writer.rb +437 -0
- data/test/unit/index/th_doc.rb +315 -0
- data/test/unit/largefile/tc_largefile.rb +46 -0
- data/test/unit/query_parser/tc_query_parser.rb +238 -0
- data/test/unit/search/tc_filter.rb +135 -0
- data/test/unit/search/tc_fuzzy_query.rb +147 -0
- data/test/unit/search/tc_index_searcher.rb +61 -0
- data/test/unit/search/tc_multi_searcher.rb +128 -0
- data/test/unit/search/tc_multiple_search_requests.rb +58 -0
- data/test/unit/search/tc_search_and_sort.rb +179 -0
- data/test/unit/search/tc_sort.rb +49 -0
- data/test/unit/search/tc_sort_field.rb +27 -0
- data/test/unit/search/tc_spans.rb +190 -0
- data/test/unit/search/tm_searcher.rb +384 -0
- data/test/unit/store/tc_fs_store.rb +77 -0
- data/test/unit/store/tc_ram_store.rb +35 -0
- data/test/unit/store/tm_store.rb +34 -0
- data/test/unit/store/tm_store_lock.rb +68 -0
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +2 -0
- data/test/unit/ts_index.rb +2 -0
- data/test/unit/ts_largefile.rb +4 -0
- data/test/unit/ts_query_parser.rb +2 -0
- data/test/unit/ts_search.rb +2 -0
- data/test/unit/ts_store.rb +2 -0
- data/test/unit/ts_utils.rb +2 -0
- data/test/unit/utils/tc_bit_vector.rb +295 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +285 -0
data/ext/search.c
ADDED
@@ -0,0 +1,1741 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <limits.h>
|
3
|
+
#include "search.h"
|
4
|
+
#include "array.h"
|
5
|
+
|
6
|
+
/***************************************************************************
|
7
|
+
*
|
8
|
+
* Explanation
|
9
|
+
*
|
10
|
+
***************************************************************************/
|
11
|
+
|
12
|
+
Explanation *expl_new(float value, const char *description, ...)
|
13
|
+
{
|
14
|
+
Explanation *expl = ALLOC(Explanation);
|
15
|
+
|
16
|
+
va_list args;
|
17
|
+
va_start(args, description);
|
18
|
+
expl->description = vstrfmt(description, args);
|
19
|
+
va_end(args);
|
20
|
+
|
21
|
+
expl->value = value;
|
22
|
+
expl->details = ary_new_type_capa(Explanation *,
|
23
|
+
EXPLANATION_DETAILS_START_SIZE);
|
24
|
+
return expl;
|
25
|
+
}
|
26
|
+
|
27
|
+
/*
 * Release an Explanation: its description string, its details array
 * (handed to ary_destroy with expl_destroy as the element destructor) and
 * the struct itself.
 */
void expl_destroy(Explanation *expl)
{
    free(expl->description);
    ary_destroy((void **)expl->details, (free_ft)expl_destroy);
    free(expl);
}
|
33
|
+
|
34
|
+
/*
 * Push +detail+ onto the details array of +expl+. Returns +expl+ so that
 * calls can be chained.
 */
Explanation *expl_add_detail(Explanation *expl, Explanation *detail)
{
    ary_push(expl->details, detail);

    return expl;
}
|
39
|
+
|
40
|
+
/*
 * Render +expl+ and all of its details as text, one explanation per line
 * in the form "<value> = <description>", indented by two spaces per level
 * of +depth+. Returns a heap-allocated string.
 */
char *expl_to_s_depth(Explanation *expl, int depth)
{
    const int detail_cnt = ary_size(expl->details);
    char *str = ALLOC_N(char, depth * 2 + 1);
    int d;

    /* start with the indentation for this level */
    memset(str, ' ', sizeof(char) * depth * 2);
    str[depth * 2] = '\0';

    str = estrcat(str, strfmt("%f = %s\n", expl->value, expl->description));

    /* each detail is rendered one level deeper */
    d = 0;
    while (d < detail_cnt) {
        str = estrcat(str, expl_to_s_depth(expl->details[d], depth + 1));
        d++;
    }

    return str;
}
|
56
|
+
|
57
|
+
/*
 * Render +expl+ and all of its details as nested HTML lists. Each
 * explanation becomes "<ul><li>value = description</li> ... </ul>" with the
 * details rendered recursively inside the list. Returns a heap-allocated
 * string.
 */
char *expl_to_html(Explanation *expl)
{
    int i;
    char *buffer;
    const int num_details = ary_size(expl->details);

    buffer = strfmt("<ul>\n<li>%f = %s</li>\n", expl->value, expl->description);

    for (i = 0; i < num_details; i++) {
        /* keep the pointer estrcat hands back, as expl_to_s_depth does; the
         * old pointer must not be used once the buffer has been grown */
        buffer = estrcat(buffer, expl_to_html(expl->details[i]));
    }

    /* make room for the closing tag before appending it */
    REALLOC_N(buffer, char, strlen(buffer) + 10);
    return strcat(buffer, "</ul>\n");
}
|
72
|
+
|
73
|
+
/***************************************************************************
|
74
|
+
*
|
75
|
+
* Hit
|
76
|
+
*
|
77
|
+
***************************************************************************/
|
78
|
+
|
79
|
+
static bool hit_less_than(const Hit *hit1, const Hit *hit2)
|
80
|
+
{
|
81
|
+
if (hit1->score == hit2->score) {
|
82
|
+
return hit1->doc > hit2->doc;
|
83
|
+
}
|
84
|
+
else {
|
85
|
+
return hit1->score < hit1->score;
|
86
|
+
}
|
87
|
+
}
|
88
|
+
|
89
|
+
/*
 * Return true if +hit1+ ranks below +hit2+: lower score first, and for
 * equal scores the higher document number ranks lower.
 */
static bool hit_lt(Hit *hit1, Hit *hit2)
{
    if (hit1->score != hit2->score) {
        return hit1->score < hit2->score;
    }
    return hit1->doc > hit2->doc;
}
|
98
|
+
|
99
|
+
/*
 * Sift the hit at the top of the heap (index 1) down until neither of its
 * children is less than it according to hit_lt. The heap is 1-based:
 * the children of slot n are slots 2n and 2n + 1.
 */
static void hit_pq_down(PriorityQueue *pq)
{
    Hit **heap = (Hit **)pq->heap;
    Hit *top = heap[1];             /* the node being moved down */
    int parent = 1;
    int child = 2;

    /* choose the lesser of the two children */
    if ((child + 1 <= pq->size) && hit_lt(heap[child + 1], heap[child])) {
        child = child + 1;
    }

    while ((child <= pq->size) && hit_lt(heap[child], top)) {
        heap[parent] = heap[child]; /* move the child up one level */
        parent = child;
        child = parent << 1;
        if ((child + 1 <= pq->size) && hit_lt(heap[child + 1], heap[child])) {
            child = child + 1;
        }
    }
    heap[parent] = top;
}
|
122
|
+
|
123
|
+
/*
 * Remove and return the hit at the top of the heap, or NULL if the queue is
 * empty. The last element is moved to the top and sifted down.
 */
static Hit *hit_pq_pop(PriorityQueue *pq)
{
    Hit **heap;
    Hit *top;

    if (!(pq->size > 0)) {
        return NULL;
    }

    heap = (Hit **)pq->heap;
    top = heap[1];
    heap[1] = heap[pq->size];       /* last element takes the top slot */
    heap[pq->size] = NULL;
    pq->size--;
    hit_pq_down(pq);                /* restore heap order */
    return top;
}
|
138
|
+
|
139
|
+
/*
 * Sift the hit in the last used slot of the heap up towards the top while
 * it is less than its parent according to hit_lt.
 */
static void hit_pq_up(PriorityQueue *pq)
{
    Hit **heap = (Hit **)pq->heap;
    int child = pq->size;
    int parent = child >> 1;
    Hit *added = heap[child];

    for (; (parent > 0) && hit_lt(added, heap[parent]); parent >>= 1) {
        heap[child] = heap[parent]; /* move the parent down one level */
        child = parent;
    }
    heap[child] = added;
}
|
154
|
+
|
155
|
+
/*
 * Insert a copy of +hit+ into the queue. While the queue holds fewer than
 * +capa+ hits a new Hit is allocated and added, growing the heap array when
 * needed. Once the queue is full, +hit+ overwrites the top hit only if the
 * top hit is less than it; otherwise it is dropped. The queue never keeps
 * the +hit+ pointer itself.
 */
static void hit_pq_insert(PriorityQueue *pq, Hit *hit)
{
    Hit *copy;

    if (!(pq->size < pq->capa)) {
        /* queue is full: replace the top hit if the new one beats it */
        if (pq->size > 0 && hit_lt((Hit *)pq->heap[1], hit)) {
            memcpy(pq->heap[1], hit, sizeof(Hit));
            hit_pq_down(pq);
        }
        return;
    }

    copy = ALLOC(Hit);
    memcpy(copy, hit, sizeof(Hit));
    pq->size++;
    if (pq->size >= pq->mem_capa) {
        /* slot 0 is unused, so index +size+ must fit inside the array */
        pq->mem_capa <<= 1;
        REALLOC_N(pq->heap, void *, pq->mem_capa);
    }
    pq->heap[pq->size] = copy;
    hit_pq_up(pq);
}
|
173
|
+
|
174
|
+
/*
 * Insert +hit+ via hit_pq_insert and then free it. hit_pq_insert only ever
 * copies the hit, so the original can be released here.
 */
static void hit_pq_multi_insert(PriorityQueue *pq, Hit *hit)
{
    hit_pq_insert(pq, hit);

    free(hit);
}
|
179
|
+
|
180
|
+
/***************************************************************************
|
181
|
+
*
|
182
|
+
* TopDocs
|
183
|
+
*
|
184
|
+
***************************************************************************/
|
185
|
+
|
186
|
+
/*
 * Allocate a TopDocs and fill it with the given values. The +hits+ array is
 * stored as-is (not copied); td_destroy later frees it and every hit in it.
 */
TopDocs *td_new(int total_hits, int size, Hit **hits, float max_score)
{
    TopDocs *top_docs = ALLOC(TopDocs);

    top_docs->hits       = hits;
    top_docs->size       = size;
    top_docs->total_hits = total_hits;
    top_docs->max_score  = max_score;

    return top_docs;
}
|
195
|
+
|
196
|
+
/*
 * Free a TopDocs together with its hits array and each of the +size+ hits
 * stored in it.
 */
void td_destroy(TopDocs *td)
{
    int h = 0;

    while (h < td->size) {
        free(td->hits[h]);
        h++;
    }
    free(td->hits);
    free(td);
}
|
206
|
+
|
207
|
+
/*
 * Build a human readable summary of +td+: a header line with the total
 * number of hits followed by one "\t<doc>:<score>" line per stored hit.
 * Returns a heap-allocated string.
 */
char *td_to_s(TopDocs *td)
{
    int i;
    Hit *hit;
    char *buffer = strfmt("%d hits sorted by <score, doc_num>\n",
                          td->total_hits);
    for (i = 0; i < td->size; i++) {
        hit = td->hits[i];
        /* keep the pointer estrcat hands back, as expl_to_s_depth does; the
         * old pointer must not be used once the buffer has been grown */
        buffer = estrcat(buffer, strfmt("\t%d:%f\n", hit->doc, hit->score));
    }
    return buffer;
}
|
219
|
+
|
220
|
+
/***************************************************************************
|
221
|
+
*
|
222
|
+
* Weight
|
223
|
+
*
|
224
|
+
***************************************************************************/
|
225
|
+
|
226
|
+
/* Default Weight#get_query: return the query this weight was created for. */
Query *w_get_query(Weight *self)
{
    Query *query = self->query;

    return query;
}
|
230
|
+
|
231
|
+
/* Default Weight#get_value: return the value stored by w_normalize. */
float w_get_value(Weight *self)
{
    return self->value;
}
|
235
|
+
|
236
|
+
/*
 * Default Weight#sum_of_squared_weights: set the query weight to
 * idf * query boost and return its square.
 */
float w_sum_of_squared_weights(Weight *self)
{
    self->qweight = self->query->boost * self->idf;

    return self->qweight * self->qweight;
}
|
241
|
+
|
242
|
+
/*
 * Default Weight#normalize: remember the normalization factor, scale the
 * query weight by it and derive the weight's value as qweight * idf.
 */
void w_normalize(Weight *self, float normalization_factor)
{
    self->qnorm   = normalization_factor;
    self->qweight = self->qweight * normalization_factor;
    self->value   = self->idf * self->qweight;
}
|
248
|
+
|
249
|
+
/*
 * Default Weight#destroy: drop the reference taken on the query in
 * w_create and free the weight.
 */
void w_destroy(Weight *self)
{
    q_deref(self->query);

    free(self);
}
|
254
|
+
|
255
|
+
/*
 * Allocate a Weight of +size+ bytes with ecalloc and install the default
 * method implementations. +size+ lets callers allocate a larger struct that
 * begins with a Weight; DEBUG builds raise if it is smaller than a Weight.
 * A reference is taken on +query+, which w_destroy releases again.
 */
Weight *w_create(size_t size, Query *query)
{
    Weight *weight = (Weight *)ecalloc(size);
#ifdef DEBUG
    if (size < sizeof(Weight)) {
        RAISE(FERRET_ERROR, "size of weight <%d> should be at least <%d>",
              (int)size, (int)sizeof(Weight));
    }
#endif
    REF(query);
    weight->query = query;

    /* default method table */
    weight->get_query              = &w_get_query;
    weight->get_value              = &w_get_value;
    weight->normalize              = &w_normalize;
    weight->destroy                = &w_destroy;
    weight->sum_of_squared_weights = &w_sum_of_squared_weights;

    return weight;
}
|
273
|
+
|
274
|
+
/***************************************************************************
|
275
|
+
*
|
276
|
+
* Query
|
277
|
+
*
|
278
|
+
***************************************************************************/
|
279
|
+
|
280
|
+
static const char *QUERY_NAMES[] = {
|
281
|
+
"TermQuery",
|
282
|
+
"MultiTermQuery",
|
283
|
+
"BooleanQuery",
|
284
|
+
"PhraseQuery",
|
285
|
+
"ConstantScoreQuery",
|
286
|
+
"FilteredQuery",
|
287
|
+
"MatchAllQuery",
|
288
|
+
"RangeQuery",
|
289
|
+
"WildCardQuery",
|
290
|
+
"FuzzyQuery",
|
291
|
+
"PrefixQuery",
|
292
|
+
"SpanTermQuery",
|
293
|
+
"SpanMultiTermQuery",
|
294
|
+
"SpanPrefixQuery",
|
295
|
+
"SpanFirstQuery",
|
296
|
+
"SpanOrQuery",
|
297
|
+
"SpanNotQuery",
|
298
|
+
"SpanNearQuery"
|
299
|
+
};
|
300
|
+
|
301
|
+
static const char *UNKNOWN_QUERY_NAME = "UnkownQuery";
|
302
|
+
|
303
|
+
const char *q_get_query_name(enum QUERY_TYPE type) {
|
304
|
+
if (type >= NELEMS(QUERY_NAMES)) {
|
305
|
+
return UNKNOWN_QUERY_NAME;
|
306
|
+
}
|
307
|
+
else {
|
308
|
+
return QUERY_NAMES[type];
|
309
|
+
}
|
310
|
+
}
|
311
|
+
|
312
|
+
/*
 * Default Query#rewrite: the query is returned unchanged with its reference
 * count bumped by one. +ir+ is unused.
 */
static Query *q_rewrite(Query *self, IndexReader *ir)
{
    (void)ir;

    self->ref_cnt += 1;
    return self;
}
|
318
|
+
|
319
|
+
/* Default Query#extract_terms: adds nothing to +terms+. */
static void q_extract_terms(Query *self, HashSet *terms)
{
    (void)terms;
    (void)self;
}
|
325
|
+
|
326
|
+
/*
 * Default Query#get_similarity: use whatever Similarity the searcher
 * reports. The query itself is not consulted.
 */
Similarity *q_get_similarity_i(Query *self, Searcher *searcher)
{
    Similarity *sim;

    (void)self;
    sim = searcher->get_similarity(searcher);
    return sim;
}
|
331
|
+
|
332
|
+
/* Base query destructor: frees the query struct itself and nothing else. */
void q_destroy_i(Query *self)
{
    free(self);
}
|
336
|
+
|
337
|
+
/*
 * Drop one reference to the query. When the count reaches zero the query's
 * own destroy_i method is invoked.
 */
void q_deref(Query *self)
{
    self->ref_cnt--;
    if (self->ref_cnt == 0) {
        self->destroy_i(self);
    }
}
|
343
|
+
|
344
|
+
/*
 * create_weight implementation for query types that cannot create a weight:
 * raises UNSUPPORTED_ERROR. The trailing return only satisfies the
 * signature.
 */
Weight *q_create_weight_unsup(Query *self, Searcher *searcher)
{
    (void)searcher;
    (void)self;

    RAISE(UNSUPPORTED_ERROR,
          "Create weight is unsupported for this type of query");
    return NULL;
}
|
352
|
+
|
353
|
+
/*
 * Build the normalized Weight for +self+:
 *   1. rewrite the query through the searcher,
 *   2. create a weight from the rewritten query,
 *   3. normalize it with the similarity's query norm for the weight's sum
 *      of squared weights.
 * The weight is stored in self->weight and returned. The reference on the
 * rewritten query obtained in step 1 is released before returning.
 */
Weight *q_weight(Query *self, Searcher *searcher)
{
    Query *rewritten = searcher->rewrite(searcher, self);
    Weight *w = rewritten->create_weight_i(rewritten, searcher);
    float sum_sq = w->sum_of_squared_weights(w);
    Similarity *sim = rewritten->get_similarity(rewritten, searcher);
    float qnorm = sim_query_norm(sim, sum_sq);

    q_deref(rewritten);

    w->normalize(w, qnorm);
    self->weight = w;
    return self->weight;
}
|
365
|
+
|
366
|
+
#define BQ(query) ((BooleanQuery *)(query))
|
367
|
+
Query *q_combine(Query **queries, int q_cnt)
|
368
|
+
{
|
369
|
+
int i;
|
370
|
+
Query *q, *ret_q;
|
371
|
+
HashSet *uniques = hs_new((hash_ft)&q_hash, (eq_ft)&q_eq, NULL);
|
372
|
+
|
373
|
+
for (i = 0; i < q_cnt; i++) {
|
374
|
+
q = queries[i];
|
375
|
+
if (q->type == BOOLEAN_QUERY) {
|
376
|
+
int j;
|
377
|
+
bool splittable = true;
|
378
|
+
if (BQ(q)->coord_disabled == false) {
|
379
|
+
splittable = false;
|
380
|
+
}
|
381
|
+
else {
|
382
|
+
for (j = 0; j < BQ(q)->clause_cnt; j++) {
|
383
|
+
if (BQ(q)->clauses[j]->occur != BC_SHOULD) {
|
384
|
+
splittable = false;
|
385
|
+
break;
|
386
|
+
}
|
387
|
+
}
|
388
|
+
}
|
389
|
+
if (splittable) {
|
390
|
+
for (j = 0; j < BQ(q)->clause_cnt; j++) {
|
391
|
+
Query *sub_q = BQ(q)->clauses[j]->query;
|
392
|
+
hs_add(uniques, sub_q);
|
393
|
+
}
|
394
|
+
}
|
395
|
+
else {
|
396
|
+
hs_add(uniques, q);
|
397
|
+
}
|
398
|
+
}
|
399
|
+
else {
|
400
|
+
hs_add(uniques, q);
|
401
|
+
}
|
402
|
+
}
|
403
|
+
if (uniques->size == 1) {
|
404
|
+
ret_q = (Query *)uniques->elems[0];
|
405
|
+
REF(ret_q);
|
406
|
+
}
|
407
|
+
else {
|
408
|
+
ret_q = bq_new(true);
|
409
|
+
for (i = 0; i < uniques->size; i++) {
|
410
|
+
q = (Query *)uniques->elems[i];
|
411
|
+
bq_add_query(ret_q, q, BC_SHOULD);
|
412
|
+
}
|
413
|
+
}
|
414
|
+
hs_destroy(uniques);
|
415
|
+
|
416
|
+
return ret_q;
|
417
|
+
}
|
418
|
+
|
419
|
+
/*
 * Hash a query: the type-specific hash shifted left by five bits, with the
 * query type OR-ed into the result.
 */
unsigned long q_hash(Query *self)
{
    return self->type | (self->hash(self) << 5);
}
|
423
|
+
|
424
|
+
/*
 * Query equality. Two queries are equal if they are the same object, or if
 * they have the same type and boost and the type-specific eq method agrees.
 * Returns 1 for equal and 0 otherwise.
 */
int q_eq(Query *self, Query *o)
{
    if (self == o) {
        return 1;
    }
    if (self->type != o->type) {
        return 0;
    }
    if (!(self->boost == o->boost)) {
        return 0;
    }
    return self->eq(self, o) ? 1 : 0;
}
|
431
|
+
|
432
|
+
/*
 * Default Query#get_matchv_i: by default no matches are added, so +mv+ is
 * handed back untouched. searcher_get_match_vector compares against this
 * function to decide whether a query needs rewriting first.
 */
static MatchVector *q_get_matchv_i(Query *self, MatchVector *mv, TermVector *tv)
{
    (void)tv;
    (void)self;

    return mv;
}
|
438
|
+
|
439
|
+
Query *q_create(size_t size)
|
440
|
+
{
|
441
|
+
Query *self = (Query *)ecalloc(size);
|
442
|
+
#ifdef DEBUG
|
443
|
+
if (size < sizeof(Query)) {
|
444
|
+
RAISE(FERRET_ERROR, "Size of a query <%d> should never be smaller than the "
|
445
|
+
"size of a Query struct <%d>", (int)size, (int)sizeof(Query));
|
446
|
+
}
|
447
|
+
#endif
|
448
|
+
self->boost = 1.0;
|
449
|
+
self->rewrite = &q_rewrite;
|
450
|
+
self->get_similarity = &q_get_similarity_i;
|
451
|
+
self->extract_terms = &q_extract_terms;
|
452
|
+
self->get_matchv_i = &q_get_matchv_i;
|
453
|
+
self->weight = NULL;
|
454
|
+
self->ref_cnt = 1;
|
455
|
+
return self;
|
456
|
+
}
|
457
|
+
|
458
|
+
/***************************************************************************
|
459
|
+
*
|
460
|
+
* Scorer
|
461
|
+
*
|
462
|
+
***************************************************************************/
|
463
|
+
|
464
|
+
/* Base scorer destructor: frees the scorer struct itself and nothing else. */
void scorer_destroy_i(Scorer *scorer)
{
    free(scorer);
}
|
468
|
+
|
469
|
+
/*
 * Allocate a Scorer of +size+ bytes with ecalloc, using scorer_destroy_i as
 * its destructor and +similarity+ as its Similarity. +size+ lets callers
 * allocate a larger struct that begins with a Scorer; DEBUG builds raise if
 * it is smaller than a Scorer.
 */
Scorer *scorer_create(size_t size, Similarity *similarity)
{
    Scorer *scorer = (Scorer *)ecalloc(size);
#ifdef DEBUG
    if (size < sizeof(Scorer)) {
        RAISE(FERRET_ERROR, "size of scorer <%d> should be at least <%d>",
              (int)size, (int)sizeof(Scorer));
    }
#endif
    scorer->similarity = similarity;
    scorer->destroy    = &scorer_destroy_i;

    return scorer;
}
|
482
|
+
|
483
|
+
bool scorer_less_than(void *p1, void *p2)
|
484
|
+
{
|
485
|
+
Scorer *s1 = (Scorer *)p1;
|
486
|
+
Scorer *s2 = (Scorer *)p2;
|
487
|
+
return s1->score(s1) < s2->score(s2);
|
488
|
+
}
|
489
|
+
|
490
|
+
bool scorer_doc_less_than(const Scorer *s1, const Scorer *s2)
|
491
|
+
{
|
492
|
+
return s1->doc < s2->doc;
|
493
|
+
}
|
494
|
+
|
495
|
+
int scorer_doc_cmp(const void *p1, const void *p2)
|
496
|
+
{
|
497
|
+
return (*(Scorer **)p1)->doc - (*(Scorer **)p2)->doc;
|
498
|
+
}
|
499
|
+
|
500
|
+
/***************************************************************************
|
501
|
+
*
|
502
|
+
* Highlighter
|
503
|
+
*
|
504
|
+
***************************************************************************/
|
505
|
+
|
506
|
+
/* ** MatchRange ** */
|
507
|
+
static int match_range_cmp(const void *p1, const void *p2)
|
508
|
+
{
|
509
|
+
int diff = ((MatchRange *)p1)->start - ((MatchRange *)p2)->start;
|
510
|
+
if (diff != 0) {
|
511
|
+
return diff;
|
512
|
+
}
|
513
|
+
else {
|
514
|
+
return ((MatchRange *)p2)->end - ((MatchRange *)p1)->end;
|
515
|
+
}
|
516
|
+
}
|
517
|
+
|
518
|
+
|
519
|
+
|
520
|
+
/* ** MatchVector ** */
|
521
|
+
MatchVector *matchv_new()
|
522
|
+
{
|
523
|
+
MatchVector *matchv = ALLOC(MatchVector);
|
524
|
+
|
525
|
+
matchv->size = 0;
|
526
|
+
matchv->capa = MATCH_VECTOR_INIT_CAPA;
|
527
|
+
matchv->matches = ALLOC_N(MatchRange, MATCH_VECTOR_INIT_CAPA);
|
528
|
+
|
529
|
+
return matchv;
|
530
|
+
}
|
531
|
+
|
532
|
+
/*
 * Append the range +start+..+end+ with an initial score of 1.0, doubling
 * the capacity of the matches array when it is full. Returns +self+.
 */
MatchVector *matchv_add(MatchVector *self, int start, int end)
{
    MatchRange *slot;

    if (self->size >= self->capa) {
        self->capa <<= 1;
        REALLOC_N(self->matches, MatchRange, self->capa);
    }

    slot = &self->matches[self->size];
    slot->start = start;
    slot->end   = end;
    slot->score = 1.0;
    self->size++;

    return self;
}
|
543
|
+
|
544
|
+
/*
 * Sort the ranges in place using match_range_cmp (ascending start, then
 * descending end). Returns +self+.
 */
MatchVector *matchv_sort(MatchVector *self)
{
    qsort(self->matches, self->size, sizeof(MatchRange), &match_range_cmp);

    return self;
}
|
549
|
+
|
550
|
+
/*
 * Sort the ranges and merge, in place, every range that overlaps or
 * directly follows the previous one. +left+ is the range currently being
 * built, +right+ the range being examined. Returns +self+.
 *
 * An empty vector is returned untouched; without the guard the final
 * "size = left + 1" would turn it into a one-element vector whose entry was
 * never initialized.
 *
 * NOTE(review): unlike matchv_compact_with_breaks, this function does not
 * add the score of a range that extends +left+, and its final branch also
 * runs for right == left (adding the first range's score to itself).
 * Confirm whether that difference is intended before changing it.
 */
MatchVector *matchv_compact(MatchVector *self)
{
    int left, right;

    if (self->size <= 0) {
        return self;
    }

    matchv_sort(self);
    for (right = left = 0; right < self->size; right++) {
        /* Note the end + 1. This compacts a range 3:5 and 6:8 into 3:8 */
        if (self->matches[right].start > self->matches[left].end + 1) {
            left++;
            self->matches[left].start = self->matches[right].start;
            self->matches[left].end = self->matches[right].end;
            self->matches[left].score = self->matches[right].score;
        }
        else if (self->matches[right].end > self->matches[left].end) {
            self->matches[left].end = self->matches[right].end;
        }
        else {
            self->matches[left].score += self->matches[right].score;
        }
    }
    self->size = left + 1;
    return self;
}
|
572
|
+
|
573
|
+
/*
 * Sort the ranges and merge, in place, every range that overlaps the
 * previous one. Ranges that merely touch (3:5 and 6:8) are kept apart. The
 * scores of merged ranges are summed. +left+ is the range currently being
 * built, +right+ the range being examined. Returns +self+.
 *
 * An empty vector is returned untouched; without the guard the final
 * "size = left + 1" would turn it into a one-element vector whose entry was
 * never initialized.
 */
MatchVector *matchv_compact_with_breaks(MatchVector *self)
{
    int left, right;

    if (self->size <= 0) {
        return self;
    }

    matchv_sort(self);
    for (right = left = 0; right < self->size; right++) {
        /* Note: no end + 1. Unlike above won't compact ranges 3:5 and 6:8 */
        if (self->matches[right].start > self->matches[left].end) {
            left++;
            self->matches[left].start = self->matches[right].start;
            self->matches[left].end = self->matches[right].end;
            self->matches[left].score = self->matches[right].score;
        }
        else if (self->matches[right].end > self->matches[left].end) {
            self->matches[left].end = self->matches[right].end;
            self->matches[left].score += self->matches[right].score;
        }
        else if (right > left) {
            /* right == left is the range itself; don't count it twice */
            self->matches[left].score += self->matches[right].score;
        }
    }
    self->size = left + 1;
    return self;
}
|
596
|
+
|
597
|
+
|
598
|
+
/*
 * Fill in start_offset/end_offset of every range by looking up the range's
 * start and end in +offsets+: the start offset comes from the entry at
 * +start+, the end offset from the entry at +end+. Returns +mv+.
 */
static MatchVector *matchv_set_offsets(MatchVector *mv, Offset *offsets)
{
    int m;

    for (m = 0; m < mv->size; m++) {
        MatchRange *mr = &mv->matches[m];

        mr->start_offset = offsets[mr->start].start;
        mr->end_offset   = offsets[mr->end].end;
    }
    return mv;
}
|
607
|
+
|
608
|
+
/* Free a MatchVector together with its array of ranges. */
void matchv_destroy(MatchVector *self)
{
    free(self->matches);

    free(self);
}
|
613
|
+
|
614
|
+
/***************************************************************************
|
615
|
+
*
|
616
|
+
* Searcher
|
617
|
+
*
|
618
|
+
***************************************************************************/
|
619
|
+
|
620
|
+
MatchVector *searcher_get_match_vector(Searcher *self,
|
621
|
+
Query *query,
|
622
|
+
const int doc_num,
|
623
|
+
const char *field)
|
624
|
+
{
|
625
|
+
MatchVector *mv = matchv_new();
|
626
|
+
bool rewrite = query->get_matchv_i == q_get_matchv_i;
|
627
|
+
TermVector *tv = self->get_term_vector(self, doc_num, field);
|
628
|
+
if (rewrite) {
|
629
|
+
query = self->rewrite(self, query);
|
630
|
+
}
|
631
|
+
if (tv && tv->term_cnt > 0 && tv->terms[0].positions != NULL) {
|
632
|
+
mv = query->get_matchv_i(query, mv, tv);
|
633
|
+
tv_destroy(tv);
|
634
|
+
}
|
635
|
+
if (rewrite) {
|
636
|
+
q_deref(query);
|
637
|
+
}
|
638
|
+
return mv;
|
639
|
+
}
|
640
|
+
|
641
|
+
/* A candidate excerpt used while highlighting a field. */
typedef struct Excerpt
{
    int start;          /* index of the first MatchRange in the excerpt */
    int end;            /* index of the last MatchRange in the excerpt */
    int start_pos;      /* index into the term vector's offsets array */
    int end_pos;        /* index into the term vector's offsets array */
    int start_offset;   /* where the excerpt text starts in the field data */
    int end_offset;     /* where the excerpt text ends in the field data */
    double score;       /* sum of the scores of the contained matches */
} Excerpt;
|
651
|
+
|
652
|
+
/*
|
653
|
+
static int excerpt_cmp(const void *p1, const void *p2)
|
654
|
+
{
|
655
|
+
double score1 = (*((Excerpt **)p1))->score;
|
656
|
+
double score2 = (*((Excerpt **)p2))->score;
|
657
|
+
if (score1 > score2) return 1;
|
658
|
+
if (score1 < score2) return -1;
|
659
|
+
return 0;
|
660
|
+
}
|
661
|
+
*/
|
662
|
+
|
663
|
+
static int excerpt_start_cmp(const void *p1, const void *p2)
|
664
|
+
{
|
665
|
+
return (*((Excerpt **)p1))->start - (*((Excerpt **)p2))->start;
|
666
|
+
}
|
667
|
+
|
668
|
+
/*
 * Less-than callback for excerpts. The comparison is deliberately inverted
 * (higher score counts as "less") because we want the highest score at the
 * top.
 */
static int excerpt_lt(Excerpt *e1, Excerpt *e2)
{
    return e2->score < e1->score;
}
|
672
|
+
|
673
|
+
static Excerpt *excerpt_new(int start, int end, double score)
|
674
|
+
{
|
675
|
+
Excerpt *excerpt = ALLOC_AND_ZERO(Excerpt);
|
676
|
+
excerpt->start = start;
|
677
|
+
excerpt->end = end;
|
678
|
+
excerpt->score = score;
|
679
|
+
return excerpt;
|
680
|
+
}
|
681
|
+
|
682
|
+
/*
 * Recompute the excerpt's score as the sum of the scores of the matches
 * +start+..+end+ (inclusive) in +mv+. Returns +e+.
 */
static Excerpt *excerpt_recalc_score(Excerpt *e, MatchVector *mv)
{
    double total = 0.0;
    int m = e->start;

    while (m <= e->end) {
        total += mv->matches[m].score;
        m++;
    }
    e->score = total;

    return e;
}
|
692
|
+
|
693
|
+
/* expand an excerpt to it's largest possible size */
|
694
|
+
static Excerpt *excerpt_expand(Excerpt *e, const int len, TermVector *tv)
|
695
|
+
{
|
696
|
+
Offset *offsets = tv->offsets;
|
697
|
+
int offset_cnt = tv->offset_cnt;
|
698
|
+
bool did_expansion = true;
|
699
|
+
int i;
|
700
|
+
/* fill in skipped offsets */
|
701
|
+
for (i = 1; i < offset_cnt; i++) {
|
702
|
+
if (offsets[i].start == 0) {
|
703
|
+
offsets[i].start = offsets[i-1].start;
|
704
|
+
}
|
705
|
+
if (offsets[i].end == 0) {
|
706
|
+
offsets[i].end = offsets[i-1].end;
|
707
|
+
}
|
708
|
+
}
|
709
|
+
|
710
|
+
while (did_expansion) {
|
711
|
+
did_expansion = false;
|
712
|
+
if (e->start_pos > 0
|
713
|
+
&& (e->end_offset - offsets[e->start_pos - 1].start) < len) {
|
714
|
+
e->start_pos--;
|
715
|
+
e->start_offset = offsets[e->start_pos].start;
|
716
|
+
did_expansion = true;
|
717
|
+
}
|
718
|
+
if (e->end_pos < (offset_cnt - 1)
|
719
|
+
&& (offsets[e->end_pos + 1].end - e->start_offset) < len) {
|
720
|
+
e->end_pos++;
|
721
|
+
e->end_offset = offsets[e->end_pos].end;
|
722
|
+
did_expansion = true;
|
723
|
+
}
|
724
|
+
}
|
725
|
+
return e;
|
726
|
+
}
|
727
|
+
|
728
|
+
static char *excerpt_get_str(Excerpt *e, MatchVector *mv,
|
729
|
+
LazyDocField *lazy_df,
|
730
|
+
const char *pre_tag,
|
731
|
+
const char *post_tag,
|
732
|
+
const char *ellipsis)
|
733
|
+
{
|
734
|
+
int i, len;
|
735
|
+
int last_offset = e->start_offset;
|
736
|
+
const int num_matches = e->end - e->start + 1;
|
737
|
+
const int pre_tag_len = (int)strlen(pre_tag);
|
738
|
+
const int post_tag_len = (int)strlen(post_tag);
|
739
|
+
const int ellipsis_len = (int)strlen(ellipsis);
|
740
|
+
char *excerpt_str = ALLOC_N(char,
|
741
|
+
10 + e->end_offset - e->start_offset
|
742
|
+
+ (num_matches * (pre_tag_len + post_tag_len))
|
743
|
+
+ (2 * ellipsis_len));
|
744
|
+
char *e_ptr = excerpt_str;
|
745
|
+
if (e->start_offset > 0) {
|
746
|
+
memcpy(e_ptr, ellipsis, ellipsis_len);
|
747
|
+
e_ptr += ellipsis_len;
|
748
|
+
}
|
749
|
+
for (i = e->start; i <= e->end; i++) {
|
750
|
+
MatchRange *mr = mv->matches + i;
|
751
|
+
len = mr->start_offset - last_offset;
|
752
|
+
if (len) {
|
753
|
+
lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
|
754
|
+
e_ptr += len;
|
755
|
+
}
|
756
|
+
memcpy(e_ptr, pre_tag, pre_tag_len);
|
757
|
+
e_ptr += pre_tag_len;
|
758
|
+
len = mr->end_offset - mr->start_offset;
|
759
|
+
if (len) {
|
760
|
+
lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
|
761
|
+
e_ptr += len;
|
762
|
+
}
|
763
|
+
memcpy(e_ptr, post_tag, post_tag_len);
|
764
|
+
e_ptr += post_tag_len;
|
765
|
+
last_offset = mr->end_offset;
|
766
|
+
}
|
767
|
+
if ((lazy_df->len - e->end_offset) <= ellipsis_len) {
|
768
|
+
/* no point using ellipsis if it takes up more space */
|
769
|
+
e->end_offset = lazy_df->len;
|
770
|
+
}
|
771
|
+
len = e->end_offset - last_offset;
|
772
|
+
if (len) {
|
773
|
+
lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
|
774
|
+
e_ptr += len;
|
775
|
+
}
|
776
|
+
if (e->end_offset < lazy_df->len) {
|
777
|
+
memcpy(e_ptr, ellipsis, ellipsis_len);
|
778
|
+
e_ptr += ellipsis_len;
|
779
|
+
}
|
780
|
+
*e_ptr = '\0';
|
781
|
+
return excerpt_str;
|
782
|
+
}
|
783
|
+
|
784
|
+
static char *highlight_field(MatchVector *mv,
|
785
|
+
LazyDocField *lazy_df,
|
786
|
+
TermVector *tv,
|
787
|
+
const char *pre_tag,
|
788
|
+
const char *post_tag)
|
789
|
+
{
|
790
|
+
const int pre_len = (int)strlen(pre_tag);
|
791
|
+
const int post_len = (int)strlen(post_tag);
|
792
|
+
char *excerpt_str =
|
793
|
+
ALLOC_N(char, 10 + lazy_df->len + (mv->size * (pre_len + post_len)));
|
794
|
+
if (mv->size > 0) {
|
795
|
+
int last_offset = 0;
|
796
|
+
int i, len;
|
797
|
+
char *e_ptr = excerpt_str;
|
798
|
+
matchv_compact_with_breaks(mv);
|
799
|
+
matchv_set_offsets(mv, tv->offsets);
|
800
|
+
for (i = 0; i < mv->size; i++) {
|
801
|
+
MatchRange *mr = mv->matches + i;
|
802
|
+
len = mr->start_offset - last_offset;
|
803
|
+
if (len) {
|
804
|
+
lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
|
805
|
+
e_ptr += len;
|
806
|
+
}
|
807
|
+
memcpy(e_ptr, pre_tag, pre_len);
|
808
|
+
e_ptr += pre_len;
|
809
|
+
len = mr->end_offset - mr->start_offset;
|
810
|
+
if (len) {
|
811
|
+
lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
|
812
|
+
e_ptr += len;
|
813
|
+
}
|
814
|
+
memcpy(e_ptr, post_tag, post_len);
|
815
|
+
e_ptr += post_len;
|
816
|
+
last_offset = mr->end_offset;
|
817
|
+
}
|
818
|
+
len = lazy_df->len - last_offset;
|
819
|
+
if (len) {
|
820
|
+
lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
|
821
|
+
e_ptr += len;
|
822
|
+
}
|
823
|
+
*e_ptr = '\0';
|
824
|
+
}
|
825
|
+
else {
|
826
|
+
lazy_df_get_bytes(lazy_df, excerpt_str, 0, lazy_df->len);
|
827
|
+
excerpt_str[lazy_df->len] = '\0';
|
828
|
+
}
|
829
|
+
return excerpt_str;
|
830
|
+
}
|
831
|
+
|
832
|
+
char **searcher_highlight(Searcher *self,
|
833
|
+
Query *query,
|
834
|
+
const int doc_num,
|
835
|
+
const char *field,
|
836
|
+
const int excerpt_len,
|
837
|
+
const int num_excerpts,
|
838
|
+
const char *pre_tag,
|
839
|
+
const char *post_tag,
|
840
|
+
const char *ellipsis)
|
841
|
+
{
|
842
|
+
char **excerpt_strs = NULL;
|
843
|
+
TermVector *tv = self->get_term_vector(self, doc_num, field);
|
844
|
+
LazyDoc *lazy_doc = self->get_lazy_doc(self, doc_num);
|
845
|
+
LazyDocField *lazy_df = NULL;
|
846
|
+
if (lazy_doc) {
|
847
|
+
lazy_df = h_get(lazy_doc->field_dict, field);
|
848
|
+
}
|
849
|
+
if (tv && lazy_df && tv->term_cnt > 0 && tv->terms[0].positions != NULL
|
850
|
+
&& tv->offsets != NULL) {
|
851
|
+
MatchVector *mv;
|
852
|
+
query = self->rewrite(self, query);
|
853
|
+
mv = query->get_matchv_i(query, matchv_new(), tv);
|
854
|
+
q_deref(query);
|
855
|
+
if (lazy_df->len < (excerpt_len * num_excerpts)) {
|
856
|
+
excerpt_strs = ary_new_type_capa(char *, 1);
|
857
|
+
ary_push(excerpt_strs,
|
858
|
+
highlight_field(mv, lazy_df, tv, pre_tag, post_tag));
|
859
|
+
}
|
860
|
+
else if (mv->size > 0) {
|
861
|
+
Excerpt **excerpts = ALLOC_AND_ZERO_N(Excerpt *, num_excerpts);
|
862
|
+
int e_start, e_end, i, j;
|
863
|
+
MatchRange *matches = mv->matches;
|
864
|
+
double running_score = 0.0;
|
865
|
+
Offset *offsets = tv->offsets;
|
866
|
+
PriorityQueue *excerpt_pq;
|
867
|
+
|
868
|
+
matchv_compact_with_breaks(mv);
|
869
|
+
matchv_set_offsets(mv, offsets);
|
870
|
+
excerpt_pq = pq_new(mv->size, (lt_ft)&excerpt_lt, &free);
|
871
|
+
/* add all possible excerpts to the priority queue */
|
872
|
+
|
873
|
+
for (e_start = e_end = 0; e_start < mv->size; e_start++) {
|
874
|
+
const int start_offset = matches[e_start].start_offset;
|
875
|
+
if (e_start > e_end) {
|
876
|
+
running_score = 0.0;
|
877
|
+
e_end = e_start;
|
878
|
+
}
|
879
|
+
while (e_end < mv->size && (matches[e_end].end_offset
|
880
|
+
<= start_offset + excerpt_len)) {
|
881
|
+
running_score += matches[e_end].score;
|
882
|
+
e_end++;
|
883
|
+
}
|
884
|
+
pq_push(excerpt_pq,
|
885
|
+
excerpt_new(e_start, e_end - 1, running_score));
|
886
|
+
/* - 0.1 so that earlier matches take priority */
|
887
|
+
running_score -= matches[e_start].score;
|
888
|
+
}
|
889
|
+
|
890
|
+
for (i = 0; i < num_excerpts && excerpt_pq->size > 0; i++) {
|
891
|
+
excerpts[i] = pq_pop(excerpt_pq);
|
892
|
+
if (i < num_excerpts - 1) {
|
893
|
+
/* set match ranges alread included to 0 */
|
894
|
+
Excerpt *e = excerpts[i];
|
895
|
+
for (j = e->start; j <= e->end; j++) {
|
896
|
+
matches[j].score = 0.0;
|
897
|
+
}
|
898
|
+
e = NULL;
|
899
|
+
while (e != (Excerpt *)pq_top(excerpt_pq)) {
|
900
|
+
e = pq_top(excerpt_pq);
|
901
|
+
excerpt_recalc_score(e, mv);
|
902
|
+
pq_down(excerpt_pq);
|
903
|
+
}
|
904
|
+
}
|
905
|
+
}
|
906
|
+
|
907
|
+
qsort(excerpts, i, sizeof(Excerpt *), &excerpt_start_cmp);
|
908
|
+
for (j = 0; j < i; j++) {
|
909
|
+
Excerpt *e = excerpts[j];
|
910
|
+
e->start_pos = matches[e->start].start;
|
911
|
+
e->end_pos = matches[e->end].end;
|
912
|
+
e->start_offset = offsets[e->start_pos].start;
|
913
|
+
e->end_offset = offsets[e->end_pos].end;
|
914
|
+
}
|
915
|
+
|
916
|
+
if (i < num_excerpts) {
|
917
|
+
const int diff = num_excerpts - i;
|
918
|
+
memmove(excerpts + (diff), excerpts,
|
919
|
+
i * sizeof(Excerpt *));
|
920
|
+
for (j = 0; j < diff; j++) {
|
921
|
+
/* these new excerpts will grow into one long excerpt at
|
922
|
+
* the start */
|
923
|
+
excerpts[j] = ALLOC_AND_ZERO(Excerpt);
|
924
|
+
excerpts[j]->end = -1;
|
925
|
+
}
|
926
|
+
}
|
927
|
+
|
928
|
+
excerpt_strs = ary_new_type_capa(char *, num_excerpts);
|
929
|
+
/* merge excerpts where possible */
|
930
|
+
for (i = 0; i < num_excerpts;) {
|
931
|
+
Excerpt *ei = excerpts[i];
|
932
|
+
int merged = 1; /* 1 means a single excerpt, ie no merges */
|
933
|
+
for (j = i + 1; j < num_excerpts; j++) {
|
934
|
+
Excerpt *ej = excerpts[j];
|
935
|
+
if ((ej->end_offset - ei->start_offset)
|
936
|
+
< (j - i + 1) * excerpt_len) {
|
937
|
+
ei->end = ej->end;
|
938
|
+
ei->end_pos = ej->end_pos;
|
939
|
+
ei->end_offset = ej->end_offset;
|
940
|
+
merged = j - i + 1;
|
941
|
+
}
|
942
|
+
}
|
943
|
+
excerpt_expand(ei, merged * excerpt_len, tv);
|
944
|
+
ary_push(excerpt_strs,
|
945
|
+
excerpt_get_str(ei, mv, lazy_df,
|
946
|
+
pre_tag, post_tag, ellipsis));
|
947
|
+
i += merged;
|
948
|
+
}
|
949
|
+
for (i = 0; i < num_excerpts; i++) {
|
950
|
+
free(excerpts[i]);
|
951
|
+
}
|
952
|
+
free(excerpts);
|
953
|
+
pq_destroy(excerpt_pq);
|
954
|
+
}
|
955
|
+
matchv_destroy(mv);
|
956
|
+
}
|
957
|
+
if (tv) tv_destroy(tv);
|
958
|
+
if (lazy_doc) lazy_doc_close(lazy_doc);
|
959
|
+
return excerpt_strs;
|
960
|
+
}
|
961
|
+
|
962
|
+
/*
 * create_weight callback: builds the Weight for +query+ against this
 * searcher via q_weight.
 */
static Weight *sea_create_weight(Searcher *self, Query *query)
{
    Weight *weight = q_weight(query, self);

    return weight;
}
|
966
|
+
|
967
|
+
static void sea_check_args(int num_docs, int first_doc)
|
968
|
+
{
|
969
|
+
if (num_docs <= 0) {
|
970
|
+
RAISE(ARG_ERROR, ":num_docs was set to %d but should be greater "
|
971
|
+
"than 0 : %d <= 0", num_docs, num_docs);
|
972
|
+
}
|
973
|
+
|
974
|
+
if (first_doc < 0) {
|
975
|
+
RAISE(ARG_ERROR, ":first_doc was set to %d but should be greater "
|
976
|
+
"than or equal to 0 : %d < 0", first_doc, first_doc);
|
977
|
+
}
|
978
|
+
}
|
979
|
+
|
980
|
+
/* get_similarity callback: returns the Similarity stored on the searcher. */
static Similarity *sea_get_similarity(Searcher *self)
{
    Similarity *sim = self->similarity;

    return sim;
}
|
984
|
+
|
985
|
+
/***************************************************************************
|
986
|
+
*
|
987
|
+
* IndexSearcher
|
988
|
+
*
|
989
|
+
***************************************************************************/
|
990
|
+
|
991
|
+
/* View a Searcher pointer as an IndexSearcher (struct declared elsewhere). */
#define ISEA(searcher) ((IndexSearcher *)(searcher))
|
992
|
+
|
993
|
+
/*
 * Document frequency of +term+ in +field+, looked up with ir_doc_freq on
 * the searcher's index reader.
 */
int isea_doc_freq(Searcher *self, const char *field, const char *term)
{
    IndexReader *ir = ISEA(self)->ir;

    return ir_doc_freq(ir, field, term);
}
|
997
|
+
|
998
|
+
/* get_doc callback: fetches document +doc_num+ through the index reader. */
static Document *isea_get_doc(Searcher *self, int doc_num)
{
    IndexReader *reader = ISEA(self)->ir;
    Document *doc = reader->get_doc(reader, doc_num);

    return doc;
}
|
1003
|
+
|
1004
|
+
/*
 * get_lazy_doc callback: fetches the lazy document +doc_num+ through the
 * index reader.
 */
static LazyDoc *isea_get_lazy_doc(Searcher *self, int doc_num)
{
    IndexReader *reader = ISEA(self)->ir;
    LazyDoc *lazy_doc = reader->get_lazy_doc(reader, doc_num);

    return lazy_doc;
}
|
1009
|
+
|
1010
|
+
/* max_doc callback: forwards to the index reader's max_doc. */
static int isea_max_doc(Searcher *self)
{
    IndexReader *reader = ISEA(self)->ir;
    const int max_doc = reader->max_doc(reader);

    return max_doc;
}
|
1015
|
+
|
1016
|
+
/*
 * True when the scorer's current document must be skipped, i.e. when
 *  - +bits+ is non-NULL and the document's bit is not set in it, or
 *  - +filter_func+ is non-NULL and returns false for
 *    (doc, score, searcher).
 *
 * Every macro parameter is parenthesized in the expansion so that
 * arbitrary argument expressions cannot change operator precedence.
 * Note that +bits+, +filter_func+ and +scorer+ are evaluated more than
 * once.
 */
#define IS_FILTERED(bits, filter_func, scorer, searcher) \
    (((bits) && !bv_get((bits), (scorer)->doc)) \
     || ((filter_func) \
         && !(filter_func)((scorer)->doc, (scorer)->score(scorer), \
                           (searcher))))
|
1020
|
+
|
1021
|
+
static TopDocs *isea_search_w(Searcher *self,
|
1022
|
+
Weight *weight,
|
1023
|
+
int first_doc,
|
1024
|
+
int num_docs,
|
1025
|
+
Filter *filter,
|
1026
|
+
Sort *sort,
|
1027
|
+
filter_ft filter_func,
|
1028
|
+
bool load_fields)
|
1029
|
+
{
|
1030
|
+
int max_size = num_docs + (num_docs == INT_MAX ? 0 : first_doc);
|
1031
|
+
int i;
|
1032
|
+
Scorer *scorer;
|
1033
|
+
Hit **score_docs = NULL;
|
1034
|
+
Hit hit;
|
1035
|
+
int total_hits = 0;
|
1036
|
+
float score, max_score = 0.0;
|
1037
|
+
BitVector *bits = (filter
|
1038
|
+
? filt_get_bv(filter, ISEA(self)->ir)
|
1039
|
+
: NULL);
|
1040
|
+
Hit *(*hq_pop)(PriorityQueue *pq);
|
1041
|
+
void (*hq_insert)(PriorityQueue *pq, Hit *hit);
|
1042
|
+
void (*hq_destroy)(PriorityQueue *self);
|
1043
|
+
PriorityQueue *hq;
|
1044
|
+
|
1045
|
+
sea_check_args(num_docs, first_doc);
|
1046
|
+
|
1047
|
+
scorer = weight->scorer(weight, ISEA(self)->ir);
|
1048
|
+
if (!scorer || 0 == ISEA(self)->ir->num_docs(ISEA(self)->ir)) {
|
1049
|
+
if (scorer) scorer->destroy(scorer);
|
1050
|
+
return td_new(0, 0, NULL, 0.0);
|
1051
|
+
}
|
1052
|
+
|
1053
|
+
if (sort) {
|
1054
|
+
hq = fshq_pq_new(max_size, sort, ISEA(self)->ir);
|
1055
|
+
hq_insert = &fshq_pq_insert;
|
1056
|
+
hq_destroy = &fshq_pq_destroy;
|
1057
|
+
if (load_fields) {
|
1058
|
+
hq_pop = &fshq_pq_pop_fd;
|
1059
|
+
}
|
1060
|
+
else {
|
1061
|
+
hq_pop = &fshq_pq_pop;
|
1062
|
+
}
|
1063
|
+
}
|
1064
|
+
else {
|
1065
|
+
hq = pq_new(max_size, (lt_ft)&hit_less_than, &free);
|
1066
|
+
hq_pop = &hit_pq_pop;
|
1067
|
+
hq_insert = &hit_pq_insert;
|
1068
|
+
hq_destroy = &pq_destroy;
|
1069
|
+
}
|
1070
|
+
|
1071
|
+
while (scorer->next(scorer)) {
|
1072
|
+
if (IS_FILTERED(bits, filter_func, scorer, self)) {
|
1073
|
+
continue;
|
1074
|
+
}
|
1075
|
+
total_hits++;
|
1076
|
+
score = scorer->score(scorer);
|
1077
|
+
if (score > max_score) max_score = score;
|
1078
|
+
hit.doc = scorer->doc; hit.score = score;
|
1079
|
+
hq_insert(hq, &hit);
|
1080
|
+
}
|
1081
|
+
scorer->destroy(scorer);
|
1082
|
+
|
1083
|
+
if (hq->size > first_doc) {
|
1084
|
+
if ((hq->size - first_doc) < num_docs) {
|
1085
|
+
num_docs = hq->size - first_doc;
|
1086
|
+
}
|
1087
|
+
score_docs = ALLOC_N(Hit *, num_docs);
|
1088
|
+
for (i = num_docs - 1; i >= 0; i--) {
|
1089
|
+
score_docs[i] = hq_pop(hq);
|
1090
|
+
/*
|
1091
|
+
printf("score_docs[i][%d] = [%ld] => %d-->%f\n", i,
|
1092
|
+
score_docs[i], score_docs[i]->doc, score_docs[i]->score);
|
1093
|
+
*/
|
1094
|
+
}
|
1095
|
+
}
|
1096
|
+
else {
|
1097
|
+
num_docs = 0;
|
1098
|
+
}
|
1099
|
+
pq_clear(hq);
|
1100
|
+
hq_destroy(hq);
|
1101
|
+
|
1102
|
+
return td_new(total_hits, num_docs, score_docs, max_score);
|
1103
|
+
}
|
1104
|
+
|
1105
|
+
static TopDocs *isea_search(Searcher *self,
|
1106
|
+
Query *query,
|
1107
|
+
int first_doc,
|
1108
|
+
int num_docs,
|
1109
|
+
Filter *filter,
|
1110
|
+
Sort *sort,
|
1111
|
+
filter_ft filter_func,
|
1112
|
+
bool load_fields)
|
1113
|
+
{
|
1114
|
+
TopDocs *td;
|
1115
|
+
Weight *weight = q_weight(query, self);
|
1116
|
+
td = isea_search_w(self, weight, first_doc, num_docs, filter,
|
1117
|
+
sort, filter_func, load_fields);
|
1118
|
+
weight->destroy(weight);
|
1119
|
+
return td;
|
1120
|
+
}
|
1121
|
+
|
1122
|
+
static void isea_search_each_w(Searcher *self, Weight *weight, Filter *filter,
|
1123
|
+
filter_ft filter_func,
|
1124
|
+
void (*fn)(Searcher *, int, float, void *),
|
1125
|
+
void *arg)
|
1126
|
+
{
|
1127
|
+
Scorer *scorer;
|
1128
|
+
BitVector *bits = (filter
|
1129
|
+
? filt_get_bv(filter, ISEA(self)->ir)
|
1130
|
+
: NULL);
|
1131
|
+
|
1132
|
+
scorer = weight->scorer(weight, ISEA(self)->ir);
|
1133
|
+
if (!scorer) {
|
1134
|
+
return;
|
1135
|
+
}
|
1136
|
+
|
1137
|
+
while (scorer->next(scorer)) {
|
1138
|
+
if (IS_FILTERED(bits, filter_func, scorer, self)) {
|
1139
|
+
continue;
|
1140
|
+
}
|
1141
|
+
fn(self, scorer->doc, scorer->score(scorer), arg);
|
1142
|
+
}
|
1143
|
+
scorer->destroy(scorer);
|
1144
|
+
}
|
1145
|
+
|
1146
|
+
static void isea_search_each(Searcher *self, Query *query, Filter *filter,
|
1147
|
+
filter_ft filter_func,
|
1148
|
+
void (*fn)(Searcher *, int, float, void *),
|
1149
|
+
void *arg)
|
1150
|
+
{
|
1151
|
+
Weight *weight = q_weight(query, self);
|
1152
|
+
isea_search_each_w(self, weight, filter, filter_func, fn, arg);
|
1153
|
+
weight->destroy(weight);
|
1154
|
+
}
|
1155
|
+
|
1156
|
+
/*
 * rewrite callback: rewrites +original+ against the index reader
 * repeatedly until a rewrite returns the query it was given.
 *
 * Each intermediate query is dereferenced once after it has been
 * rewritten. +prev_released+ records that the query's ref_cnt was <= 1
 * just before that dereference, and forces one more pass of the loop.
 */
static Query *isea_rewrite(Searcher *self, Query *original)
{
    int prev_released = false;
    Query *current = original;
    Query *next = current->rewrite(current, ISEA(self)->ir);

    while (prev_released || (current != next)) {
        current = next;
        next = current->rewrite(current, ISEA(self)->ir);
        prev_released = (current->ref_cnt <= 1);
        q_deref(current); /* destroy intermediate queries */
    }
    return current;
}
|
1169
|
+
|
1170
|
+
/*
 * explain callback: builds a Weight for +query+, asks it to explain
 * document +doc_num+ and destroys the weight again.
 */
static Explanation *isea_explain(Searcher *self, Query *query, int doc_num)
{
    Weight *query_weight = q_weight(query, self);
    Explanation *expl =
        query_weight->explain(query_weight, ISEA(self)->ir, doc_num);

    query_weight->destroy(query_weight);
    return expl;
}
|
1177
|
+
|
1178
|
+
/*
 * explain_w callback: explains document +doc_num+ with the caller's
 * weight; the weight is not destroyed here.
 */
static Explanation *isea_explain_w(Searcher *self, Weight *w, int doc_num)
{
    IndexReader *reader = ISEA(self)->ir;

    return w->explain(w, reader, doc_num);
}
|
1182
|
+
|
1183
|
+
/*
 * get_term_vector callback: fetches the term vector of +field+ in
 * document +doc_num+ through the index reader.
 */
static TermVector *isea_get_term_vector(Searcher *self,
                                        const int doc_num,
                                        const char *field)
{
    IndexReader *reader = ISEA(self)->ir;
    TermVector *term_vector = reader->term_vector(reader, doc_num, field);

    return term_vector;
}
|
1190
|
+
|
1191
|
+
/*
 * close callback: closes the index reader when one is set and close_ir
 * is true, then frees the searcher itself.
 */
static void isea_close(Searcher *self)
{
    IndexSearcher *isea = ISEA(self);

    if (isea->ir && isea->close_ir) {
        ir_close(isea->ir);
    }
    free(self);
}
|
1198
|
+
|
1199
|
+
/*
 * Create an IndexSearcher over +ir+.
 *
 * close_ir is set to true, so isea_close will also close +ir+. The
 * callback table is filled with the isea_ and sea_ functions and the
 * similarity comes from sim_create_default.
 */
Searcher *isea_new(IndexReader *ir)
{
    Searcher *self = (Searcher *)ecalloc(sizeof(IndexSearcher));
    IndexSearcher *isea = ISEA(self);

    isea->ir = ir;
    isea->close_ir = true;

    self->similarity = sim_create_default();

    /* document and statistics access */
    self->doc_freq = &isea_doc_freq;
    self->get_doc = &isea_get_doc;
    self->get_lazy_doc = &isea_get_lazy_doc;
    self->max_doc = &isea_max_doc;
    self->get_term_vector = &isea_get_term_vector;
    self->get_similarity = &sea_get_similarity;

    /* searching */
    self->create_weight = &sea_create_weight;
    self->search = &isea_search;
    self->search_w = &isea_search_w;
    self->search_each = &isea_search_each;
    self->search_each_w = &isea_search_each_w;
    self->rewrite = &isea_rewrite;
    self->explain = &isea_explain;
    self->explain_w = &isea_explain_w;

    self->close = &isea_close;

    return self;
}
|
1225
|
+
|
1226
|
+
/***************************************************************************
|
1227
|
+
*
|
1228
|
+
* CachedDFSearcher
|
1229
|
+
*
|
1230
|
+
***************************************************************************/
|
1231
|
+
|
1232
|
+
#define CDFSEA(searcher) ((CachedDFSearcher *)(searcher))
|
1233
|
+
typedef struct CachedDFSearcher
|
1234
|
+
{
|
1235
|
+
Searcher super;
|
1236
|
+
HashTable *df_map;
|
1237
|
+
int max_doc;
|
1238
|
+
} CachedDFSearcher;
|
1239
|
+
|
1240
|
+
/*
 * doc_freq callback: looks up (field, text) in the cached table.
 * Returns the stored frequency, or 0 when the term is not in the table.
 */
static int cdfsea_doc_freq(Searcher *self, const char *field, const char *text)
{
    Term key;
    int *cached;

    /* temporary lookup key; the casts only drop const */
    key.field = (char *)field;
    key.text = (char *)text;

    cached = (int *)h_get(CDFSEA(self)->df_map, &key);
    if (!cached) {
        return 0;
    }
    return *cached;
}
|
1249
|
+
|
1250
|
+
/* get_doc callback: not supported by CachedDFSearcher; raises UNSUPPORTED_ERROR. */
static Document *cdfsea_get_doc(Searcher *self, int doc_num)
{
    (void)self;
    (void)doc_num;

    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
    return NULL;
}
|
1256
|
+
|
1257
|
+
/* max_doc callback: returns the max_doc value stored at construction. */
static int cdfsea_max_doc(Searcher *self)
{
    const CachedDFSearcher *cdfsea = CDFSEA(self);

    return cdfsea->max_doc;
}
|
1262
|
+
|
1263
|
+
/* create_weight callback: not supported by CachedDFSearcher; raises UNSUPPORTED_ERROR. */
static Weight *cdfsea_create_weight(Searcher *self, Query *query)
{
    (void)self;
    (void)query;

    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
    return NULL;
}
|
1269
|
+
|
1270
|
+
static TopDocs *cdfsea_search_w(Searcher *self, Weight *w, int fd, int nd,
|
1271
|
+
Filter *f, Sort *s, filter_ft ff, bool load)
|
1272
|
+
{
|
1273
|
+
(void)self; (void)w; (void)fd; (void)nd;
|
1274
|
+
(void)f; (void)s; (void)ff; (void)load;
|
1275
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
1276
|
+
return NULL;
|
1277
|
+
}
|
1278
|
+
|
1279
|
+
static TopDocs *cdfsea_search(Searcher *self, Query *q, int fd, int nd,
|
1280
|
+
Filter *f, Sort *s, filter_ft ff, bool load)
|
1281
|
+
{
|
1282
|
+
(void)self; (void)q; (void)fd; (void)nd;
|
1283
|
+
(void)f; (void)s; (void)ff; (void)load;
|
1284
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
1285
|
+
return NULL;
|
1286
|
+
}
|
1287
|
+
|
1288
|
+
static void cdfsea_search_each(Searcher *self, Query *query, Filter *filter,
|
1289
|
+
filter_ft ff,
|
1290
|
+
void (*fn)(Searcher *, int, float, void *),
|
1291
|
+
void *arg)
|
1292
|
+
{
|
1293
|
+
(void)self; (void)query; (void)filter; (void)ff; (void)fn; (void)arg;
|
1294
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
1295
|
+
}
|
1296
|
+
|
1297
|
+
static void cdfsea_search_each_w(Searcher *self, Weight *w, Filter *filter,
|
1298
|
+
filter_ft ff,
|
1299
|
+
void (*fn)(Searcher *, int, float, void *),
|
1300
|
+
void *arg)
|
1301
|
+
{
|
1302
|
+
(void)self; (void)w; (void)filter; (void)ff; (void)fn; (void)arg;
|
1303
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
1304
|
+
}
|
1305
|
+
|
1306
|
+
/*
 * rewrite callback: performs no rewriting. Returns +original+ itself
 * after incrementing its reference count.
 */
static Query *cdfsea_rewrite(Searcher *self, Query *original)
{
    (void)self;

    original->ref_cnt += 1;
    return original;
}
|
1312
|
+
|
1313
|
+
/* explain callback: not supported by CachedDFSearcher; raises UNSUPPORTED_ERROR. */
static Explanation *cdfsea_explain(Searcher *self, Query *query, int doc_num)
{
    (void)self;
    (void)query;
    (void)doc_num;

    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
    return NULL;
}
|
1319
|
+
|
1320
|
+
/* explain_w callback: not supported by CachedDFSearcher; raises UNSUPPORTED_ERROR. */
static Explanation *cdfsea_explain_w(Searcher *self, Weight *w, int doc_num)
{
    (void)self;
    (void)w;
    (void)doc_num;

    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
    return NULL;
}
|
1326
|
+
|
1327
|
+
/* get_term_vector callback: not supported by CachedDFSearcher; raises UNSUPPORTED_ERROR. */
static TermVector *cdfsea_get_term_vector(Searcher *self, const int doc_num,
                                          const char *field)
{
    (void)self;
    (void)doc_num;
    (void)field;

    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
    return NULL;
}
|
1334
|
+
|
1335
|
+
/* get_similarity callback: not supported by CachedDFSearcher; raises UNSUPPORTED_ERROR. */
static Similarity *cdfsea_get_similarity(Searcher *self)
{
    (void)self;

    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
    return NULL;
}
|
1341
|
+
|
1342
|
+
/*
 * close callback: destroys the document-frequency table and frees the
 * searcher.
 */
static void cdfsea_close(Searcher *self)
{
    HashTable *df_map = CDFSEA(self)->df_map;

    h_destroy(df_map);
    free(self);
}
|
1347
|
+
|
1348
|
+
static Searcher *cdfsea_new(HashTable *df_map, int max_doc)
|
1349
|
+
{
|
1350
|
+
Searcher *self = (Searcher *)ecalloc(sizeof(CachedDFSearcher));
|
1351
|
+
|
1352
|
+
CDFSEA(self)->df_map = df_map;
|
1353
|
+
CDFSEA(self)->max_doc = max_doc;
|
1354
|
+
|
1355
|
+
self->doc_freq = &cdfsea_doc_freq;
|
1356
|
+
self->get_doc = &cdfsea_get_doc;
|
1357
|
+
self->max_doc = &cdfsea_max_doc;
|
1358
|
+
self->create_weight = &cdfsea_create_weight;
|
1359
|
+
self->search = &cdfsea_search;
|
1360
|
+
self->search_w = &cdfsea_search_w;
|
1361
|
+
self->search_each = &cdfsea_search_each;
|
1362
|
+
self->search_each_w = &cdfsea_search_each_w;
|
1363
|
+
self->rewrite = &cdfsea_rewrite;
|
1364
|
+
self->explain = &cdfsea_explain;
|
1365
|
+
self->explain_w = &cdfsea_explain_w;
|
1366
|
+
self->get_term_vector = &cdfsea_get_term_vector;
|
1367
|
+
self->get_similarity = &cdfsea_get_similarity;
|
1368
|
+
self->close = &cdfsea_close;
|
1369
|
+
return self;
|
1370
|
+
}
|
1371
|
+
|
1372
|
+
/***************************************************************************
|
1373
|
+
*
|
1374
|
+
* MultiSearcher
|
1375
|
+
*
|
1376
|
+
***************************************************************************/
|
1377
|
+
|
1378
|
+
/* View a Searcher pointer as a MultiSearcher (struct declared elsewhere). */
#define MSEA(searcher) ((MultiSearcher *)(searcher))
|
1379
|
+
/*
 * Binary-search the starts array for document number +n+ and return the
 * index of the sub-searcher it maps to.
 *
 * On an exact hit the index of the last entry equal to +n+ is returned.
 * Otherwise the result is the index of the last entry smaller than +n+,
 * which is -1 when +n+ is below every entry.
 */
static INLINE int msea_get_searcher_index(Searcher *self, int n)
{
    MultiSearcher *msea = MSEA(self);
    int low = 0;
    int high = msea->s_cnt - 1;

    while (low <= high) {
        int probe = (low + high) >> 1;
        const int probe_val = msea->starts[probe];

        if (n < probe_val) {
            high = probe - 1;
            continue;
        }
        if (n > probe_val) {
            low = probe + 1;
            continue;
        }

        /* exact hit: move to the last entry holding the same start */
        while (((probe + 1) < msea->s_cnt)
               && (msea->starts[probe + 1] == probe_val)) {
            probe++;
        }
        return probe;
    }
    return high;
}
|
1405
|
+
|
1406
|
+
/*
 * doc_freq callback: sums the document frequency of (field, term) over
 * all sub-searchers.
 */
static int msea_doc_freq(Searcher *self, const char *field, const char *term)
{
    MultiSearcher *msea = MSEA(self);
    int total = 0;
    int idx;

    for (idx = 0; idx < msea->s_cnt; idx++) {
        Searcher *sub = msea->searchers[idx];
        total += sub->doc_freq(sub, field, term);
    }
    return total;
}
|
1418
|
+
|
1419
|
+
/*
 * get_doc callback: finds the sub-searcher holding +doc_num+ and fetches
 * the document using the number relative to that sub-searcher's start.
 */
static Document *msea_get_doc(Searcher *self, int doc_num)
{
    MultiSearcher *msea = MSEA(self);
    const int idx = msea_get_searcher_index(self, doc_num);
    Searcher *sub = msea->searchers[idx];
    const int local_doc = doc_num - msea->starts[idx];

    return sub->get_doc(sub, local_doc);
}
|
1426
|
+
|
1427
|
+
/*
 * get_lazy_doc callback: finds the sub-searcher holding +doc_num+ and
 * fetches the lazy document using the number relative to that
 * sub-searcher's start.
 */
static LazyDoc *msea_get_lazy_doc(Searcher *self, int doc_num)
{
    MultiSearcher *msea = MSEA(self);
    const int idx = msea_get_searcher_index(self, doc_num);
    Searcher *sub = msea->searchers[idx];
    const int local_doc = doc_num - msea->starts[idx];

    return sub->get_lazy_doc(sub, local_doc);
}
|
1434
|
+
|
1435
|
+
/* max_doc callback: returns the max_doc stored on the MultiSearcher. */
static int msea_max_doc(Searcher *self)
{
    MultiSearcher *msea = MSEA(self);

    return msea->max_doc;
}
|
1439
|
+
|
1440
|
+
/*
 * Compute the combined document frequency of every term in +terms+.
 *
 * Returns a newly allocated int array with one entry per element of
 * +terms+, in element order.
 */
static int *msea_get_doc_freqs(Searcher *self, HashSet *terms)
{
    const int term_cnt = terms->size;
    int *freqs = ALLOC_N(int, term_cnt);
    int idx;

    for (idx = 0; idx < term_cnt; idx++) {
        Term *term = (Term *)terms->elems[idx];
        freqs[idx] = msea_doc_freq(self, term->field, term->text);
    }
    return freqs;
}
|
1451
|
+
|
1452
|
+
/*
 * create_weight callback for the MultiSearcher.
 *
 * Rewrites the query, extracts its terms, stores each term's combined
 * document frequency in a table, and builds the Weight against a
 * temporary CachedDFSearcher backed by that table and the
 * MultiSearcher's max_doc. The temporary searcher and the rewritten
 * query reference are released before returning.
 */
static Weight *msea_create_weight(Searcher *self, Query *query)
{
    HashTable *df_map = h_new((hash_ft)&term_hash, (eq_ft)&term_eq,
                              (free_ft)NULL, free);
    Query *rewritten = self->rewrite(self, query);
    HashSet *terms = term_set_new();
    Searcher *df_searcher;
    Weight *weight;
    int *doc_freqs;
    int idx;

    rewritten->extract_terms(rewritten, terms);
    doc_freqs = msea_get_doc_freqs(self, terms);

    /* one table entry per extracted term */
    for (idx = 0; idx < terms->size; idx++) {
        h_set(df_map, terms->elems[idx], imalloc(doc_freqs[idx]));
    }
    hs_destroy(terms);
    free(doc_freqs);

    df_searcher = cdfsea_new(df_map, MSEA(self)->max_doc);

    weight = q_weight(rewritten, df_searcher);
    q_deref(rewritten);
    df_searcher->close(df_searcher);

    return weight;
}
|
1479
|
+
|
1480
|
+
struct MultiSearchEachArg {
|
1481
|
+
int start;
|
1482
|
+
void *arg;
|
1483
|
+
void (*fn)(Searcher *, int, float, void *);
|
1484
|
+
};
|
1485
|
+
|
1486
|
+
/*
 * Per-hit adapter used by msea_search_each_w: adds the stored start to
 * +doc_num+ and forwards the hit to the callback held in +arg+
 * (a struct MultiSearchEachArg).
 */
void msea_search_each_i(Searcher *self, int doc_num, float score, void *arg)
{
    struct MultiSearchEachArg *ctx = (struct MultiSearchEachArg *)arg;
    const int shifted_doc = doc_num + ctx->start;

    ctx->fn(self, shifted_doc, score, ctx->arg);
}
|
1492
|
+
|
1493
|
+
static void msea_search_each_w(Searcher *self, Weight *w, Filter *filter,
|
1494
|
+
filter_ft filter_func,
|
1495
|
+
void (*fn)(Searcher *, int, float, void *),
|
1496
|
+
void *arg)
|
1497
|
+
{
|
1498
|
+
int i;
|
1499
|
+
struct MultiSearchEachArg mse_arg;
|
1500
|
+
MultiSearcher *msea = MSEA(self);
|
1501
|
+
Searcher *s;
|
1502
|
+
|
1503
|
+
mse_arg.fn = fn;
|
1504
|
+
mse_arg.arg = arg;
|
1505
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
1506
|
+
s = msea->searchers[i];
|
1507
|
+
mse_arg.start = msea->starts[i];
|
1508
|
+
s->search_each_w(s, w, filter, filter_func,
|
1509
|
+
&msea_search_each_i, &mse_arg);
|
1510
|
+
}
|
1511
|
+
}
|
1512
|
+
|
1513
|
+
static void msea_search_each(Searcher *self, Query *query, Filter *filter,
|
1514
|
+
filter_ft filter_func,
|
1515
|
+
void (*fn)(Searcher *, int, float, void *), void *arg)
|
1516
|
+
{
|
1517
|
+
Weight *w = q_weight(query, self);
|
1518
|
+
msea_search_each_w(self, w, filter, filter_func, fn, arg);
|
1519
|
+
w->destroy(w);
|
1520
|
+
}
|
1521
|
+
|
1522
|
+
struct MultiSearchArg {
|
1523
|
+
int total_hits, max_size;
|
1524
|
+
PriorityQueue *hq;
|
1525
|
+
void (*hq_insert)(PriorityQueue *pq, Hit *hit);
|
1526
|
+
};
|
1527
|
+
|
1528
|
+
/*
 * Per-hit collector: counts the hit in the struct MultiSearchArg passed
 * as +arg+ and inserts a Hit for (doc_num, score) with its hq_insert.
 */
void msea_search_i(Searcher *self, int doc_num, float score, void *arg)
{
    struct MultiSearchArg *collector = (struct MultiSearchArg *)arg;
    Hit found;

    (void)self;

    collector->total_hits++;

    found.doc = doc_num;
    found.score = score;
    collector->hq_insert(collector->hq, &found);
}
|
1539
|
+
|
1540
|
+
static TopDocs *msea_search_w(Searcher *self,
|
1541
|
+
Weight *weight,
|
1542
|
+
int first_doc,
|
1543
|
+
int num_docs,
|
1544
|
+
Filter *filter,
|
1545
|
+
Sort *sort,
|
1546
|
+
filter_ft filter_func,
|
1547
|
+
bool load_fields)
|
1548
|
+
{
|
1549
|
+
int max_size = num_docs + (num_docs == INT_MAX ? 0 : first_doc);
|
1550
|
+
int i;
|
1551
|
+
int total_hits = 0;
|
1552
|
+
Hit **score_docs = NULL;
|
1553
|
+
Hit *(*hq_pop)(PriorityQueue *pq);
|
1554
|
+
void (*hq_insert)(PriorityQueue *pq, Hit *hit);
|
1555
|
+
PriorityQueue *hq;
|
1556
|
+
float max_score = 0.0;
|
1557
|
+
(void)load_fields; /* does it automatically */
|
1558
|
+
|
1559
|
+
sea_check_args(num_docs, first_doc);
|
1560
|
+
|
1561
|
+
if (sort) {
|
1562
|
+
hq = pq_new(max_size, (lt_ft)fdshq_lt, &free);
|
1563
|
+
hq_insert = (void (*)(PriorityQueue *pq, Hit *hit))&pq_insert;
|
1564
|
+
hq_pop = (Hit *(*)(PriorityQueue *pq))&pq_pop;
|
1565
|
+
}
|
1566
|
+
else {
|
1567
|
+
hq = pq_new(max_size, (lt_ft)&hit_less_than, &free);
|
1568
|
+
hq_insert = &hit_pq_multi_insert;
|
1569
|
+
hq_pop = &hit_pq_pop;
|
1570
|
+
}
|
1571
|
+
|
1572
|
+
/*if (sort) printf("sort = %s\n", sort_to_s(sort)); */
|
1573
|
+
for (i = 0; i < MSEA(self)->s_cnt; i++) {
|
1574
|
+
Searcher *s = MSEA(self)->searchers[i];
|
1575
|
+
TopDocs *td = s->search_w(s, weight, 0, max_size,
|
1576
|
+
filter, sort, filter_func, true);
|
1577
|
+
/*if (sort) printf("sort = %s\n", sort_to_s(sort)); */
|
1578
|
+
if (td->size > 0) {
|
1579
|
+
/*printf("td->size = %d %d\n", td->size, num_docs); */
|
1580
|
+
int j;
|
1581
|
+
int start = MSEA(self)->starts[i];
|
1582
|
+
for (j = 0; j < td->size; j++) {
|
1583
|
+
Hit *hit = td->hits[j];
|
1584
|
+
hit->doc += start;
|
1585
|
+
/*
|
1586
|
+
printf("adding hit = %d:%f\n", hit->doc, hit->score);
|
1587
|
+
*/
|
1588
|
+
hq_insert(hq, hit);
|
1589
|
+
}
|
1590
|
+
td->size = 0;
|
1591
|
+
if (td->max_score > max_score) max_score = td->max_score;
|
1592
|
+
}
|
1593
|
+
total_hits += td->total_hits;
|
1594
|
+
td_destroy(td);
|
1595
|
+
}
|
1596
|
+
|
1597
|
+
if (hq->size > first_doc) {
|
1598
|
+
if ((hq->size - first_doc) < num_docs) {
|
1599
|
+
num_docs = hq->size - first_doc;
|
1600
|
+
}
|
1601
|
+
score_docs = ALLOC_N(Hit *, num_docs);
|
1602
|
+
for (i = num_docs - 1; i >= 0; i--) {
|
1603
|
+
score_docs[i] = hq_pop(hq);
|
1604
|
+
/*
|
1605
|
+
Hit *hit = score_docs[i] = hq_pop(hq);
|
1606
|
+
printf("popped hit = %d-->%f\n", hit->doc, hit->score);
|
1607
|
+
*/
|
1608
|
+
}
|
1609
|
+
}
|
1610
|
+
else {
|
1611
|
+
num_docs = 0;
|
1612
|
+
}
|
1613
|
+
pq_clear(hq);
|
1614
|
+
pq_destroy(hq);
|
1615
|
+
|
1616
|
+
return td_new(total_hits, num_docs, score_docs, max_score);
|
1617
|
+
}
|
1618
|
+
|
1619
|
+
static TopDocs *msea_search(Searcher *self,
|
1620
|
+
Query *query,
|
1621
|
+
int first_doc,
|
1622
|
+
int num_docs,
|
1623
|
+
Filter *filter,
|
1624
|
+
Sort *sort,
|
1625
|
+
filter_ft filter_func,
|
1626
|
+
bool load_fields)
|
1627
|
+
{
|
1628
|
+
TopDocs *td;
|
1629
|
+
Weight *weight = q_weight(query, self);
|
1630
|
+
td = msea_search_w(self, weight, first_doc, num_docs, filter,
|
1631
|
+
sort, filter_func, load_fields);
|
1632
|
+
weight->destroy(weight);
|
1633
|
+
return td;
|
1634
|
+
}
|
1635
|
+
|
1636
|
+
/*
 * rewrite callback: rewrites +original+ with every sub-searcher,
 * combines the results with q_combine and dereferences the per-searcher
 * rewrites. Returns the combined query.
 */
static Query *msea_rewrite(Searcher *self, Query *original)
{
    MultiSearcher *msea = MSEA(self);
    Query **rewrites = ALLOC_N(Query *, msea->s_cnt);
    Query *combined;
    int idx;

    for (idx = 0; idx < msea->s_cnt; idx++) {
        Searcher *sub = msea->searchers[idx];
        rewrites[idx] = sub->rewrite(sub, original);
    }

    combined = q_combine(rewrites, msea->s_cnt);

    /* drop our references to the individual rewrites */
    for (idx = 0; idx < msea->s_cnt; idx++) {
        q_deref(rewrites[idx]);
    }
    free(rewrites);

    return combined;
}
|
1655
|
+
|
1656
|
+
/*
 * explain callback: builds a Weight for +query+ against the
 * MultiSearcher and has the sub-searcher that holds +doc_num+ explain
 * the document, using the number relative to that sub-searcher's start.
 */
static Explanation *msea_explain(Searcher *self, Query *query, int doc_num)
{
    MultiSearcher *msea = MSEA(self);
    const int idx = msea_get_searcher_index(self, doc_num);
    Weight *query_weight = q_weight(query, self);
    Searcher *sub = msea->searchers[idx];
    Explanation *expl =
        sub->explain_w(sub, query_weight, doc_num - msea->starts[idx]);

    query_weight->destroy(query_weight);
    return expl;
}
|
1666
|
+
|
1667
|
+
/*
 * explain_w callback: forwards to explain_w of the sub-searcher holding
 * +doc_num+, using the number relative to that sub-searcher's start.
 * The weight stays owned by the caller.
 */
static Explanation *msea_explain_w(Searcher *self, Weight *w, int doc_num)
{
    MultiSearcher *msea = MSEA(self);
    const int idx = msea_get_searcher_index(self, doc_num);
    Searcher *sub = msea->searchers[idx];
    const int local_doc = doc_num - msea->starts[idx];

    return sub->explain_w(sub, w, local_doc);
}
|
1675
|
+
|
1676
|
+
/*
 * get_term_vector callback: fetches the term vector of +field+ from the
 * sub-searcher holding +doc_num+, using the number relative to that
 * sub-searcher's start.
 */
static TermVector *msea_get_term_vector(Searcher *self, const int doc_num,
                                        const char *field)
{
    MultiSearcher *msea = MSEA(self);
    const int idx = msea_get_searcher_index(self, doc_num);
    Searcher *sub = msea->searchers[idx];
    const int local_doc = doc_num - msea->starts[idx];

    return sub->get_term_vector(sub, local_doc, field);
}
|
1685
|
+
|
1686
|
+
/* get_similarity callback: returns the Similarity stored on the searcher. */
static Similarity *msea_get_similarity(Searcher *self)
{
    Similarity *sim = self->similarity;

    return sim;
}
|
1690
|
+
|
1691
|
+
/*
 * close callback: closes every sub-searcher when close_subs is set, then
 * frees the searchers array, the starts array and the MultiSearcher
 * itself.
 */
static void msea_close(Searcher *self)
{
    MultiSearcher *msea = MSEA(self);

    if (msea->close_subs) {
        int idx;

        for (idx = 0; idx < msea->s_cnt; idx++) {
            Searcher *sub = msea->searchers[idx];
            sub->close(sub);
        }
    }

    free(msea->searchers);
    free(msea->starts);
    free(self);
}
|
1706
|
+
|
1707
|
+
/*
 * Create a MultiSearcher over +s_cnt+ sub-searchers.
 *
 * searchers  - array of sub-searchers; the pointer is stored as-is and
 *              freed by msea_close
 * s_cnt      - number of entries in +searchers+
 * close_subs - when true, msea_close also closes every sub-searcher
 *
 * starts[i] holds the sum of max_doc over sub-searchers 0..i-1, and
 * starts[s_cnt] the overall total, which is also stored as max_doc.
 */
Searcher *msea_new(Searcher **searchers, int s_cnt, bool close_subs)
{
    Searcher *self = (Searcher *)ecalloc(sizeof(MultiSearcher));
    MultiSearcher *msea = MSEA(self);
    int *starts = ALLOC_N(int, s_cnt + 1);
    int doc_total = 0;
    int idx;

    for (idx = 0; idx < s_cnt; idx++) {
        Searcher *sub = searchers[idx];

        starts[idx] = doc_total;
        doc_total += sub->max_doc(sub);
    }
    starts[idx] = doc_total;    /* entry one past the last sub-searcher */

    msea->s_cnt = s_cnt;
    msea->searchers = searchers;
    msea->starts = starts;
    msea->max_doc = doc_total;
    msea->close_subs = close_subs;

    self->similarity = sim_create_default();

    /* document and statistics access */
    self->doc_freq = &msea_doc_freq;
    self->get_doc = &msea_get_doc;
    self->get_lazy_doc = &msea_get_lazy_doc;
    self->max_doc = &msea_max_doc;
    self->get_term_vector = &msea_get_term_vector;
    self->get_similarity = &msea_get_similarity;

    /* searching */
    self->create_weight = &msea_create_weight;
    self->search = &msea_search;
    self->search_w = &msea_search_w;
    self->search_each = &msea_search_each;
    self->search_each_w = &msea_search_each_w;
    self->rewrite = &msea_rewrite;
    self->explain = &msea_explain;
    self->explain_w = &msea_explain_w;

    self->close = &msea_close;
    return self;
}
|