ferret 0.9.1 → 0.9.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README +6 -5
- data/Rakefile +34 -13
- data/TODO +1 -0
- data/TUTORIAL +1 -1
- data/ext/analysis.c +87 -70
- data/ext/analysis.h +18 -6
- data/ext/array.c +1 -2
- data/ext/array.h +1 -1
- data/ext/bitvector.c +10 -6
- data/ext/bitvector.h +2 -2
- data/ext/compound_io.c +30 -27
- data/ext/document.c +15 -15
- data/ext/document.h +5 -5
- data/ext/except.c +2 -0
- data/ext/except.h +25 -23
- data/ext/extconf.rb +1 -0
- data/ext/ferret.c +10 -8
- data/ext/ferret.h +9 -8
- data/ext/field.c +29 -25
- data/ext/filter.c +52 -14
- data/ext/frtio.h +13 -0
- data/ext/fs_store.c +115 -170
- data/ext/global.c +9 -8
- data/ext/global.h +17 -13
- data/ext/hash.c +13 -19
- data/ext/hash.h +11 -11
- data/ext/hashset.c +5 -7
- data/ext/hashset.h +9 -8
- data/ext/helper.c +1 -1
- data/ext/helper.h +2 -1
- data/ext/inc/except.h +25 -23
- data/ext/inc/lang.h +11 -1
- data/ext/ind.c +33 -21
- data/ext/index.h +44 -39
- data/ext/index_io.c +61 -57
- data/ext/index_rw.c +418 -361
- data/ext/lang.c +10 -0
- data/ext/lang.h +11 -1
- data/ext/nix_io.c +135 -0
- data/ext/priorityqueue.c +16 -16
- data/ext/priorityqueue.h +9 -6
- data/ext/q_boolean.c +128 -76
- data/ext/q_const_score.c +20 -20
- data/ext/q_filtered_query.c +20 -20
- data/ext/q_fuzzy.c +37 -23
- data/ext/q_match_all.c +15 -19
- data/ext/q_multi_phrase.c +87 -46
- data/ext/q_parser.c +247 -119
- data/ext/q_phrase.c +86 -52
- data/ext/q_prefix.c +25 -14
- data/ext/q_range.c +59 -14
- data/ext/q_span.c +263 -172
- data/ext/q_term.c +62 -51
- data/ext/q_wildcard.c +24 -13
- data/ext/r_analysis.c +328 -80
- data/ext/r_doc.c +11 -6
- data/ext/r_index_io.c +40 -32
- data/ext/r_qparser.c +15 -14
- data/ext/r_search.c +270 -152
- data/ext/r_store.c +32 -17
- data/ext/ram_store.c +38 -22
- data/ext/search.c +617 -87
- data/ext/search.h +227 -163
- data/ext/similarity.c +54 -45
- data/ext/similarity.h +3 -3
- data/ext/sort.c +132 -53
- data/ext/store.c +21 -2
- data/ext/store.h +14 -14
- data/ext/tags +4322 -232
- data/ext/term.c +140 -109
- data/ext/termdocs.c +74 -60
- data/ext/vector.c +181 -152
- data/ext/w32_io.c +150 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
- data/lib/ferret/document/field.rb +1 -1
- data/lib/ferret/index/field_infos.rb +1 -1
- data/lib/ferret/index/term.rb +1 -1
- data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
- data/lib/ferret/search.rb +1 -0
- data/lib/ferret/search/boolean_query.rb +0 -4
- data/lib/ferret/search/index_searcher.rb +21 -8
- data/lib/ferret/search/multi_phrase_query.rb +7 -0
- data/lib/ferret/search/multi_searcher.rb +261 -0
- data/lib/ferret/search/phrase_query.rb +1 -1
- data/lib/ferret/search/query.rb +34 -5
- data/lib/ferret/search/sort.rb +7 -3
- data/lib/ferret/search/sort_field.rb +8 -4
- data/lib/ferret/store/fs_store.rb +13 -6
- data/lib/ferret/store/index_io.rb +0 -14
- data/lib/ferret/store/ram_store.rb +3 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/analysis/ctc_analyzer.rb +131 -0
- data/test/unit/analysis/ctc_tokenstream.rb +98 -9
- data/test/unit/index/tc_index.rb +40 -1
- data/test/unit/index/tc_term.rb +7 -0
- data/test/unit/index/th_doc.rb +8 -0
- data/test/unit/query_parser/tc_query_parser.rb +6 -4
- data/test/unit/search/rtc_sort_field.rb +6 -6
- data/test/unit/search/tc_index_searcher.rb +8 -0
- data/test/unit/search/tc_multi_searcher.rb +275 -0
- data/test/unit/search/tc_multi_searcher2.rb +126 -0
- data/test/unit/search/tc_search_and_sort.rb +66 -0
- metadata +31 -26
- data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/r_store.c
CHANGED
@@ -6,6 +6,10 @@ VALUE cDirectory;
|
|
6
6
|
VALUE cRAMDirectory;
|
7
7
|
VALUE cFSDirectory;
|
8
8
|
|
9
|
+
|
10
|
+
static ID id_mkdir_p;
|
11
|
+
static ID id_is_directory;
|
12
|
+
|
9
13
|
/****************************************************************************
|
10
14
|
*
|
11
15
|
* Lock Methods
|
@@ -87,11 +91,10 @@ frt_lock_release(VALUE self)
|
|
87
91
|
****************************************************************************/
|
88
92
|
|
89
93
|
void
|
90
|
-
frt_dir_free(
|
94
|
+
frt_dir_free(Store *store)
|
91
95
|
{
|
92
|
-
|
93
|
-
|
94
|
-
store->close(store);
|
96
|
+
object_del(store);
|
97
|
+
store_deref(store);
|
95
98
|
}
|
96
99
|
|
97
100
|
#define GET_STORE Store *store; Data_Get_Struct(self, Store, store)
|
@@ -99,10 +102,11 @@ static VALUE
|
|
99
102
|
frt_dir_close(VALUE self)
|
100
103
|
{
|
101
104
|
/*
|
105
|
+
* No need to do anything here. Leave it to the garbage collector
|
102
106
|
GET_STORE;
|
103
107
|
Frt_Unwrap_Struct(self);
|
104
108
|
object_del(store);
|
105
|
-
|
109
|
+
store_deref(store);
|
106
110
|
*/
|
107
111
|
return Qnil;
|
108
112
|
}
|
@@ -212,11 +216,23 @@ frt_fsdir_new(VALUE klass, VALUE rpath, VALUE rcreate)
|
|
212
216
|
Store *store;
|
213
217
|
bool create = RTEST(rcreate);
|
214
218
|
rpath = rb_obj_as_string(rpath);
|
219
|
+
if (create) {
|
220
|
+
VALUE mFileUtils;
|
221
|
+
rb_require("fileutils");
|
222
|
+
mFileUtils = rb_define_module("FileUtils");
|
223
|
+
rb_funcall(mFileUtils, id_mkdir_p, 1, rpath);
|
224
|
+
}
|
225
|
+
if (!rb_funcall(rb_cFile, id_is_directory, 1, rpath)) {
|
226
|
+
rb_raise(rb_eIOError, "There is no directory: %s. Use create = true to "
|
227
|
+
"create one.", RSTRING(rpath)->ptr);
|
228
|
+
}
|
215
229
|
store = open_fs_store(RSTRING(rpath)->ptr);
|
216
230
|
if (create) store->clear_all(store);
|
217
231
|
if ((self = object_get(store)) == Qnil) {
|
218
232
|
self = Data_Wrap_Struct(klass, NULL, &frt_dir_free, store);
|
219
233
|
object_add(store, self);
|
234
|
+
} else {
|
235
|
+
store_deref(store);
|
220
236
|
}
|
221
237
|
return self;
|
222
238
|
}
|
@@ -227,19 +243,12 @@ frt_fsdir_new(VALUE klass, VALUE rpath, VALUE rcreate)
|
|
227
243
|
*
|
228
244
|
****************************************************************************/
|
229
245
|
|
230
|
-
#define DIR_METHODS(dir)\
|
231
|
-
rb_define_method(dir, "close", frt_dir_close, 0);\
|
232
|
-
rb_define_method(dir, "exists?", frt_dir_exists, 1);\
|
233
|
-
rb_define_method(dir, "touch", frt_dir_touch, 1);\
|
234
|
-
rb_define_method(dir, "delete", frt_dir_delete, 1);\
|
235
|
-
rb_define_method(dir, "file_count", frt_dir_file_count, 0);\
|
236
|
-
rb_define_method(dir, "refresh", frt_dir_refresh, 0);\
|
237
|
-
rb_define_method(dir, "rename", frt_dir_rename, 2);\
|
238
|
-
rb_define_method(dir, "make_lock", frt_dir_make_lock, 1);
|
239
|
-
|
240
246
|
void
|
241
247
|
Init_dir(void)
|
242
248
|
{
|
249
|
+
id_mkdir_p = rb_intern("mkdir_p");
|
250
|
+
id_is_directory = rb_intern("directory?");
|
251
|
+
|
243
252
|
cLock = rb_define_class_under(mStore, "Lock", rb_cObject);
|
244
253
|
rb_define_method(cLock, "obtain", frt_lock_obtain, -1);
|
245
254
|
rb_define_method(cLock, "while_locked", frt_lock_while_locked, -1);
|
@@ -248,16 +257,22 @@ Init_dir(void)
|
|
248
257
|
|
249
258
|
cDirectory = rb_define_class_under(mStore, "Directory", rb_cObject);
|
250
259
|
rb_define_const(cDirectory, "LOCK_PREFIX", rb_str_new2(LOCK_PREFIX));
|
260
|
+
rb_define_method(cDirectory, "close", frt_dir_close, 0);\
|
261
|
+
rb_define_method(cDirectory, "exists?", frt_dir_exists, 1);\
|
262
|
+
rb_define_method(cDirectory, "touch", frt_dir_touch, 1);\
|
263
|
+
rb_define_method(cDirectory, "delete", frt_dir_delete, 1);\
|
264
|
+
rb_define_method(cDirectory, "file_count", frt_dir_file_count, 0);\
|
265
|
+
rb_define_method(cDirectory, "refresh", frt_dir_refresh, 0);\
|
266
|
+
rb_define_method(cDirectory, "rename", frt_dir_rename, 2);\
|
267
|
+
rb_define_method(cDirectory, "make_lock", frt_dir_make_lock, 1);
|
251
268
|
|
252
269
|
/* RAMDirectory */
|
253
270
|
cRAMDirectory = rb_define_class_under(mStore, "RAMDirectory", cDirectory);
|
254
271
|
rb_define_alloc_func(cRAMDirectory, frt_data_alloc);
|
255
272
|
rb_define_method(cRAMDirectory, "initialize", frt_ramdir_init, -1);
|
256
|
-
DIR_METHODS(cRAMDirectory);
|
257
273
|
|
258
274
|
/* FSDirectory */
|
259
275
|
cFSDirectory = rb_define_class_under(mStore, "FSDirectory", cDirectory);
|
260
276
|
rb_define_alloc_func(cFSDirectory, frt_data_alloc);
|
261
277
|
rb_define_singleton_method(cFSDirectory, "new", frt_fsdir_new, 2);
|
262
|
-
DIR_METHODS(cFSDirectory);
|
263
278
|
}
|
data/ext/ram_store.c
CHANGED
@@ -4,12 +4,14 @@
|
|
4
4
|
static char * const RENAME_ERROR_MSG = "tried to rename a file that doesn't exist";
|
5
5
|
static char * const MISSING_RAMFILE_ERROR_MSG ="Couldn't open the ram file to read";
|
6
6
|
|
7
|
+
extern void store_destroy(Store *store);
|
8
|
+
|
7
9
|
typedef struct RamFile {
|
8
10
|
char *name;
|
9
11
|
uchar **buffers;
|
10
12
|
int bufcnt;
|
11
13
|
int len;
|
12
|
-
int
|
14
|
+
int ref_cnt;
|
13
15
|
bool alive;
|
14
16
|
} RamFile;
|
15
17
|
|
@@ -21,7 +23,7 @@ RamFile *rf_create(const char *name)
|
|
21
23
|
rf->name = estrdup(name);
|
22
24
|
rf->len = 0;
|
23
25
|
rf->bufcnt = 1;
|
24
|
-
rf->
|
26
|
+
rf->ref_cnt = 0;
|
25
27
|
rf->alive = true;
|
26
28
|
return rf;
|
27
29
|
}
|
@@ -38,7 +40,7 @@ void rf_close(void *p)
|
|
38
40
|
{
|
39
41
|
int i;
|
40
42
|
RamFile *rf = (RamFile *)p;
|
41
|
-
if (rf->
|
43
|
+
if (rf->ref_cnt > 0 || rf->alive) return;
|
42
44
|
free(rf->name);
|
43
45
|
for (i = 0; i < rf->bufcnt; i++) {
|
44
46
|
free(rf->buffers[i]);
|
@@ -76,17 +78,21 @@ int ram_remove(Store *store, char *filename)
|
|
76
78
|
int ram_rename(Store *store, char *from, char *to)
|
77
79
|
{
|
78
80
|
RamFile *rf = (RamFile *)h_rem(store->dir.ht, from, false);
|
79
|
-
|
81
|
+
RamFile *tmp;
|
82
|
+
|
83
|
+
if (rf == NULL) {
|
80
84
|
RAISE(IO_ERROR, RENAME_ERROR_MSG);
|
85
|
+
}
|
81
86
|
|
82
87
|
free(rf->name);
|
83
88
|
|
84
89
|
rf->name = estrdup(to);
|
85
90
|
|
86
|
-
|
87
|
-
|
88
|
-
if (tmp != NULL)
|
91
|
+
/* clean up the file we are overwriting */
|
92
|
+
tmp = (RamFile *)h_get(store->dir.ht, to);
|
93
|
+
if (tmp != NULL) {
|
89
94
|
tmp->alive = false;
|
95
|
+
}
|
90
96
|
|
91
97
|
h_set(store->dir.ht, rf->name, rf);
|
92
98
|
return true;
|
@@ -112,7 +118,7 @@ void ram_each(Store *store, void (*func)(char *fname, void *arg), void *arg)
|
|
112
118
|
}
|
113
119
|
}
|
114
120
|
|
115
|
-
void
|
121
|
+
void ram_close_i(Store *store)
|
116
122
|
{
|
117
123
|
HshTable *ht = store->dir.ht;
|
118
124
|
RamFile *rf;
|
@@ -185,6 +191,7 @@ int ramo_length(OutStream *os)
|
|
185
191
|
|
186
192
|
void ramo_flush_internal(OutStream *os, uchar *src, int len)
|
187
193
|
{
|
194
|
+
uchar *buffer;
|
188
195
|
RamFile *rf = (RamFile *)os->file;
|
189
196
|
int buffer_number, buffer_offset, bytes_in_buffer, bytes_to_copy;
|
190
197
|
int src_offset;
|
@@ -197,7 +204,7 @@ void ramo_flush_internal(OutStream *os, uchar *src, int len)
|
|
197
204
|
|
198
205
|
rf_extend_if_necessary(rf, buffer_number);
|
199
206
|
|
200
|
-
|
207
|
+
buffer = rf->buffers[buffer_number];
|
201
208
|
memcpy(buffer + buffer_offset, src, bytes_to_copy);
|
202
209
|
|
203
210
|
if (bytes_to_copy < len) {
|
@@ -230,7 +237,7 @@ void ramo_reset(OutStream *os)
|
|
230
237
|
void ramo_close_internal(OutStream *os)
|
231
238
|
{
|
232
239
|
RamFile *rf = (RamFile *)os->file;
|
233
|
-
rf->
|
240
|
+
rf->ref_cnt--;
|
234
241
|
rf_close(rf);
|
235
242
|
}
|
236
243
|
|
@@ -238,9 +245,12 @@ void ramo_write_to(OutStream *os, OutStream *other_o)
|
|
238
245
|
{
|
239
246
|
int i, len;
|
240
247
|
RamFile *rf = (RamFile *)os->file;
|
248
|
+
int last_buffer_number;
|
249
|
+
int last_buffer_offset;
|
250
|
+
|
241
251
|
os_flush(os);
|
242
|
-
|
243
|
-
|
252
|
+
last_buffer_number = (int)(rf->len / BUFFER_SIZE);
|
253
|
+
last_buffer_offset = rf->len % BUFFER_SIZE;
|
244
254
|
for (i = 0; i <= last_buffer_number; i++) {
|
245
255
|
len = (i == last_buffer_number ? last_buffer_offset : BUFFER_SIZE);
|
246
256
|
os_write_bytes(other_o, rf->buffers[i], len);
|
@@ -250,8 +260,9 @@ void ramo_write_to(OutStream *os, OutStream *other_o)
|
|
250
260
|
OutStream *ram_create_buffer()
|
251
261
|
{
|
252
262
|
RamFile *rf = rf_create("");
|
253
|
-
rf->alive = false;
|
254
263
|
OutStream *os = os_create();
|
264
|
+
|
265
|
+
rf->alive = false;
|
255
266
|
os->file = rf;
|
256
267
|
os->pointer = 0;
|
257
268
|
os->flush_internal = &ramo_flush_internal;
|
@@ -269,12 +280,13 @@ void ram_destroy_buffer(OutStream *os)
|
|
269
280
|
OutStream *ram_create_output(Store *store, const char *filename)
|
270
281
|
{
|
271
282
|
RamFile *rf = (RamFile *)h_get(store->dir.ht, filename);
|
283
|
+
OutStream *os = os_create();
|
284
|
+
|
272
285
|
if (rf == NULL) {
|
273
286
|
rf = rf_create(filename);
|
274
287
|
h_set(store->dir.ht, rf->name, rf);
|
275
288
|
}
|
276
|
-
rf->
|
277
|
-
OutStream *os = os_create();
|
289
|
+
rf->ref_cnt++;
|
278
290
|
os->pointer = 0;
|
279
291
|
os->file = rf;
|
280
292
|
os->flush_internal = &ramo_flush_internal;
|
@@ -325,23 +337,24 @@ void rami_seek_internal(InStream *is, int pos)
|
|
325
337
|
void rami_close_internal(InStream *is)
|
326
338
|
{
|
327
339
|
RamFile *rf = (RamFile *)is->file;
|
328
|
-
rf->
|
340
|
+
rf->ref_cnt--;
|
329
341
|
rf_close(rf);
|
330
342
|
}
|
331
343
|
|
332
344
|
void rami_clone_internal(InStream *is, InStream *new_index_i)
|
333
345
|
{
|
334
|
-
((RamFile *)is->file)->
|
346
|
+
((RamFile *)is->file)->ref_cnt++;
|
335
347
|
}
|
336
348
|
|
337
349
|
InStream *ram_open_input(Store *store, const char *filename)
|
338
350
|
{
|
339
351
|
RamFile *rf = (RamFile *)h_get(store->dir.ht, filename);
|
352
|
+
InStream *is = is_create();
|
353
|
+
|
340
354
|
if (rf == NULL) {
|
341
355
|
RAISE(IO_ERROR, MISSING_RAMFILE_ERROR_MSG);
|
342
356
|
}
|
343
|
-
rf->
|
344
|
-
InStream *is = is_create();
|
357
|
+
rf->ref_cnt++;
|
345
358
|
is->file = rf;
|
346
359
|
is->d.pointer = 0;
|
347
360
|
is->is_clone = false;
|
@@ -404,7 +417,6 @@ Store *open_ram_store()
|
|
404
417
|
new_store->remove = &ram_remove;
|
405
418
|
new_store->rename = &ram_rename;
|
406
419
|
new_store->count = &ram_count;
|
407
|
-
new_store->close = &ram_close;
|
408
420
|
new_store->clear = &ram_clear;
|
409
421
|
new_store->clear_all = &ram_clear_all;
|
410
422
|
new_store->clear_locks = &ram_clear_locks;
|
@@ -414,6 +426,7 @@ Store *open_ram_store()
|
|
414
426
|
new_store->open_input = &ram_open_input;
|
415
427
|
new_store->open_lock = &ram_open_lock;
|
416
428
|
new_store->close_lock = &ram_close_lock;
|
429
|
+
new_store->close_i = &ram_close_i;
|
417
430
|
return new_store;
|
418
431
|
}
|
419
432
|
|
@@ -427,11 +440,14 @@ static void copy_files(char *fname, void *arg)
|
|
427
440
|
OutStream *os = cfa->to_store->create_output(cfa->to_store, fname);
|
428
441
|
InStream *is = cfa->from_store->open_input(cfa->from_store, fname);
|
429
442
|
int len = is_length(is);
|
430
|
-
uchar buffer
|
443
|
+
uchar *buffer = ALLOC_N(uchar, len+1);
|
444
|
+
|
431
445
|
is_read_bytes(is, buffer, 0, len);
|
432
446
|
os_write_bytes(os, buffer, len);
|
447
|
+
|
433
448
|
is_close(is);
|
434
449
|
os_close(os);
|
450
|
+
free(buffer);
|
435
451
|
}
|
436
452
|
|
437
453
|
Store *open_ram_store_and_copy(Store *from_store, bool close_dir)
|
@@ -444,7 +460,7 @@ Store *open_ram_store_and_copy(Store *from_store, bool close_dir)
|
|
444
460
|
from_store->each(from_store, ©_files, &cfa);
|
445
461
|
|
446
462
|
if (close_dir)
|
447
|
-
|
463
|
+
store_deref(from_store);
|
448
464
|
|
449
465
|
return store;
|
450
466
|
}
|
data/ext/search.c
CHANGED
@@ -135,30 +135,34 @@ Hit *hit_pq_pop(PriorityQueue *pq)
|
|
135
135
|
|
136
136
|
inline void hit_pq_up(PriorityQueue *pq)
|
137
137
|
{
|
138
|
-
int i,j;
|
139
|
-
i = pq->count;
|
140
|
-
j = i >> 1;
|
141
138
|
Hit **heap = (Hit **)pq->heap;
|
142
|
-
Hit *node
|
139
|
+
Hit *node;
|
140
|
+
int i = pq->count;
|
141
|
+
int j = i >> 1;
|
142
|
+
node = heap[i];
|
143
143
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
144
|
+
while ((j > 0) && hit_lt(node, heap[j])) {
|
145
|
+
heap[i] = heap[j];
|
146
|
+
i = j;
|
147
|
+
j = j >> 1;
|
148
|
+
}
|
149
149
|
heap[i] = node;
|
150
150
|
}
|
151
151
|
|
152
|
-
|
153
|
-
void hit_pq_push(PriorityQueue *pq, void *elem)
|
152
|
+
void hit_pq_insert(PriorityQueue *pq, Hit *hit)
|
154
153
|
{
|
155
|
-
pq->count
|
156
|
-
|
157
|
-
|
154
|
+
if (pq->count < pq->size) {
|
155
|
+
Hit *new_hit = ALLOC(Hit);
|
156
|
+
memcpy(new_hit, hit, sizeof(Hit));
|
157
|
+
pq->count++;
|
158
|
+
pq->heap[pq->count] = new_hit;
|
159
|
+
hit_pq_up(pq);
|
160
|
+
} else if (pq->count > 0 && hit_lt((Hit *)pq->heap[1], hit)) {
|
161
|
+
memcpy(pq->heap[1], hit, sizeof(Hit));
|
162
|
+
hit_pq_down(pq);
|
163
|
+
}
|
158
164
|
}
|
159
165
|
|
160
|
-
|
161
|
-
|
162
166
|
/***************************************************************************
|
163
167
|
*
|
164
168
|
* TopDocs
|
@@ -174,9 +178,8 @@ TopDocs *td_create(int total_hits, int size, Hit **hits)
|
|
174
178
|
return td;
|
175
179
|
}
|
176
180
|
|
177
|
-
void td_destroy(
|
181
|
+
void td_destroy(TopDocs *td)
|
178
182
|
{
|
179
|
-
TopDocs *td = (TopDocs *)p;
|
180
183
|
int i;
|
181
184
|
for (i = 0; i < td->size; i++) {
|
182
185
|
free(td->hits[i]);
|
@@ -226,54 +229,78 @@ void w_normalize(Weight *self, float normalization_factor)
|
|
226
229
|
self->value = self->qweight * self->idf; // idf for document
|
227
230
|
}
|
228
231
|
|
232
|
+
void w_destroy(Weight *self)
|
233
|
+
{
|
234
|
+
q_deref(self->query);
|
235
|
+
free(self);
|
236
|
+
}
|
237
|
+
|
238
|
+
Weight *w_create(Query *query)
|
239
|
+
{
|
240
|
+
Weight *self = ALLOC_AND_ZERO_N(Weight, 1);
|
241
|
+
ref(query);
|
242
|
+
self->query = query;
|
243
|
+
|
244
|
+
self->get_query = &w_get_query;
|
245
|
+
self->get_value = &w_get_value;
|
246
|
+
self->normalize = &w_normalize;
|
247
|
+
self->destroy = &w_destroy;
|
248
|
+
return self;
|
249
|
+
}
|
250
|
+
|
229
251
|
/***************************************************************************
|
230
252
|
*
|
231
253
|
* Query
|
232
254
|
*
|
233
255
|
***************************************************************************/
|
234
256
|
|
235
|
-
Similarity *
|
257
|
+
Similarity *q_get_similarity_i(Query *self, Searcher *searcher)
|
236
258
|
{
|
237
259
|
return searcher->get_similarity(searcher);
|
238
260
|
}
|
239
261
|
|
240
262
|
Query *q_rewrite(Query *self, IndexReader *ir)
|
241
263
|
{
|
264
|
+
self->ref_cnt++;
|
242
265
|
return self;
|
243
266
|
}
|
244
267
|
|
245
268
|
Weight *q_weight(Query *self, Searcher *searcher)
|
246
269
|
{
|
247
|
-
if (self->weight) {
|
248
|
-
self->weight->destroy(self->weight);
|
249
|
-
}
|
250
270
|
Query *query = searcher->rewrite(searcher, self);
|
251
|
-
Weight *weight = query->
|
271
|
+
Weight *weight = query->create_weight_i(query, searcher);
|
252
272
|
float sum = weight->sum_of_squared_weights(weight);
|
253
273
|
Similarity *sim = query->get_similarity(query, searcher);
|
254
274
|
float norm = sim_query_norm(sim, sum);
|
275
|
+
q_deref(query);
|
255
276
|
|
256
277
|
weight->normalize(weight, norm);
|
257
278
|
return self->weight = weight;
|
258
279
|
}
|
259
280
|
|
260
|
-
|
281
|
+
Weight *q_create_weight_unsup(Query *self, Searcher *searcher)
|
282
|
+
{
|
283
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
284
|
+
return NULL;
|
285
|
+
}
|
286
|
+
|
287
|
+
void q_destroy_i(Query *self)
|
261
288
|
{
|
262
|
-
if (self->rewritten) {
|
263
|
-
self->rewritten->destroy(self->rewritten);
|
264
|
-
self->rewritten = NULL;
|
265
|
-
}
|
266
|
-
if (self->weight) {
|
267
|
-
self->weight->destroy(self->weight);
|
268
|
-
}
|
269
289
|
free(self);
|
270
290
|
}
|
271
291
|
|
272
|
-
void q_extract_terms(Query *self,
|
292
|
+
void q_extract_terms(Query *self, HashSet *terms)
|
273
293
|
{
|
274
294
|
/* do nothing by default */
|
275
295
|
}
|
276
296
|
|
297
|
+
void q_deref(Query *self)
|
298
|
+
{
|
299
|
+
if (--self->ref_cnt == 0) {
|
300
|
+
self->destroy_i(self);
|
301
|
+
}
|
302
|
+
}
|
303
|
+
|
277
304
|
Query *q_create()
|
278
305
|
{
|
279
306
|
Query *self = ALLOC(Query);
|
@@ -281,30 +308,92 @@ Query *q_create()
|
|
281
308
|
self->destroy_all = true;
|
282
309
|
self->boost = 1.0;
|
283
310
|
self->rewrite = &q_rewrite;
|
284
|
-
self->get_similarity = &
|
311
|
+
self->get_similarity = &q_get_similarity_i;
|
285
312
|
self->extract_terms = &q_extract_terms;
|
286
313
|
self->weight = NULL;
|
287
|
-
self->
|
314
|
+
self->ref_cnt = 1;
|
288
315
|
return self;
|
289
316
|
}
|
290
317
|
|
318
|
+
uint q_hash(Query *self)
|
319
|
+
{
|
320
|
+
return (self->hash(self) << 4) | self->type;
|
321
|
+
}
|
322
|
+
|
323
|
+
int q_eq(Query *self, Query *o)
|
324
|
+
{
|
325
|
+
return (self == o) || ((self->type == o->type) &&
|
326
|
+
(self->boost == o->boost) &&
|
327
|
+
self->eq(self, o));
|
328
|
+
}
|
329
|
+
|
330
|
+
Query *q_combine(Query **queries, int q_cnt)
|
331
|
+
{
|
332
|
+
int i;
|
333
|
+
Query *q, *ret_q;
|
334
|
+
HashSet *uniques =
|
335
|
+
hs_create((hash_ft)&q_hash, (eq_ft)&q_eq, NULL);
|
336
|
+
|
337
|
+
for (i = 0; i < q_cnt; i++) {
|
338
|
+
q = queries[i];
|
339
|
+
if (q->type == BOOLEAN_QUERY) {
|
340
|
+
int j;
|
341
|
+
bool splittable = true;
|
342
|
+
BooleanQuery *bq = (BooleanQuery *)q->data;
|
343
|
+
if (bq->coord_disabled == false) {
|
344
|
+
splittable = false;
|
345
|
+
} else {
|
346
|
+
for (j = 0; j < bq->clause_cnt; j++) {
|
347
|
+
if (bq->clauses[j]->occur != BC_SHOULD) {
|
348
|
+
splittable = false;
|
349
|
+
break;
|
350
|
+
}
|
351
|
+
}
|
352
|
+
}
|
353
|
+
if (splittable) {
|
354
|
+
for (j = 0; j < bq->clause_cnt; j++) {
|
355
|
+
q = bq->clauses[j]->query;
|
356
|
+
hs_add(uniques, q);
|
357
|
+
}
|
358
|
+
} else {
|
359
|
+
hs_add(uniques, q);
|
360
|
+
}
|
361
|
+
} else {
|
362
|
+
hs_add(uniques, q);
|
363
|
+
}
|
364
|
+
}
|
365
|
+
if (uniques->size == 1) {
|
366
|
+
ret_q = (Query *)uniques->elems[0];
|
367
|
+
ref(ret_q);
|
368
|
+
} else {
|
369
|
+
ret_q = bq_create(true);
|
370
|
+
for (i = 0; i < uniques->size; i++) {
|
371
|
+
q = (Query *)uniques->elems[i];
|
372
|
+
ref(q);
|
373
|
+
bq_add_query(ret_q, q, BC_SHOULD);
|
374
|
+
}
|
375
|
+
}
|
376
|
+
hs_destroy(uniques);
|
377
|
+
|
378
|
+
return ret_q;
|
379
|
+
}
|
380
|
+
|
291
381
|
/***************************************************************************
|
292
382
|
*
|
293
383
|
* Scorer
|
294
384
|
*
|
295
385
|
***************************************************************************/
|
296
386
|
|
297
|
-
void
|
387
|
+
void scorer_destroy_i(Scorer *self)
|
298
388
|
{
|
299
|
-
|
300
|
-
free(
|
301
|
-
free(scorer);
|
389
|
+
free(self->data);
|
390
|
+
free(self);
|
302
391
|
}
|
303
392
|
|
304
393
|
Scorer *scorer_create(Similarity *similarity)
|
305
394
|
{
|
306
395
|
Scorer *self = ALLOC(Scorer);
|
307
|
-
self->destroy = &
|
396
|
+
self->destroy = &scorer_destroy_i;
|
308
397
|
self->data = NULL;
|
309
398
|
self->similarity = similarity;
|
310
399
|
return self;
|
@@ -326,43 +415,58 @@ int scorer_doc_cmp(const void *p1, const void *p2)
|
|
326
415
|
{
|
327
416
|
return (*(Scorer **)p1)->doc - (*(Scorer **)p2)->doc;
|
328
417
|
}
|
418
|
+
|
329
419
|
/***************************************************************************
|
330
420
|
*
|
331
421
|
* Searcher
|
332
422
|
*
|
333
423
|
***************************************************************************/
|
334
424
|
|
335
|
-
int
|
425
|
+
static int s_doc_freq(Searcher *self, Term *term)
|
336
426
|
{
|
337
427
|
return self->ir->doc_freq(self->ir, term);
|
338
428
|
}
|
339
429
|
|
340
|
-
int *
|
430
|
+
static int *s_doc_freqs(Searcher *self, Term **terms, int tcnt)
|
341
431
|
{
|
342
|
-
int *freqs = ALLOC_N(int, tcnt);
|
343
432
|
int i;
|
433
|
+
int *freqs = ALLOC_N(int, tcnt);
|
434
|
+
|
344
435
|
for (i = 0; i < tcnt; i++) {
|
345
436
|
freqs[i] = self->ir->doc_freq(self->ir, terms[i]);
|
346
437
|
}
|
347
438
|
return freqs;
|
348
439
|
}
|
349
440
|
|
350
|
-
|
441
|
+
static int *ss_doc_freqs(Searcher *self, Term **terms, int tcnt)
|
442
|
+
{
|
443
|
+
int i;
|
444
|
+
int *freqs = ALLOC_N(int, tcnt);
|
445
|
+
|
446
|
+
for (i = 0; i < tcnt; i++) {
|
447
|
+
freqs[i] = self->doc_freq(self, terms[i]);
|
448
|
+
}
|
449
|
+
|
450
|
+
return freqs;
|
451
|
+
}
|
452
|
+
|
453
|
+
|
454
|
+
static Document *s_get_doc(Searcher *self, int doc_num)
|
351
455
|
{
|
352
456
|
return self->ir->get_doc(self->ir, doc_num);
|
353
457
|
}
|
354
458
|
|
355
|
-
int
|
459
|
+
static int s_max_doc(Searcher *self)
|
356
460
|
{
|
357
461
|
return self->ir->max_doc(self->ir);
|
358
462
|
}
|
359
463
|
|
360
|
-
Weight *
|
464
|
+
static Weight *s_create_weight(Searcher *self, Query *query)
|
361
465
|
{
|
362
466
|
return q_weight(query, self);
|
363
467
|
}
|
364
468
|
|
365
|
-
TopDocs *
|
469
|
+
static TopDocs *s_search(Searcher *self, Query *query, int first_doc,
|
366
470
|
int num_docs, Filter *filter, Sort *sort)
|
367
471
|
{
|
368
472
|
int max_size = first_doc + num_docs;
|
@@ -370,14 +474,13 @@ TopDocs *sea_search(Searcher *self, Query *query, int first_doc,
|
|
370
474
|
Weight *weight;
|
371
475
|
Scorer *scorer;
|
372
476
|
Hit **score_docs = NULL;
|
373
|
-
Hit
|
477
|
+
Hit hit;
|
374
478
|
int total_hits = 0;
|
375
|
-
float
|
479
|
+
float score;
|
376
480
|
BitVector *bits = (filter ? filter->get_bv(filter, self->ir) : NULL);
|
377
481
|
Hit *(*hq_pop)(PriorityQueue *pq);
|
378
|
-
void (*
|
379
|
-
void (*
|
380
|
-
void (*hq_destroy)(void *p);
|
482
|
+
void (*hq_insert)(PriorityQueue *pq, Hit *hit);
|
483
|
+
void (*hq_destroy)(PriorityQueue *self);
|
381
484
|
PriorityQueue *hq;
|
382
485
|
|
383
486
|
|
@@ -391,20 +494,19 @@ TopDocs *sea_search(Searcher *self, Query *query, int first_doc,
|
|
391
494
|
scorer = weight->scorer(weight, self->ir);
|
392
495
|
if (!scorer) {
|
393
496
|
if (bits) bv_destroy(bits);
|
497
|
+
weight->destroy(weight);
|
394
498
|
return td_create(0, 0, NULL);
|
395
499
|
}
|
396
500
|
|
397
501
|
if (sort) {
|
398
502
|
hq = fshq_pq_create(max_size, sort, self->ir);
|
399
503
|
hq_pop = &fshq_pq_pop;
|
400
|
-
|
401
|
-
hq_push = &fshq_pq_push;
|
504
|
+
hq_insert = &fshq_pq_insert;
|
402
505
|
hq_destroy = &fshq_pq_destroy;
|
403
506
|
} else {
|
404
507
|
hq = pq_create(max_size, &hit_less_than);
|
405
508
|
hq_pop = &hit_pq_pop;
|
406
|
-
|
407
|
-
hq_push = &hit_pq_push;
|
509
|
+
hq_insert = &hit_pq_insert;
|
408
510
|
hq_destroy = &pq_destroy;
|
409
511
|
}
|
410
512
|
|
@@ -412,19 +514,11 @@ TopDocs *sea_search(Searcher *self, Query *query, int first_doc,
|
|
412
514
|
if (bits && !bv_get(bits, scorer->doc)) continue;
|
413
515
|
total_hits++;
|
414
516
|
score = scorer->score(scorer);
|
415
|
-
|
416
|
-
|
417
|
-
hit->doc = scorer->doc; hit->score = score;
|
418
|
-
hq_push(hq, hit);
|
419
|
-
min_score = ((Hit *)pq_top(hq))->score; // maintain min_score
|
420
|
-
} else if (score > min_score) {
|
421
|
-
hit = pq_top(hq);
|
422
|
-
hit->doc = scorer->doc; hit->score = score;
|
423
|
-
hq_down(hq);
|
424
|
-
min_score = ((Hit *)pq_top(hq))->score; // maintain min_score
|
425
|
-
}
|
517
|
+
hit.doc = scorer->doc; hit.score = score;
|
518
|
+
hq_insert(hq, &hit);
|
426
519
|
}
|
427
520
|
scorer->destroy(scorer);
|
521
|
+
weight->destroy(weight);
|
428
522
|
|
429
523
|
if (hq->count > first_doc) {
|
430
524
|
if ((hq->count - first_doc) < num_docs) {
|
@@ -446,14 +540,12 @@ TopDocs *sea_search(Searcher *self, Query *query, int first_doc,
|
|
446
540
|
return td_create(total_hits, num_docs, score_docs);
|
447
541
|
}
|
448
542
|
|
449
|
-
void
|
450
|
-
void (*fn)(Searcher
|
543
|
+
static void s_search_each_w(Searcher *self, Weight *weight, Filter *filter,
|
544
|
+
void (*fn)(Searcher *, int, float, void *), void *arg)
|
451
545
|
{
|
452
|
-
Weight *weight;
|
453
546
|
Scorer *scorer;
|
454
547
|
BitVector *bits = (filter ? filter->get_bv(filter, self->ir) : NULL);
|
455
548
|
|
456
|
-
weight = q_weight(query, self);
|
457
549
|
scorer = weight->scorer(weight, self->ir);
|
458
550
|
if (!scorer) {
|
459
551
|
if (bits) bv_destroy(bits);
|
@@ -462,34 +554,53 @@ void sea_search_each(Searcher *self, Query *query, Filter *filter,
|
|
462
554
|
|
463
555
|
while (scorer->next(scorer)) {
|
464
556
|
if (bits && !bv_get(bits, scorer->doc)) continue;
|
465
|
-
fn(self, scorer->doc, arg);
|
557
|
+
fn(self, scorer->doc, scorer->score(scorer), arg);
|
466
558
|
}
|
467
559
|
scorer->destroy(scorer);
|
468
560
|
}
|
469
561
|
|
470
|
-
|
562
|
+
static void s_search_each(Searcher *self, Query *query, Filter *filter,
|
563
|
+
void (*fn)(Searcher *, int, float, void *), void *arg)
|
471
564
|
{
|
565
|
+
Weight *weight;
|
566
|
+
weight = q_weight(query, self);
|
567
|
+
s_search_each_w(self, weight, filter, fn, arg);
|
568
|
+
weight->destroy(weight);
|
569
|
+
}
|
570
|
+
|
571
|
+
static Query *s_rewrite(Searcher *self, Query *original)
|
572
|
+
{
|
573
|
+
int q_is_destroyed = false;
|
472
574
|
Query *query = original;
|
473
575
|
Query *rewritten_query = query->rewrite(query, self->ir);
|
474
|
-
while (query != rewritten_query) {
|
576
|
+
while (q_is_destroyed || (query != rewritten_query)) {
|
475
577
|
query = rewritten_query;
|
476
578
|
rewritten_query = query->rewrite(query, self->ir);
|
579
|
+
q_is_destroyed = (query->ref_cnt <= 1);
|
580
|
+
q_deref(query); /* destroy intermediate queries */
|
477
581
|
}
|
478
582
|
return query;
|
479
583
|
}
|
480
584
|
|
481
|
-
Explanation *
|
585
|
+
static Explanation *s_explain(Searcher *self, Query *query, int doc_num)
|
482
586
|
{
|
483
587
|
Weight *weight = q_weight(query, self);
|
484
|
-
|
588
|
+
Explanation *e = weight->explain(weight, self->ir, doc_num);
|
589
|
+
weight->destroy(weight);
|
590
|
+
return e;
|
591
|
+
}
|
592
|
+
|
593
|
+
static Explanation *s_explain_w(Searcher *self, Weight *w, int doc_num)
|
594
|
+
{
|
595
|
+
return w->explain(w, self->ir, doc_num);
|
485
596
|
}
|
486
597
|
|
487
|
-
Similarity *
|
598
|
+
static Similarity *s_get_similarity(Searcher *self)
|
488
599
|
{
|
489
600
|
return self->similarity;
|
490
601
|
}
|
491
602
|
|
492
|
-
void
|
603
|
+
static void s_close(Searcher *self)
|
493
604
|
{
|
494
605
|
if (self->ir && self->close_ir)
|
495
606
|
ir_close(self->ir);
|
@@ -502,17 +613,436 @@ Searcher *sea_create(IndexReader *ir)
|
|
502
613
|
self->ir = ir;
|
503
614
|
self->close_ir = true;
|
504
615
|
self->similarity = sim_create_default();
|
505
|
-
self->doc_freq = &
|
506
|
-
self->doc_freqs = &
|
507
|
-
self->get_doc = &
|
508
|
-
self->max_doc = &
|
509
|
-
self->create_weight = &
|
510
|
-
self->search = &
|
511
|
-
self->
|
512
|
-
self->
|
513
|
-
self->
|
514
|
-
self->
|
616
|
+
self->doc_freq = &s_doc_freq;
|
617
|
+
self->doc_freqs = &s_doc_freqs;
|
618
|
+
self->get_doc = &s_get_doc;
|
619
|
+
self->max_doc = &s_max_doc;
|
620
|
+
self->create_weight = &s_create_weight;
|
621
|
+
self->search = &s_search;
|
622
|
+
self->search_each = &s_search_each;
|
623
|
+
self->search_each_w = &s_search_each_w;
|
624
|
+
self->rewrite = &s_rewrite;
|
625
|
+
self->explain = &s_explain;
|
626
|
+
self->explain_w = &s_explain_w;
|
627
|
+
self->get_similarity = &s_get_similarity;
|
628
|
+
self->close = &s_close;
|
629
|
+
return self;
|
630
|
+
}
|
631
|
+
|
632
|
+
/***************************************************************************
|
633
|
+
*
|
634
|
+
* CachedDFSearcher
|
635
|
+
*
|
636
|
+
***************************************************************************/
|
637
|
+
|
638
|
+
typedef struct CachedDFSearcher {
|
639
|
+
HshTable *df_map;
|
640
|
+
int max_doc;
|
641
|
+
} CachedDFSearcher;
|
642
|
+
|
643
|
+
static int cdfsea_doc_freq(Searcher *self, Term *term)
|
644
|
+
{
|
645
|
+
CachedDFSearcher *cdfsea = (CachedDFSearcher *)self->data;
|
646
|
+
return (int)h_get(cdfsea->df_map, term);
|
647
|
+
}
|
648
|
+
|
649
|
+
static Document *cdfsea_get_doc(Searcher *self, int doc_num)
|
650
|
+
{
|
651
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
652
|
+
return NULL;
|
653
|
+
}
|
654
|
+
|
655
|
+
/*
 * Return the max_doc value that was handed to cdfsea_create().
 */
static int cdfsea_max_doc(Searcher *self)
{
    CachedDFSearcher *state = (CachedDFSearcher *)self->data;

    return state->max_doc;
}
|
659
|
+
|
660
|
+
/*
 * Not supported by a CachedDFSearcher: always raises UNSUPPORTED_ERROR.
 * The trailing return only satisfies the function's return type.
 */
static Weight *cdfsea_create_weight(Searcher *self, Query *query)
{
    (void)self;
    (void)query;

    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
    return NULL;
}
|
665
|
+
|
666
|
+
/*
 * Not supported by a CachedDFSearcher: always raises UNSUPPORTED_ERROR.
 * The trailing return only satisfies the function's return type.
 */
static TopDocs *cdfsea_search(Searcher *self, Query *query, int first_doc,
                              int num_docs, Filter *filter, Sort *sort)
{
    (void)self;
    (void)query;
    (void)first_doc;
    (void)num_docs;
    (void)filter;
    (void)sort;

    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
    return NULL;
}
|
672
|
+
|
673
|
+
static void cdfsea_search_each(Searcher *self, Query *query, Filter *filter,
|
674
|
+
void (*fn)(Searcher *, int, float, void *), void *arg)
|
675
|
+
{
|
676
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
677
|
+
}
|
678
|
+
|
679
|
+
static void cdfsea_search_each_w(Searcher *self, Weight *w, Filter *filter,
|
680
|
+
void (*fn)(Searcher *, int, float, void *), void *arg)
|
681
|
+
{
|
682
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
683
|
+
}
|
684
|
+
|
685
|
+
/*
 * "Rewrite" for a CachedDFSearcher is the identity: the same query object is
 * handed back with its reference count bumped by one, so the caller holds an
 * extra reference to +original+.
 */
static Query *cdfsea_rewrite(Searcher *self, Query *original)
{
    (void)self;

    original->ref_cnt += 1;
    return original;
}
|
690
|
+
|
691
|
+
/*
 * Not supported by a CachedDFSearcher: always raises UNSUPPORTED_ERROR.
 * The trailing return only satisfies the function's return type.
 */
static Explanation *cdfsea_explain(Searcher *self, Query *query, int doc_num)
{
    (void)self;
    (void)query;
    (void)doc_num;

    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
    return NULL;
}
|
696
|
+
|
697
|
+
/*
 * Not supported by a CachedDFSearcher: always raises UNSUPPORTED_ERROR.
 * The trailing return only satisfies the function's return type.
 */
static Explanation *cdfsea_explain_w(Searcher *self, Weight *w, int doc_num)
{
    (void)self;
    (void)w;
    (void)doc_num;

    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
    return NULL;
}
|
702
|
+
|
703
|
+
/*
 * Not supported by a CachedDFSearcher: always raises UNSUPPORTED_ERROR.
 * The trailing return only satisfies the function's return type.
 */
static Similarity *cdfsea_get_similarity(Searcher *self)
{
    (void)self;

    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
    return NULL;
}
|
708
|
+
|
709
|
+
/*
 * Tear down a CachedDFSearcher: destroy the document-frequency table, then
 * release the private state and the Searcher itself.
 */
static void cdfsea_close(Searcher *self)
{
    CachedDFSearcher *state = (CachedDFSearcher *)self->data;

    h_destroy(state->df_map);
    free(state);
    free(self);
}
|
716
|
+
|
717
|
+
/*
 * Build a Searcher that answers doc_freq() from +df_map+ and max_doc() from
 * +max_doc+. Every other operation except rewrite() raises
 * UNSUPPORTED_ERROR.
 *
 * The table is destroyed when the returned searcher is closed (see
 * cdfsea_close()), so the caller hands over ownership of +df_map+.
 */
Searcher *cdfsea_create(HshTable *df_map, int max_doc)
{
    Searcher *self = ALLOC(Searcher);
    CachedDFSearcher *state = ALLOC(CachedDFSearcher);

    /* private state */
    state->df_map  = df_map;
    state->max_doc = max_doc;
    self->data     = state;

    /* method table */
    self->doc_freq       = cdfsea_doc_freq;
    self->doc_freqs      = ss_doc_freqs;
    self->get_doc        = cdfsea_get_doc;
    self->max_doc        = cdfsea_max_doc;
    self->create_weight  = cdfsea_create_weight;
    self->search         = cdfsea_search;
    self->search_each    = cdfsea_search_each;
    self->search_each_w  = cdfsea_search_each_w;
    self->rewrite        = cdfsea_rewrite;
    self->explain        = cdfsea_explain;
    self->explain_w      = cdfsea_explain_w;
    self->get_similarity = cdfsea_get_similarity;
    self->close          = cdfsea_close;

    return self;
}
|
517
742
|
|
743
|
+
/***************************************************************************
|
744
|
+
*
|
745
|
+
* MultiSearcher
|
746
|
+
*
|
747
|
+
***************************************************************************/
|
748
|
+
|
749
|
+
/*
 * Map the document number +n+ to an index into the sub-searcher array.
 *
 * Binary-searches the first s_cnt entries of the starts array. On an exact
 * match the index is advanced past any following entries holding the same
 * start value, so the last of them is returned. Without an exact match the
 * index of the last entry smaller than +n+ is returned; that is -1 when +n+
 * is smaller than every entry.
 */
static inline int msea_get_searcher_index(Searcher *self, int n)
{
    MultiSearcher *msea = (MultiSearcher *)self->data;
    const int *starts = msea->starts;
    const int count = msea->s_cnt;
    int low = 0;
    int high = count - 1;

    while (low <= high) {
        int probe = (low + high) >> 1;
        const int start = starts[probe];

        if (start == n) {
            /* scan forward to the last entry sharing this start value */
            while ((probe + 1 < count) && (starts[probe + 1] == start)) {
                probe++;
            }
            return probe;
        }

        if (n < start) {
            high = probe - 1;
        } else {
            low = probe + 1;
        }
    }

    return high;
}
|
772
|
+
|
773
|
+
/*
 * Document frequency of +term+ across the whole MultiSearcher: the sum of
 * the frequencies reported by each sub-searcher.
 */
static int msea_doc_freq(Searcher *self, Term *term)
{
    MultiSearcher *msea = (MultiSearcher *)self->data;
    int total = 0;
    int idx = 0;

    while (idx < msea->s_cnt) {
        Searcher *sub = msea->searchers[idx];

        total += sub->doc_freq(sub, term);
        idx++;
    }

    return total;
}
|
786
|
+
|
787
|
+
/*
 * Fetch document +doc_num+ by locating the sub-searcher that covers it and
 * asking that searcher for the document at the number relative to its own
 * start offset.
 */
static Document *msea_get_doc(Searcher *self, int doc_num)
{
    MultiSearcher *msea = (MultiSearcher *)self->data;
    const int idx = msea_get_searcher_index(self, doc_num);
    Searcher *sub = msea->searchers[idx];
    const int local_doc = doc_num - msea->starts[idx];

    return sub->get_doc(sub, local_doc);
}
|
794
|
+
|
795
|
+
/*
 * Return the combined max_doc computed when the MultiSearcher was created.
 */
static int msea_max_doc(Searcher *self)
{
    MultiSearcher *msea = (MultiSearcher *)self->data;

    return msea->max_doc;
}
|
799
|
+
|
800
|
+
/*
 * Create the Weight for +query+ using document frequencies gathered from
 * this MultiSearcher as a whole.
 *
 * The query is rewritten, its terms are extracted, and the frequency of
 * each term is stored in a hash table. A temporary CachedDFSearcher built
 * on that table (and on this searcher's max_doc) is then used to compute
 * the weight. The temporary searcher is closed before returning, which also
 * destroys the table.
 */
static Weight *msea_create_weight(Searcher *self, Query *query)
{
    MultiSearcher *msea = (MultiSearcher *)self->data;
    HshTable *df_map = h_new((hash_ft)&term_hash, (eq_ft)&term_eq,
                             (free_ft)NULL, (free_ft)NULL);
    Query *rewritten = self->rewrite(self, query);
    HashSet *terms = term_set_create();
    Searcher *df_source;
    Weight *weight;
    int *freqs;
    int t;

    rewritten->extract_terms(rewritten, terms);
    freqs = self->doc_freqs(self, (Term **)terms->elems, terms->size);

    /* the frequency is stored in the value slot itself, not behind a pointer */
    for (t = 0; t < terms->size; t++) {
        h_set(df_map, terms->elems[t], (void *)freqs[t]);
    }

    /* don't destroy the individual terms, only the HashSet */
    hs_destroy(terms);
    free(freqs);

    df_source = cdfsea_create(df_map, msea->max_doc);
    weight = q_weight(rewritten, df_source);

    q_deref(rewritten);
    df_source->close(df_source);

    return weight;
}
|
827
|
+
|
828
|
+
struct MultiSearchEachArg {
|
829
|
+
int start;
|
830
|
+
void *arg;
|
831
|
+
void (*fn)(Searcher *, int, float, void *);
|
832
|
+
};
|
833
|
+
|
834
|
+
/*
 * Per-hit trampoline used by msea_search_each_w(): shifts +doc_num+ by the
 * start offset stored in the context and forwards the hit to the wrapped
 * callback together with its original argument.
 */
void msea_search_each_i(Searcher *self, int doc_num, float score, void *arg)
{
    struct MultiSearchEachArg *ctx = (struct MultiSearchEachArg *)arg;
    const int shifted_doc = doc_num + ctx->start;

    ctx->fn(self, shifted_doc, score, ctx->arg);
}
|
840
|
+
|
841
|
+
static void msea_search_each_w(Searcher *self, Weight *w, Filter *filter,
|
842
|
+
void (*fn)(Searcher *, int, float, void *), void *arg)
|
843
|
+
{
|
844
|
+
int i;
|
845
|
+
struct MultiSearchEachArg mse_arg;
|
846
|
+
MultiSearcher *msea = (MultiSearcher *)self->data;
|
847
|
+
Searcher *s;
|
848
|
+
|
849
|
+
mse_arg.fn = fn;
|
850
|
+
mse_arg.arg = arg;
|
851
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
852
|
+
s = msea->searchers[i];
|
853
|
+
mse_arg.start = msea->starts[i];
|
854
|
+
s->search_each_w(s, w, filter, &msea_search_each_i, &mse_arg);
|
855
|
+
}
|
856
|
+
}
|
518
857
|
|
858
|
+
static void msea_search_each(Searcher *self, Query *query, Filter *filter,
|
859
|
+
void (*fn)(Searcher *, int, float, void *), void *arg)
|
860
|
+
{
|
861
|
+
Weight *w = q_weight(query, self);
|
862
|
+
msea_search_each_w(self, w, filter, fn, arg);
|
863
|
+
w->destroy(w);
|
864
|
+
}
|
865
|
+
|
866
|
+
struct MultiSearchArg {
|
867
|
+
int total_hits, max_size;
|
868
|
+
float min_score;
|
869
|
+
PriorityQueue *hq;
|
870
|
+
void (*hq_insert)(PriorityQueue *pq, Hit *hit);
|
871
|
+
};
|
872
|
+
|
873
|
+
/*
 * Per-hit callback used by msea_search(): counts the hit and offers it to
 * the priority queue through the insert routine stored in the context.
 */
void msea_search_i(Searcher *self, int doc_num, float score, void *arg)
{
    struct MultiSearchArg *ctx = (struct MultiSearchArg *)arg;
    Hit hit;

    (void)self;

    hit.doc = doc_num;
    hit.score = score;

    ctx->total_hits += 1;
    ctx->hq_insert(ctx->hq, &hit);
}
|
883
|
+
|
884
|
+
/*
 * Search every sub-searcher and return the hits ranked first_doc through
 * first_doc + num_docs - 1.
 *
 * self      - the MultiSearcher
 * query     - query to run; its weight is created and destroyed here
 * first_doc - rank of the first hit to return; must be >= 0
 * num_docs  - maximum number of hits to return; must be > 0
 * filter    - optional filter, forwarded unchanged to each sub-searcher
 * sort      - optional sort; selects the field-sorted queue when non-NULL
 *
 * Raises ARG_ERROR when num_docs <= 0 or first_doc < 0.
 * Returns a TopDocs built from the total hit count and the selected hits.
 */
static TopDocs *msea_search(Searcher *self, Query *query, int first_doc,
                            int num_docs, Filter *filter, Sort *sort)
{
    int max_size = first_doc + num_docs;
    int i;
    Weight *weight;
    Hit **score_docs = NULL;
    Hit *(*hq_pop)(PriorityQueue *pq);
    void (*hq_insert)(PriorityQueue *pq, Hit *hit);
    void (*hq_destroy)(PriorityQueue *self);
    PriorityQueue *hq;
    struct MultiSearchArg ms_arg;

    if (num_docs <= 0) {
        RAISE(ARG_ERROR, NUM_DOCS_ARG_ERROR_MSG);
    }

    if (first_doc < 0) {
        RAISE(ARG_ERROR, FIRST_DOC_ARG_ERROR_MSG);
    }

    /*
     * The filter is not evaluated here. A MultiSearcher has no IndexReader
     * of its own (msea_create() sets self->ir to NULL), and each
     * sub-searcher receives the filter through msea_search_each_w() below.
     */

    weight = q_weight(query, self);
    if (sort) {
        /* NOTE(review): self->ir is NULL for a MultiSearcher; confirm that
         * fshq_pq_create() tolerates a NULL reader. */
        hq = fshq_pq_create(max_size, sort, self->ir);
        hq_pop = &fshq_pq_pop;
        hq_insert = &fshq_pq_insert;
        hq_destroy = &fshq_pq_destroy;
    } else {
        hq = pq_create(max_size, &hit_less_than);
        hq_pop = &hit_pq_pop;
        hq_insert = &hit_pq_insert;
        hq_destroy = &pq_destroy;
    }

    ms_arg.hq = hq;
    ms_arg.total_hits = 0;
    ms_arg.max_size = max_size;
    ms_arg.min_score = 0.0;
    ms_arg.hq_insert = hq_insert;

    msea_search_each_w(self, weight, filter, msea_search_i, &ms_arg);

    weight->destroy(weight);

    if (hq->count > first_doc) {
        if ((hq->count - first_doc) < num_docs) {
            num_docs = hq->count - first_doc;
        }
        score_docs = ALLOC_N(Hit *, num_docs);
        /* the result array is filled from its last slot backwards */
        for (i = num_docs - 1; i >= 0; i--) {
            score_docs[i] = hq_pop(hq);
        }
    } else {
        num_docs = 0;
    }
    pq_clear(hq);
    hq_destroy(hq);

    return td_create(ms_arg.total_hits, num_docs, score_docs);
}
|
948
|
+
|
949
|
+
/*
 * Rewrite +original+ against every sub-searcher and merge the results with
 * q_combine(). The per-searcher rewrites are dereferenced and the temporary
 * array is freed before the combined query is returned.
 */
static Query *msea_rewrite(Searcher *self, Query *original)
{
    MultiSearcher *msea = (MultiSearcher *)self->data;
    Query **parts = ALLOC_N(Query *, msea->s_cnt);
    Query *combined;
    int idx;

    for (idx = 0; idx < msea->s_cnt; idx++) {
        Searcher *sub = msea->searchers[idx];

        parts[idx] = sub->rewrite(sub, original);
    }

    combined = q_combine(parts, msea->s_cnt);

    for (idx = 0; idx < msea->s_cnt; idx++) {
        q_deref(parts[idx]);
    }
    free(parts);

    return combined;
}
|
968
|
+
|
969
|
+
/*
 * Explain the score of +doc_num+ for +query+.
 *
 * The weight is created against the MultiSearcher itself, then the
 * explanation is requested from the sub-searcher covering +doc_num+ using
 * the document number relative to that searcher's start offset. The weight
 * is destroyed before returning.
 */
static Explanation *msea_explain(Searcher *self, Query *query, int doc_num)
{
    MultiSearcher *msea = (MultiSearcher *)self->data;
    const int idx = msea_get_searcher_index(self, doc_num);
    Weight *weight = q_weight(query, self);
    Searcher *sub = msea->searchers[idx];
    Explanation *expl;

    expl = sub->explain_w(sub, weight, doc_num - msea->starts[idx]);
    weight->destroy(weight);

    return expl;
}
|
979
|
+
|
980
|
+
/*
 * Explain the score of +doc_num+ for an already-built weight by delegating
 * to the sub-searcher covering that document, using the document number
 * relative to that searcher's start offset.
 */
static Explanation *msea_explain_w(Searcher *self, Weight *w, int doc_num)
{
    MultiSearcher *msea = (MultiSearcher *)self->data;
    const int idx = msea_get_searcher_index(self, doc_num);
    Searcher *sub = msea->searchers[idx];
    const int local_doc = doc_num - msea->starts[idx];

    return sub->explain_w(sub, w, local_doc);
}
|
988
|
+
|
989
|
+
/*
 * Return the Similarity stored on the MultiSearcher.
 */
static Similarity *msea_get_similarity(Searcher *self)
{
    Similarity *sim = self->similarity;

    return sim;
}
|
993
|
+
|
994
|
+
/*
 * Release a MultiSearcher.
 *
 * When close_subs is set, every sub-searcher is closed and the searchers
 * array is freed as well; otherwise both are left untouched. The starts
 * array, the private state and the Searcher itself are always freed.
 */
static void msea_close(Searcher *self)
{
    MultiSearcher *msea = (MultiSearcher *)self->data;

    if (msea->close_subs) {
        int idx;

        for (idx = 0; idx < msea->s_cnt; idx++) {
            Searcher *sub = msea->searchers[idx];

            sub->close(sub);
        }
        free(msea->searchers);
    }

    free(msea->starts);
    free(msea);
    free(self);
}
|
1010
|
+
|
1011
|
+
/*
 * Build a Searcher that spans +s_cnt+ sub-searchers.
 *
 * searchers  - array of sub-searchers; the array pointer is stored, not
 *              copied
 * s_cnt      - number of entries in +searchers+
 * close_subs - when true, closing the MultiSearcher also closes the
 *              sub-searchers and frees the +searchers+ array
 *
 * A starts array with s_cnt + 1 entries is computed: entry i is the sum of
 * max_doc() over the sub-searchers before i, and the final entry is the
 * overall total, which is also stored as the MultiSearcher's max_doc.
 */
Searcher *msea_create(Searcher **searchers, int s_cnt, bool close_subs)
{
    Searcher *self = ALLOC(Searcher);
    MultiSearcher *msea = ALLOC(MultiSearcher);
    int *starts = ALLOC_N(int, s_cnt + 1);
    int doc_total = 0;
    int idx;

    for (idx = 0; idx < s_cnt; idx++) {
        Searcher *sub = searchers[idx];

        starts[idx] = doc_total;
        doc_total += sub->max_doc(sub);
    }
    /* one extra slot holding the grand total */
    starts[idx] = doc_total;

    /* private state */
    msea->s_cnt      = s_cnt;
    msea->searchers  = searchers;
    msea->starts     = starts;
    msea->max_doc    = doc_total;
    msea->close_subs = close_subs;
    self->data       = msea;

    /* a MultiSearcher has no IndexReader of its own */
    self->ir         = (IndexReader *)NULL;
    self->similarity = sim_create_default();

    /* method table */
    self->doc_freq       = msea_doc_freq;
    self->doc_freqs      = ss_doc_freqs;
    self->get_doc        = msea_get_doc;
    self->max_doc        = msea_max_doc;
    self->create_weight  = msea_create_weight;
    self->search         = msea_search;
    self->search_each    = msea_search_each;
    self->search_each_w  = msea_search_each_w;
    self->rewrite        = msea_rewrite;
    self->explain        = msea_explain;
    self->explain_w      = msea_explain_w;
    self->get_similarity = msea_get_similarity;
    self->close          = msea_close;

    return self;
}
|