ferret 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +6 -5
- data/Rakefile +34 -13
- data/TODO +1 -0
- data/TUTORIAL +1 -1
- data/ext/analysis.c +87 -70
- data/ext/analysis.h +18 -6
- data/ext/array.c +1 -2
- data/ext/array.h +1 -1
- data/ext/bitvector.c +10 -6
- data/ext/bitvector.h +2 -2
- data/ext/compound_io.c +30 -27
- data/ext/document.c +15 -15
- data/ext/document.h +5 -5
- data/ext/except.c +2 -0
- data/ext/except.h +25 -23
- data/ext/extconf.rb +1 -0
- data/ext/ferret.c +10 -8
- data/ext/ferret.h +9 -8
- data/ext/field.c +29 -25
- data/ext/filter.c +52 -14
- data/ext/frtio.h +13 -0
- data/ext/fs_store.c +115 -170
- data/ext/global.c +9 -8
- data/ext/global.h +17 -13
- data/ext/hash.c +13 -19
- data/ext/hash.h +11 -11
- data/ext/hashset.c +5 -7
- data/ext/hashset.h +9 -8
- data/ext/helper.c +1 -1
- data/ext/helper.h +2 -1
- data/ext/inc/except.h +25 -23
- data/ext/inc/lang.h +11 -1
- data/ext/ind.c +33 -21
- data/ext/index.h +44 -39
- data/ext/index_io.c +61 -57
- data/ext/index_rw.c +418 -361
- data/ext/lang.c +10 -0
- data/ext/lang.h +11 -1
- data/ext/nix_io.c +135 -0
- data/ext/priorityqueue.c +16 -16
- data/ext/priorityqueue.h +9 -6
- data/ext/q_boolean.c +128 -76
- data/ext/q_const_score.c +20 -20
- data/ext/q_filtered_query.c +20 -20
- data/ext/q_fuzzy.c +37 -23
- data/ext/q_match_all.c +15 -19
- data/ext/q_multi_phrase.c +87 -46
- data/ext/q_parser.c +247 -119
- data/ext/q_phrase.c +86 -52
- data/ext/q_prefix.c +25 -14
- data/ext/q_range.c +59 -14
- data/ext/q_span.c +263 -172
- data/ext/q_term.c +62 -51
- data/ext/q_wildcard.c +24 -13
- data/ext/r_analysis.c +328 -80
- data/ext/r_doc.c +11 -6
- data/ext/r_index_io.c +40 -32
- data/ext/r_qparser.c +15 -14
- data/ext/r_search.c +270 -152
- data/ext/r_store.c +32 -17
- data/ext/ram_store.c +38 -22
- data/ext/search.c +617 -87
- data/ext/search.h +227 -163
- data/ext/similarity.c +54 -45
- data/ext/similarity.h +3 -3
- data/ext/sort.c +132 -53
- data/ext/store.c +21 -2
- data/ext/store.h +14 -14
- data/ext/tags +4322 -232
- data/ext/term.c +140 -109
- data/ext/termdocs.c +74 -60
- data/ext/vector.c +181 -152
- data/ext/w32_io.c +150 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
- data/lib/ferret/document/field.rb +1 -1
- data/lib/ferret/index/field_infos.rb +1 -1
- data/lib/ferret/index/term.rb +1 -1
- data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
- data/lib/ferret/search.rb +1 -0
- data/lib/ferret/search/boolean_query.rb +0 -4
- data/lib/ferret/search/index_searcher.rb +21 -8
- data/lib/ferret/search/multi_phrase_query.rb +7 -0
- data/lib/ferret/search/multi_searcher.rb +261 -0
- data/lib/ferret/search/phrase_query.rb +1 -1
- data/lib/ferret/search/query.rb +34 -5
- data/lib/ferret/search/sort.rb +7 -3
- data/lib/ferret/search/sort_field.rb +8 -4
- data/lib/ferret/store/fs_store.rb +13 -6
- data/lib/ferret/store/index_io.rb +0 -14
- data/lib/ferret/store/ram_store.rb +3 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/analysis/ctc_analyzer.rb +131 -0
- data/test/unit/analysis/ctc_tokenstream.rb +98 -9
- data/test/unit/index/tc_index.rb +40 -1
- data/test/unit/index/tc_term.rb +7 -0
- data/test/unit/index/th_doc.rb +8 -0
- data/test/unit/query_parser/tc_query_parser.rb +6 -4
- data/test/unit/search/rtc_sort_field.rb +6 -6
- data/test/unit/search/tc_index_searcher.rb +8 -0
- data/test/unit/search/tc_multi_searcher.rb +275 -0
- data/test/unit/search/tc_multi_searcher2.rb +126 -0
- data/test/unit/search/tc_search_and_sort.rb +66 -0
- metadata +31 -26
- data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/r_store.c
CHANGED
@@ -6,6 +6,10 @@ VALUE cDirectory;
|
|
6
6
|
VALUE cRAMDirectory;
|
7
7
|
VALUE cFSDirectory;
|
8
8
|
|
9
|
+
|
10
|
+
static ID id_mkdir_p;
|
11
|
+
static ID id_is_directory;
|
12
|
+
|
9
13
|
/****************************************************************************
|
10
14
|
*
|
11
15
|
* Lock Methods
|
@@ -87,11 +91,10 @@ frt_lock_release(VALUE self)
|
|
87
91
|
****************************************************************************/
|
88
92
|
|
89
93
|
void
|
90
|
-
frt_dir_free(
|
94
|
+
frt_dir_free(Store *store)
|
91
95
|
{
|
92
|
-
|
93
|
-
|
94
|
-
store->close(store);
|
96
|
+
object_del(store);
|
97
|
+
store_deref(store);
|
95
98
|
}
|
96
99
|
|
97
100
|
#define GET_STORE Store *store; Data_Get_Struct(self, Store, store)
|
@@ -99,10 +102,11 @@ static VALUE
|
|
99
102
|
frt_dir_close(VALUE self)
|
100
103
|
{
|
101
104
|
/*
|
105
|
+
* No need to do anything here. Leave it to the garbage collector
|
102
106
|
GET_STORE;
|
103
107
|
Frt_Unwrap_Struct(self);
|
104
108
|
object_del(store);
|
105
|
-
|
109
|
+
store_deref(store);
|
106
110
|
*/
|
107
111
|
return Qnil;
|
108
112
|
}
|
@@ -212,11 +216,23 @@ frt_fsdir_new(VALUE klass, VALUE rpath, VALUE rcreate)
|
|
212
216
|
Store *store;
|
213
217
|
bool create = RTEST(rcreate);
|
214
218
|
rpath = rb_obj_as_string(rpath);
|
219
|
+
if (create) {
|
220
|
+
VALUE mFileUtils;
|
221
|
+
rb_require("fileutils");
|
222
|
+
mFileUtils = rb_define_module("FileUtils");
|
223
|
+
rb_funcall(mFileUtils, id_mkdir_p, 1, rpath);
|
224
|
+
}
|
225
|
+
if (!rb_funcall(rb_cFile, id_is_directory, 1, rpath)) {
|
226
|
+
rb_raise(rb_eIOError, "There is no directory: %s. Use create = true to "
|
227
|
+
"create one.", RSTRING(rpath)->ptr);
|
228
|
+
}
|
215
229
|
store = open_fs_store(RSTRING(rpath)->ptr);
|
216
230
|
if (create) store->clear_all(store);
|
217
231
|
if ((self = object_get(store)) == Qnil) {
|
218
232
|
self = Data_Wrap_Struct(klass, NULL, &frt_dir_free, store);
|
219
233
|
object_add(store, self);
|
234
|
+
} else {
|
235
|
+
store_deref(store);
|
220
236
|
}
|
221
237
|
return self;
|
222
238
|
}
|
@@ -227,19 +243,12 @@ frt_fsdir_new(VALUE klass, VALUE rpath, VALUE rcreate)
|
|
227
243
|
*
|
228
244
|
****************************************************************************/
|
229
245
|
|
230
|
-
#define DIR_METHODS(dir)\
|
231
|
-
rb_define_method(dir, "close", frt_dir_close, 0);\
|
232
|
-
rb_define_method(dir, "exists?", frt_dir_exists, 1);\
|
233
|
-
rb_define_method(dir, "touch", frt_dir_touch, 1);\
|
234
|
-
rb_define_method(dir, "delete", frt_dir_delete, 1);\
|
235
|
-
rb_define_method(dir, "file_count", frt_dir_file_count, 0);\
|
236
|
-
rb_define_method(dir, "refresh", frt_dir_refresh, 0);\
|
237
|
-
rb_define_method(dir, "rename", frt_dir_rename, 2);\
|
238
|
-
rb_define_method(dir, "make_lock", frt_dir_make_lock, 1);
|
239
|
-
|
240
246
|
void
|
241
247
|
Init_dir(void)
|
242
248
|
{
|
249
|
+
id_mkdir_p = rb_intern("mkdir_p");
|
250
|
+
id_is_directory = rb_intern("directory?");
|
251
|
+
|
243
252
|
cLock = rb_define_class_under(mStore, "Lock", rb_cObject);
|
244
253
|
rb_define_method(cLock, "obtain", frt_lock_obtain, -1);
|
245
254
|
rb_define_method(cLock, "while_locked", frt_lock_while_locked, -1);
|
@@ -248,16 +257,22 @@ Init_dir(void)
|
|
248
257
|
|
249
258
|
cDirectory = rb_define_class_under(mStore, "Directory", rb_cObject);
|
250
259
|
rb_define_const(cDirectory, "LOCK_PREFIX", rb_str_new2(LOCK_PREFIX));
|
260
|
+
rb_define_method(cDirectory, "close", frt_dir_close, 0);\
|
261
|
+
rb_define_method(cDirectory, "exists?", frt_dir_exists, 1);\
|
262
|
+
rb_define_method(cDirectory, "touch", frt_dir_touch, 1);\
|
263
|
+
rb_define_method(cDirectory, "delete", frt_dir_delete, 1);\
|
264
|
+
rb_define_method(cDirectory, "file_count", frt_dir_file_count, 0);\
|
265
|
+
rb_define_method(cDirectory, "refresh", frt_dir_refresh, 0);\
|
266
|
+
rb_define_method(cDirectory, "rename", frt_dir_rename, 2);\
|
267
|
+
rb_define_method(cDirectory, "make_lock", frt_dir_make_lock, 1);
|
251
268
|
|
252
269
|
/* RAMDirectory */
|
253
270
|
cRAMDirectory = rb_define_class_under(mStore, "RAMDirectory", cDirectory);
|
254
271
|
rb_define_alloc_func(cRAMDirectory, frt_data_alloc);
|
255
272
|
rb_define_method(cRAMDirectory, "initialize", frt_ramdir_init, -1);
|
256
|
-
DIR_METHODS(cRAMDirectory);
|
257
273
|
|
258
274
|
/* FSDirectory */
|
259
275
|
cFSDirectory = rb_define_class_under(mStore, "FSDirectory", cDirectory);
|
260
276
|
rb_define_alloc_func(cFSDirectory, frt_data_alloc);
|
261
277
|
rb_define_singleton_method(cFSDirectory, "new", frt_fsdir_new, 2);
|
262
|
-
DIR_METHODS(cFSDirectory);
|
263
278
|
}
|
data/ext/ram_store.c
CHANGED
@@ -4,12 +4,14 @@
|
|
4
4
|
static char * const RENAME_ERROR_MSG = "tried to rename a file that doesn't exist";
|
5
5
|
static char * const MISSING_RAMFILE_ERROR_MSG ="Couldn't open the ram file to read";
|
6
6
|
|
7
|
+
extern void store_destroy(Store *store);
|
8
|
+
|
7
9
|
typedef struct RamFile {
|
8
10
|
char *name;
|
9
11
|
uchar **buffers;
|
10
12
|
int bufcnt;
|
11
13
|
int len;
|
12
|
-
int
|
14
|
+
int ref_cnt;
|
13
15
|
bool alive;
|
14
16
|
} RamFile;
|
15
17
|
|
@@ -21,7 +23,7 @@ RamFile *rf_create(const char *name)
|
|
21
23
|
rf->name = estrdup(name);
|
22
24
|
rf->len = 0;
|
23
25
|
rf->bufcnt = 1;
|
24
|
-
rf->
|
26
|
+
rf->ref_cnt = 0;
|
25
27
|
rf->alive = true;
|
26
28
|
return rf;
|
27
29
|
}
|
@@ -38,7 +40,7 @@ void rf_close(void *p)
|
|
38
40
|
{
|
39
41
|
int i;
|
40
42
|
RamFile *rf = (RamFile *)p;
|
41
|
-
if (rf->
|
43
|
+
if (rf->ref_cnt > 0 || rf->alive) return;
|
42
44
|
free(rf->name);
|
43
45
|
for (i = 0; i < rf->bufcnt; i++) {
|
44
46
|
free(rf->buffers[i]);
|
@@ -76,17 +78,21 @@ int ram_remove(Store *store, char *filename)
|
|
76
78
|
int ram_rename(Store *store, char *from, char *to)
|
77
79
|
{
|
78
80
|
RamFile *rf = (RamFile *)h_rem(store->dir.ht, from, false);
|
79
|
-
|
81
|
+
RamFile *tmp;
|
82
|
+
|
83
|
+
if (rf == NULL) {
|
80
84
|
RAISE(IO_ERROR, RENAME_ERROR_MSG);
|
85
|
+
}
|
81
86
|
|
82
87
|
free(rf->name);
|
83
88
|
|
84
89
|
rf->name = estrdup(to);
|
85
90
|
|
86
|
-
|
87
|
-
|
88
|
-
if (tmp != NULL)
|
91
|
+
/* clean up the file we are overwriting */
|
92
|
+
tmp = (RamFile *)h_get(store->dir.ht, to);
|
93
|
+
if (tmp != NULL) {
|
89
94
|
tmp->alive = false;
|
95
|
+
}
|
90
96
|
|
91
97
|
h_set(store->dir.ht, rf->name, rf);
|
92
98
|
return true;
|
@@ -112,7 +118,7 @@ void ram_each(Store *store, void (*func)(char *fname, void *arg), void *arg)
|
|
112
118
|
}
|
113
119
|
}
|
114
120
|
|
115
|
-
void
|
121
|
+
void ram_close_i(Store *store)
|
116
122
|
{
|
117
123
|
HshTable *ht = store->dir.ht;
|
118
124
|
RamFile *rf;
|
@@ -185,6 +191,7 @@ int ramo_length(OutStream *os)
|
|
185
191
|
|
186
192
|
void ramo_flush_internal(OutStream *os, uchar *src, int len)
|
187
193
|
{
|
194
|
+
uchar *buffer;
|
188
195
|
RamFile *rf = (RamFile *)os->file;
|
189
196
|
int buffer_number, buffer_offset, bytes_in_buffer, bytes_to_copy;
|
190
197
|
int src_offset;
|
@@ -197,7 +204,7 @@ void ramo_flush_internal(OutStream *os, uchar *src, int len)
|
|
197
204
|
|
198
205
|
rf_extend_if_necessary(rf, buffer_number);
|
199
206
|
|
200
|
-
|
207
|
+
buffer = rf->buffers[buffer_number];
|
201
208
|
memcpy(buffer + buffer_offset, src, bytes_to_copy);
|
202
209
|
|
203
210
|
if (bytes_to_copy < len) {
|
@@ -230,7 +237,7 @@ void ramo_reset(OutStream *os)
|
|
230
237
|
void ramo_close_internal(OutStream *os)
|
231
238
|
{
|
232
239
|
RamFile *rf = (RamFile *)os->file;
|
233
|
-
rf->
|
240
|
+
rf->ref_cnt--;
|
234
241
|
rf_close(rf);
|
235
242
|
}
|
236
243
|
|
@@ -238,9 +245,12 @@ void ramo_write_to(OutStream *os, OutStream *other_o)
|
|
238
245
|
{
|
239
246
|
int i, len;
|
240
247
|
RamFile *rf = (RamFile *)os->file;
|
248
|
+
int last_buffer_number;
|
249
|
+
int last_buffer_offset;
|
250
|
+
|
241
251
|
os_flush(os);
|
242
|
-
|
243
|
-
|
252
|
+
last_buffer_number = (int)(rf->len / BUFFER_SIZE);
|
253
|
+
last_buffer_offset = rf->len % BUFFER_SIZE;
|
244
254
|
for (i = 0; i <= last_buffer_number; i++) {
|
245
255
|
len = (i == last_buffer_number ? last_buffer_offset : BUFFER_SIZE);
|
246
256
|
os_write_bytes(other_o, rf->buffers[i], len);
|
@@ -250,8 +260,9 @@ void ramo_write_to(OutStream *os, OutStream *other_o)
|
|
250
260
|
OutStream *ram_create_buffer()
|
251
261
|
{
|
252
262
|
RamFile *rf = rf_create("");
|
253
|
-
rf->alive = false;
|
254
263
|
OutStream *os = os_create();
|
264
|
+
|
265
|
+
rf->alive = false;
|
255
266
|
os->file = rf;
|
256
267
|
os->pointer = 0;
|
257
268
|
os->flush_internal = &ramo_flush_internal;
|
@@ -269,12 +280,13 @@ void ram_destroy_buffer(OutStream *os)
|
|
269
280
|
OutStream *ram_create_output(Store *store, const char *filename)
|
270
281
|
{
|
271
282
|
RamFile *rf = (RamFile *)h_get(store->dir.ht, filename);
|
283
|
+
OutStream *os = os_create();
|
284
|
+
|
272
285
|
if (rf == NULL) {
|
273
286
|
rf = rf_create(filename);
|
274
287
|
h_set(store->dir.ht, rf->name, rf);
|
275
288
|
}
|
276
|
-
rf->
|
277
|
-
OutStream *os = os_create();
|
289
|
+
rf->ref_cnt++;
|
278
290
|
os->pointer = 0;
|
279
291
|
os->file = rf;
|
280
292
|
os->flush_internal = &ramo_flush_internal;
|
@@ -325,23 +337,24 @@ void rami_seek_internal(InStream *is, int pos)
|
|
325
337
|
void rami_close_internal(InStream *is)
|
326
338
|
{
|
327
339
|
RamFile *rf = (RamFile *)is->file;
|
328
|
-
rf->
|
340
|
+
rf->ref_cnt--;
|
329
341
|
rf_close(rf);
|
330
342
|
}
|
331
343
|
|
332
344
|
void rami_clone_internal(InStream *is, InStream *new_index_i)
|
333
345
|
{
|
334
|
-
((RamFile *)is->file)->
|
346
|
+
((RamFile *)is->file)->ref_cnt++;
|
335
347
|
}
|
336
348
|
|
337
349
|
InStream *ram_open_input(Store *store, const char *filename)
|
338
350
|
{
|
339
351
|
RamFile *rf = (RamFile *)h_get(store->dir.ht, filename);
|
352
|
+
InStream *is = is_create();
|
353
|
+
|
340
354
|
if (rf == NULL) {
|
341
355
|
RAISE(IO_ERROR, MISSING_RAMFILE_ERROR_MSG);
|
342
356
|
}
|
343
|
-
rf->
|
344
|
-
InStream *is = is_create();
|
357
|
+
rf->ref_cnt++;
|
345
358
|
is->file = rf;
|
346
359
|
is->d.pointer = 0;
|
347
360
|
is->is_clone = false;
|
@@ -404,7 +417,6 @@ Store *open_ram_store()
|
|
404
417
|
new_store->remove = &ram_remove;
|
405
418
|
new_store->rename = &ram_rename;
|
406
419
|
new_store->count = &ram_count;
|
407
|
-
new_store->close = &ram_close;
|
408
420
|
new_store->clear = &ram_clear;
|
409
421
|
new_store->clear_all = &ram_clear_all;
|
410
422
|
new_store->clear_locks = &ram_clear_locks;
|
@@ -414,6 +426,7 @@ Store *open_ram_store()
|
|
414
426
|
new_store->open_input = &ram_open_input;
|
415
427
|
new_store->open_lock = &ram_open_lock;
|
416
428
|
new_store->close_lock = &ram_close_lock;
|
429
|
+
new_store->close_i = &ram_close_i;
|
417
430
|
return new_store;
|
418
431
|
}
|
419
432
|
|
@@ -427,11 +440,14 @@ static void copy_files(char *fname, void *arg)
|
|
427
440
|
OutStream *os = cfa->to_store->create_output(cfa->to_store, fname);
|
428
441
|
InStream *is = cfa->from_store->open_input(cfa->from_store, fname);
|
429
442
|
int len = is_length(is);
|
430
|
-
uchar buffer
|
443
|
+
uchar *buffer = ALLOC_N(uchar, len+1);
|
444
|
+
|
431
445
|
is_read_bytes(is, buffer, 0, len);
|
432
446
|
os_write_bytes(os, buffer, len);
|
447
|
+
|
433
448
|
is_close(is);
|
434
449
|
os_close(os);
|
450
|
+
free(buffer);
|
435
451
|
}
|
436
452
|
|
437
453
|
Store *open_ram_store_and_copy(Store *from_store, bool close_dir)
|
@@ -444,7 +460,7 @@ Store *open_ram_store_and_copy(Store *from_store, bool close_dir)
|
|
444
460
|
from_store->each(from_store, &copy_files, &cfa);
|
445
461
|
|
446
462
|
if (close_dir)
|
447
|
-
|
463
|
+
store_deref(from_store);
|
448
464
|
|
449
465
|
return store;
|
450
466
|
}
|
data/ext/search.c
CHANGED
@@ -135,30 +135,34 @@ Hit *hit_pq_pop(PriorityQueue *pq)
|
|
135
135
|
|
136
136
|
inline void hit_pq_up(PriorityQueue *pq)
|
137
137
|
{
|
138
|
-
int i,j;
|
139
|
-
i = pq->count;
|
140
|
-
j = i >> 1;
|
141
138
|
Hit **heap = (Hit **)pq->heap;
|
142
|
-
Hit *node
|
139
|
+
Hit *node;
|
140
|
+
int i = pq->count;
|
141
|
+
int j = i >> 1;
|
142
|
+
node = heap[i];
|
143
143
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
144
|
+
while ((j > 0) && hit_lt(node, heap[j])) {
|
145
|
+
heap[i] = heap[j];
|
146
|
+
i = j;
|
147
|
+
j = j >> 1;
|
148
|
+
}
|
149
149
|
heap[i] = node;
|
150
150
|
}
|
151
151
|
|
152
|
-
|
153
|
-
void hit_pq_push(PriorityQueue *pq, void *elem)
|
152
|
+
void hit_pq_insert(PriorityQueue *pq, Hit *hit)
|
154
153
|
{
|
155
|
-
pq->count
|
156
|
-
|
157
|
-
|
154
|
+
if (pq->count < pq->size) {
|
155
|
+
Hit *new_hit = ALLOC(Hit);
|
156
|
+
memcpy(new_hit, hit, sizeof(Hit));
|
157
|
+
pq->count++;
|
158
|
+
pq->heap[pq->count] = new_hit;
|
159
|
+
hit_pq_up(pq);
|
160
|
+
} else if (pq->count > 0 && hit_lt((Hit *)pq->heap[1], hit)) {
|
161
|
+
memcpy(pq->heap[1], hit, sizeof(Hit));
|
162
|
+
hit_pq_down(pq);
|
163
|
+
}
|
158
164
|
}
|
159
165
|
|
160
|
-
|
161
|
-
|
162
166
|
/***************************************************************************
|
163
167
|
*
|
164
168
|
* TopDocs
|
@@ -174,9 +178,8 @@ TopDocs *td_create(int total_hits, int size, Hit **hits)
|
|
174
178
|
return td;
|
175
179
|
}
|
176
180
|
|
177
|
-
void td_destroy(
|
181
|
+
void td_destroy(TopDocs *td)
|
178
182
|
{
|
179
|
-
TopDocs *td = (TopDocs *)p;
|
180
183
|
int i;
|
181
184
|
for (i = 0; i < td->size; i++) {
|
182
185
|
free(td->hits[i]);
|
@@ -226,54 +229,78 @@ void w_normalize(Weight *self, float normalization_factor)
|
|
226
229
|
self->value = self->qweight * self->idf; // idf for document
|
227
230
|
}
|
228
231
|
|
232
|
+
void w_destroy(Weight *self)
|
233
|
+
{
|
234
|
+
q_deref(self->query);
|
235
|
+
free(self);
|
236
|
+
}
|
237
|
+
|
238
|
+
Weight *w_create(Query *query)
|
239
|
+
{
|
240
|
+
Weight *self = ALLOC_AND_ZERO_N(Weight, 1);
|
241
|
+
ref(query);
|
242
|
+
self->query = query;
|
243
|
+
|
244
|
+
self->get_query = &w_get_query;
|
245
|
+
self->get_value = &w_get_value;
|
246
|
+
self->normalize = &w_normalize;
|
247
|
+
self->destroy = &w_destroy;
|
248
|
+
return self;
|
249
|
+
}
|
250
|
+
|
229
251
|
/***************************************************************************
|
230
252
|
*
|
231
253
|
* Query
|
232
254
|
*
|
233
255
|
***************************************************************************/
|
234
256
|
|
235
|
-
Similarity *
|
257
|
+
Similarity *q_get_similarity_i(Query *self, Searcher *searcher)
|
236
258
|
{
|
237
259
|
return searcher->get_similarity(searcher);
|
238
260
|
}
|
239
261
|
|
240
262
|
Query *q_rewrite(Query *self, IndexReader *ir)
|
241
263
|
{
|
264
|
+
self->ref_cnt++;
|
242
265
|
return self;
|
243
266
|
}
|
244
267
|
|
245
268
|
Weight *q_weight(Query *self, Searcher *searcher)
|
246
269
|
{
|
247
|
-
if (self->weight) {
|
248
|
-
self->weight->destroy(self->weight);
|
249
|
-
}
|
250
270
|
Query *query = searcher->rewrite(searcher, self);
|
251
|
-
Weight *weight = query->
|
271
|
+
Weight *weight = query->create_weight_i(query, searcher);
|
252
272
|
float sum = weight->sum_of_squared_weights(weight);
|
253
273
|
Similarity *sim = query->get_similarity(query, searcher);
|
254
274
|
float norm = sim_query_norm(sim, sum);
|
275
|
+
q_deref(query);
|
255
276
|
|
256
277
|
weight->normalize(weight, norm);
|
257
278
|
return self->weight = weight;
|
258
279
|
}
|
259
280
|
|
260
|
-
|
281
|
+
Weight *q_create_weight_unsup(Query *self, Searcher *searcher)
|
282
|
+
{
|
283
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
284
|
+
return NULL;
|
285
|
+
}
|
286
|
+
|
287
|
+
void q_destroy_i(Query *self)
|
261
288
|
{
|
262
|
-
if (self->rewritten) {
|
263
|
-
self->rewritten->destroy(self->rewritten);
|
264
|
-
self->rewritten = NULL;
|
265
|
-
}
|
266
|
-
if (self->weight) {
|
267
|
-
self->weight->destroy(self->weight);
|
268
|
-
}
|
269
289
|
free(self);
|
270
290
|
}
|
271
291
|
|
272
|
-
void q_extract_terms(Query *self,
|
292
|
+
void q_extract_terms(Query *self, HashSet *terms)
|
273
293
|
{
|
274
294
|
/* do nothing by default */
|
275
295
|
}
|
276
296
|
|
297
|
+
void q_deref(Query *self)
|
298
|
+
{
|
299
|
+
if (--self->ref_cnt == 0) {
|
300
|
+
self->destroy_i(self);
|
301
|
+
}
|
302
|
+
}
|
303
|
+
|
277
304
|
Query *q_create()
|
278
305
|
{
|
279
306
|
Query *self = ALLOC(Query);
|
@@ -281,30 +308,92 @@ Query *q_create()
|
|
281
308
|
self->destroy_all = true;
|
282
309
|
self->boost = 1.0;
|
283
310
|
self->rewrite = &q_rewrite;
|
284
|
-
self->get_similarity = &
|
311
|
+
self->get_similarity = &q_get_similarity_i;
|
285
312
|
self->extract_terms = &q_extract_terms;
|
286
313
|
self->weight = NULL;
|
287
|
-
self->
|
314
|
+
self->ref_cnt = 1;
|
288
315
|
return self;
|
289
316
|
}
|
290
317
|
|
318
|
+
uint q_hash(Query *self)
|
319
|
+
{
|
320
|
+
return (self->hash(self) << 4) | self->type;
|
321
|
+
}
|
322
|
+
|
323
|
+
int q_eq(Query *self, Query *o)
|
324
|
+
{
|
325
|
+
return (self == o) || ((self->type == o->type) &&
|
326
|
+
(self->boost == o->boost) &&
|
327
|
+
self->eq(self, o));
|
328
|
+
}
|
329
|
+
|
330
|
+
Query *q_combine(Query **queries, int q_cnt)
|
331
|
+
{
|
332
|
+
int i;
|
333
|
+
Query *q, *ret_q;
|
334
|
+
HashSet *uniques =
|
335
|
+
hs_create((hash_ft)&q_hash, (eq_ft)&q_eq, NULL);
|
336
|
+
|
337
|
+
for (i = 0; i < q_cnt; i++) {
|
338
|
+
q = queries[i];
|
339
|
+
if (q->type == BOOLEAN_QUERY) {
|
340
|
+
int j;
|
341
|
+
bool splittable = true;
|
342
|
+
BooleanQuery *bq = (BooleanQuery *)q->data;
|
343
|
+
if (bq->coord_disabled == false) {
|
344
|
+
splittable = false;
|
345
|
+
} else {
|
346
|
+
for (j = 0; j < bq->clause_cnt; j++) {
|
347
|
+
if (bq->clauses[j]->occur != BC_SHOULD) {
|
348
|
+
splittable = false;
|
349
|
+
break;
|
350
|
+
}
|
351
|
+
}
|
352
|
+
}
|
353
|
+
if (splittable) {
|
354
|
+
for (j = 0; j < bq->clause_cnt; j++) {
|
355
|
+
q = bq->clauses[j]->query;
|
356
|
+
hs_add(uniques, q);
|
357
|
+
}
|
358
|
+
} else {
|
359
|
+
hs_add(uniques, q);
|
360
|
+
}
|
361
|
+
} else {
|
362
|
+
hs_add(uniques, q);
|
363
|
+
}
|
364
|
+
}
|
365
|
+
if (uniques->size == 1) {
|
366
|
+
ret_q = (Query *)uniques->elems[0];
|
367
|
+
ref(ret_q);
|
368
|
+
} else {
|
369
|
+
ret_q = bq_create(true);
|
370
|
+
for (i = 0; i < uniques->size; i++) {
|
371
|
+
q = (Query *)uniques->elems[i];
|
372
|
+
ref(q);
|
373
|
+
bq_add_query(ret_q, q, BC_SHOULD);
|
374
|
+
}
|
375
|
+
}
|
376
|
+
hs_destroy(uniques);
|
377
|
+
|
378
|
+
return ret_q;
|
379
|
+
}
|
380
|
+
|
291
381
|
/***************************************************************************
|
292
382
|
*
|
293
383
|
* Scorer
|
294
384
|
*
|
295
385
|
***************************************************************************/
|
296
386
|
|
297
|
-
void
|
387
|
+
void scorer_destroy_i(Scorer *self)
|
298
388
|
{
|
299
|
-
|
300
|
-
free(
|
301
|
-
free(scorer);
|
389
|
+
free(self->data);
|
390
|
+
free(self);
|
302
391
|
}
|
303
392
|
|
304
393
|
Scorer *scorer_create(Similarity *similarity)
|
305
394
|
{
|
306
395
|
Scorer *self = ALLOC(Scorer);
|
307
|
-
self->destroy = &
|
396
|
+
self->destroy = &scorer_destroy_i;
|
308
397
|
self->data = NULL;
|
309
398
|
self->similarity = similarity;
|
310
399
|
return self;
|
@@ -326,43 +415,58 @@ int scorer_doc_cmp(const void *p1, const void *p2)
|
|
326
415
|
{
|
327
416
|
return (*(Scorer **)p1)->doc - (*(Scorer **)p2)->doc;
|
328
417
|
}
|
418
|
+
|
329
419
|
/***************************************************************************
|
330
420
|
*
|
331
421
|
* Searcher
|
332
422
|
*
|
333
423
|
***************************************************************************/
|
334
424
|
|
335
|
-
int
|
425
|
+
static int s_doc_freq(Searcher *self, Term *term)
|
336
426
|
{
|
337
427
|
return self->ir->doc_freq(self->ir, term);
|
338
428
|
}
|
339
429
|
|
340
|
-
int *
|
430
|
+
static int *s_doc_freqs(Searcher *self, Term **terms, int tcnt)
|
341
431
|
{
|
342
|
-
int *freqs = ALLOC_N(int, tcnt);
|
343
432
|
int i;
|
433
|
+
int *freqs = ALLOC_N(int, tcnt);
|
434
|
+
|
344
435
|
for (i = 0; i < tcnt; i++) {
|
345
436
|
freqs[i] = self->ir->doc_freq(self->ir, terms[i]);
|
346
437
|
}
|
347
438
|
return freqs;
|
348
439
|
}
|
349
440
|
|
350
|
-
|
441
|
+
static int *ss_doc_freqs(Searcher *self, Term **terms, int tcnt)
|
442
|
+
{
|
443
|
+
int i;
|
444
|
+
int *freqs = ALLOC_N(int, tcnt);
|
445
|
+
|
446
|
+
for (i = 0; i < tcnt; i++) {
|
447
|
+
freqs[i] = self->doc_freq(self, terms[i]);
|
448
|
+
}
|
449
|
+
|
450
|
+
return freqs;
|
451
|
+
}
|
452
|
+
|
453
|
+
|
454
|
+
static Document *s_get_doc(Searcher *self, int doc_num)
|
351
455
|
{
|
352
456
|
return self->ir->get_doc(self->ir, doc_num);
|
353
457
|
}
|
354
458
|
|
355
|
-
int
|
459
|
+
static int s_max_doc(Searcher *self)
|
356
460
|
{
|
357
461
|
return self->ir->max_doc(self->ir);
|
358
462
|
}
|
359
463
|
|
360
|
-
Weight *
|
464
|
+
static Weight *s_create_weight(Searcher *self, Query *query)
|
361
465
|
{
|
362
466
|
return q_weight(query, self);
|
363
467
|
}
|
364
468
|
|
365
|
-
TopDocs *
|
469
|
+
static TopDocs *s_search(Searcher *self, Query *query, int first_doc,
|
366
470
|
int num_docs, Filter *filter, Sort *sort)
|
367
471
|
{
|
368
472
|
int max_size = first_doc + num_docs;
|
@@ -370,14 +474,13 @@ TopDocs *sea_search(Searcher *self, Query *query, int first_doc,
|
|
370
474
|
Weight *weight;
|
371
475
|
Scorer *scorer;
|
372
476
|
Hit **score_docs = NULL;
|
373
|
-
Hit
|
477
|
+
Hit hit;
|
374
478
|
int total_hits = 0;
|
375
|
-
float
|
479
|
+
float score;
|
376
480
|
BitVector *bits = (filter ? filter->get_bv(filter, self->ir) : NULL);
|
377
481
|
Hit *(*hq_pop)(PriorityQueue *pq);
|
378
|
-
void (*
|
379
|
-
void (*
|
380
|
-
void (*hq_destroy)(void *p);
|
482
|
+
void (*hq_insert)(PriorityQueue *pq, Hit *hit);
|
483
|
+
void (*hq_destroy)(PriorityQueue *self);
|
381
484
|
PriorityQueue *hq;
|
382
485
|
|
383
486
|
|
@@ -391,20 +494,19 @@ TopDocs *sea_search(Searcher *self, Query *query, int first_doc,
|
|
391
494
|
scorer = weight->scorer(weight, self->ir);
|
392
495
|
if (!scorer) {
|
393
496
|
if (bits) bv_destroy(bits);
|
497
|
+
weight->destroy(weight);
|
394
498
|
return td_create(0, 0, NULL);
|
395
499
|
}
|
396
500
|
|
397
501
|
if (sort) {
|
398
502
|
hq = fshq_pq_create(max_size, sort, self->ir);
|
399
503
|
hq_pop = &fshq_pq_pop;
|
400
|
-
|
401
|
-
hq_push = &fshq_pq_push;
|
504
|
+
hq_insert = &fshq_pq_insert;
|
402
505
|
hq_destroy = &fshq_pq_destroy;
|
403
506
|
} else {
|
404
507
|
hq = pq_create(max_size, &hit_less_than);
|
405
508
|
hq_pop = &hit_pq_pop;
|
406
|
-
|
407
|
-
hq_push = &hit_pq_push;
|
509
|
+
hq_insert = &hit_pq_insert;
|
408
510
|
hq_destroy = &pq_destroy;
|
409
511
|
}
|
410
512
|
|
@@ -412,19 +514,11 @@ TopDocs *sea_search(Searcher *self, Query *query, int first_doc,
|
|
412
514
|
if (bits && !bv_get(bits, scorer->doc)) continue;
|
413
515
|
total_hits++;
|
414
516
|
score = scorer->score(scorer);
|
415
|
-
|
416
|
-
|
417
|
-
hit->doc = scorer->doc; hit->score = score;
|
418
|
-
hq_push(hq, hit);
|
419
|
-
min_score = ((Hit *)pq_top(hq))->score; // maintain min_score
|
420
|
-
} else if (score > min_score) {
|
421
|
-
hit = pq_top(hq);
|
422
|
-
hit->doc = scorer->doc; hit->score = score;
|
423
|
-
hq_down(hq);
|
424
|
-
min_score = ((Hit *)pq_top(hq))->score; // maintain min_score
|
425
|
-
}
|
517
|
+
hit.doc = scorer->doc; hit.score = score;
|
518
|
+
hq_insert(hq, &hit);
|
426
519
|
}
|
427
520
|
scorer->destroy(scorer);
|
521
|
+
weight->destroy(weight);
|
428
522
|
|
429
523
|
if (hq->count > first_doc) {
|
430
524
|
if ((hq->count - first_doc) < num_docs) {
|
@@ -446,14 +540,12 @@ TopDocs *sea_search(Searcher *self, Query *query, int first_doc,
|
|
446
540
|
return td_create(total_hits, num_docs, score_docs);
|
447
541
|
}
|
448
542
|
|
449
|
-
void
|
450
|
-
void (*fn)(Searcher
|
543
|
+
static void s_search_each_w(Searcher *self, Weight *weight, Filter *filter,
|
544
|
+
void (*fn)(Searcher *, int, float, void *), void *arg)
|
451
545
|
{
|
452
|
-
Weight *weight;
|
453
546
|
Scorer *scorer;
|
454
547
|
BitVector *bits = (filter ? filter->get_bv(filter, self->ir) : NULL);
|
455
548
|
|
456
|
-
weight = q_weight(query, self);
|
457
549
|
scorer = weight->scorer(weight, self->ir);
|
458
550
|
if (!scorer) {
|
459
551
|
if (bits) bv_destroy(bits);
|
@@ -462,34 +554,53 @@ void sea_search_each(Searcher *self, Query *query, Filter *filter,
|
|
462
554
|
|
463
555
|
while (scorer->next(scorer)) {
|
464
556
|
if (bits && !bv_get(bits, scorer->doc)) continue;
|
465
|
-
fn(self, scorer->doc, arg);
|
557
|
+
fn(self, scorer->doc, scorer->score(scorer), arg);
|
466
558
|
}
|
467
559
|
scorer->destroy(scorer);
|
468
560
|
}
|
469
561
|
|
470
|
-
|
562
|
+
static void s_search_each(Searcher *self, Query *query, Filter *filter,
|
563
|
+
void (*fn)(Searcher *, int, float, void *), void *arg)
|
471
564
|
{
|
565
|
+
Weight *weight;
|
566
|
+
weight = q_weight(query, self);
|
567
|
+
s_search_each_w(self, weight, filter, fn, arg);
|
568
|
+
weight->destroy(weight);
|
569
|
+
}
|
570
|
+
|
571
|
+
static Query *s_rewrite(Searcher *self, Query *original)
|
572
|
+
{
|
573
|
+
int q_is_destroyed = false;
|
472
574
|
Query *query = original;
|
473
575
|
Query *rewritten_query = query->rewrite(query, self->ir);
|
474
|
-
while (query != rewritten_query) {
|
576
|
+
while (q_is_destroyed || (query != rewritten_query)) {
|
475
577
|
query = rewritten_query;
|
476
578
|
rewritten_query = query->rewrite(query, self->ir);
|
579
|
+
q_is_destroyed = (query->ref_cnt <= 1);
|
580
|
+
q_deref(query); /* destroy intermediate queries */
|
477
581
|
}
|
478
582
|
return query;
|
479
583
|
}
|
480
584
|
|
481
|
-
Explanation *
|
585
|
+
static Explanation *s_explain(Searcher *self, Query *query, int doc_num)
|
482
586
|
{
|
483
587
|
Weight *weight = q_weight(query, self);
|
484
|
-
|
588
|
+
Explanation *e = weight->explain(weight, self->ir, doc_num);
|
589
|
+
weight->destroy(weight);
|
590
|
+
return e;
|
591
|
+
}
|
592
|
+
|
593
|
+
static Explanation *s_explain_w(Searcher *self, Weight *w, int doc_num)
|
594
|
+
{
|
595
|
+
return w->explain(w, self->ir, doc_num);
|
485
596
|
}
|
486
597
|
|
487
|
-
Similarity *
|
598
|
+
static Similarity *s_get_similarity(Searcher *self)
|
488
599
|
{
|
489
600
|
return self->similarity;
|
490
601
|
}
|
491
602
|
|
492
|
-
void
|
603
|
+
static void s_close(Searcher *self)
|
493
604
|
{
|
494
605
|
if (self->ir && self->close_ir)
|
495
606
|
ir_close(self->ir);
|
@@ -502,17 +613,436 @@ Searcher *sea_create(IndexReader *ir)
|
|
502
613
|
self->ir = ir;
|
503
614
|
self->close_ir = true;
|
504
615
|
self->similarity = sim_create_default();
|
505
|
-
self->doc_freq = &
|
506
|
-
self->doc_freqs = &
|
507
|
-
self->get_doc = &
|
508
|
-
self->max_doc = &
|
509
|
-
self->create_weight = &
|
510
|
-
self->search = &
|
511
|
-
self->
|
512
|
-
self->
|
513
|
-
self->
|
514
|
-
self->
|
616
|
+
self->doc_freq = &s_doc_freq;
|
617
|
+
self->doc_freqs = &s_doc_freqs;
|
618
|
+
self->get_doc = &s_get_doc;
|
619
|
+
self->max_doc = &s_max_doc;
|
620
|
+
self->create_weight = &s_create_weight;
|
621
|
+
self->search = &s_search;
|
622
|
+
self->search_each = &s_search_each;
|
623
|
+
self->search_each_w = &s_search_each_w;
|
624
|
+
self->rewrite = &s_rewrite;
|
625
|
+
self->explain = &s_explain;
|
626
|
+
self->explain_w = &s_explain_w;
|
627
|
+
self->get_similarity = &s_get_similarity;
|
628
|
+
self->close = &s_close;
|
629
|
+
return self;
|
630
|
+
}
|
631
|
+
|
632
|
+
/***************************************************************************
|
633
|
+
*
|
634
|
+
* CachedDFSearcher
|
635
|
+
*
|
636
|
+
***************************************************************************/
|
637
|
+
|
638
|
+
typedef struct CachedDFSearcher {
|
639
|
+
HshTable *df_map;
|
640
|
+
int max_doc;
|
641
|
+
} CachedDFSearcher;
|
642
|
+
|
643
|
+
static int cdfsea_doc_freq(Searcher *self, Term *term)
|
644
|
+
{
|
645
|
+
CachedDFSearcher *cdfsea = (CachedDFSearcher *)self->data;
|
646
|
+
return (int)h_get(cdfsea->df_map, term);
|
647
|
+
}
|
648
|
+
|
649
|
+
static Document *cdfsea_get_doc(Searcher *self, int doc_num)
|
650
|
+
{
|
651
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
652
|
+
return NULL;
|
653
|
+
}
|
654
|
+
|
655
|
+
static int cdfsea_max_doc(Searcher *self)
|
656
|
+
{
|
657
|
+
return ((CachedDFSearcher *)self->data)->max_doc;
|
658
|
+
}
|
659
|
+
|
660
|
+
static Weight *cdfsea_create_weight(Searcher *self, Query *query)
|
661
|
+
{
|
662
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
663
|
+
return NULL;
|
664
|
+
}
|
665
|
+
|
666
|
+
static TopDocs *cdfsea_search(Searcher *self, Query *query, int first_doc,
|
667
|
+
int num_docs, Filter *filter, Sort *sort)
|
668
|
+
{
|
669
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
670
|
+
return NULL;
|
671
|
+
}
|
672
|
+
|
673
|
+
static void cdfsea_search_each(Searcher *self, Query *query, Filter *filter,
|
674
|
+
void (*fn)(Searcher *, int, float, void *), void *arg)
|
675
|
+
{
|
676
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
677
|
+
}
|
678
|
+
|
679
|
+
static void cdfsea_search_each_w(Searcher *self, Weight *w, Filter *filter,
|
680
|
+
void (*fn)(Searcher *, int, float, void *), void *arg)
|
681
|
+
{
|
682
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
683
|
+
}
|
684
|
+
|
685
|
+
static Query *cdfsea_rewrite(Searcher *self, Query *original)
|
686
|
+
{
|
687
|
+
original->ref_cnt++;
|
688
|
+
return original;
|
689
|
+
}
|
690
|
+
|
691
|
+
static Explanation *cdfsea_explain(Searcher *self, Query *query, int doc_num)
|
692
|
+
{
|
693
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
694
|
+
return NULL;
|
695
|
+
}
|
696
|
+
|
697
|
+
static Explanation *cdfsea_explain_w(Searcher *self, Weight *w, int doc_num)
|
698
|
+
{
|
699
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
700
|
+
return NULL;
|
701
|
+
}
|
702
|
+
|
703
|
+
static Similarity *cdfsea_get_similarity(Searcher *self)
|
704
|
+
{
|
705
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
706
|
+
return NULL;
|
707
|
+
}
|
708
|
+
|
709
|
+
static void cdfsea_close(Searcher *self)
|
710
|
+
{
|
711
|
+
CachedDFSearcher *cdfsea = (CachedDFSearcher *)self->data;
|
712
|
+
h_destroy(cdfsea->df_map);
|
713
|
+
free(cdfsea);
|
714
|
+
free(self);
|
715
|
+
}
|
716
|
+
|
717
|
+
Searcher *cdfsea_create(HshTable *df_map, int max_doc)
|
718
|
+
{
|
719
|
+
Searcher *self = ALLOC(Searcher);
|
720
|
+
|
721
|
+
CachedDFSearcher *cdfsea = ALLOC(CachedDFSearcher);
|
722
|
+
|
723
|
+
cdfsea->df_map = df_map;
|
724
|
+
cdfsea->max_doc = max_doc;
|
725
|
+
self->data = cdfsea;
|
726
|
+
|
727
|
+
self->doc_freq = &cdfsea_doc_freq;
|
728
|
+
self->doc_freqs = &ss_doc_freqs;
|
729
|
+
self->get_doc = &cdfsea_get_doc;
|
730
|
+
self->max_doc = &cdfsea_max_doc;
|
731
|
+
self->create_weight = &cdfsea_create_weight;
|
732
|
+
self->search = &cdfsea_search;
|
733
|
+
self->search_each = &cdfsea_search_each;
|
734
|
+
self->search_each_w = &cdfsea_search_each_w;
|
735
|
+
self->rewrite = &cdfsea_rewrite;
|
736
|
+
self->explain = &cdfsea_explain;
|
737
|
+
self->explain_w = &cdfsea_explain_w;
|
738
|
+
self->get_similarity = &cdfsea_get_similarity;
|
739
|
+
self->close = &cdfsea_close;
|
515
740
|
return self;
|
516
741
|
}
|
517
742
|
|
743
|
+
/***************************************************************************
|
744
|
+
*
|
745
|
+
* MultiSearcher
|
746
|
+
*
|
747
|
+
***************************************************************************/
|
748
|
+
|
749
|
+
static inline int msea_get_searcher_index(Searcher *self, int n)
|
750
|
+
{
|
751
|
+
MultiSearcher *msea = (MultiSearcher *)self->data;
|
752
|
+
int lo = 0; /* search starts array */
|
753
|
+
int hi = msea->s_cnt - 1; /* for 1st element < n, return its index */
|
754
|
+
int mid, mid_val;
|
755
|
+
|
756
|
+
while (hi >= lo) {
|
757
|
+
mid = (lo + hi) >> 1;
|
758
|
+
mid_val = msea->starts[mid];
|
759
|
+
if (n < mid_val) {
|
760
|
+
hi = mid - 1;
|
761
|
+
} else if (n > mid_val) {
|
762
|
+
lo = mid + 1;
|
763
|
+
} else { /* found a match */
|
764
|
+
while (((mid+1) < msea->s_cnt) && (msea->starts[mid+1] == mid_val)) {
|
765
|
+
mid++; /* scan to last match */
|
766
|
+
}
|
767
|
+
return mid;
|
768
|
+
}
|
769
|
+
}
|
770
|
+
return hi;
|
771
|
+
}
|
772
|
+
|
773
|
+
static int msea_doc_freq(Searcher *self, Term *term)
|
774
|
+
{
|
775
|
+
int i;
|
776
|
+
int doc_freq = 0;
|
777
|
+
Searcher *s;
|
778
|
+
MultiSearcher *msea = (MultiSearcher *)self->data;
|
779
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
780
|
+
s = msea->searchers[i];
|
781
|
+
doc_freq += s->doc_freq(s, term);
|
782
|
+
}
|
783
|
+
|
784
|
+
return doc_freq;
|
785
|
+
}
|
786
|
+
|
787
|
+
static Document *msea_get_doc(Searcher *self, int doc_num)
|
788
|
+
{
|
789
|
+
MultiSearcher *msea = (MultiSearcher *)self->data;
|
790
|
+
int i = msea_get_searcher_index(self, doc_num);
|
791
|
+
Searcher *s = msea->searchers[i];
|
792
|
+
return s->get_doc(s, doc_num - msea->starts[i]);
|
793
|
+
}
|
794
|
+
|
795
|
+
static int msea_max_doc(Searcher *self)
|
796
|
+
{
|
797
|
+
return ((MultiSearcher *)self->data)->max_doc;
|
798
|
+
}
|
799
|
+
|
800
|
+
static Weight *msea_create_weight(Searcher *self, Query *query)
|
801
|
+
{
|
802
|
+
int i, *dfs;
|
803
|
+
Searcher *cdfsea;
|
804
|
+
Weight *w;
|
805
|
+
HshTable *df_map = h_new((hash_ft)&term_hash, (eq_ft)&term_eq,
|
806
|
+
(free_ft)NULL, (free_ft)NULL);
|
807
|
+
Query *rq = self->rewrite(self, query);
|
808
|
+
HashSet *terms = term_set_create();
|
809
|
+
rq->extract_terms(rq, terms);
|
810
|
+
dfs = self->doc_freqs(self, (Term **)terms->elems, terms->size);
|
811
|
+
|
812
|
+
for (i = 0; i < terms->size; i++) {
|
813
|
+
h_set(df_map, terms->elems[i], (void *)dfs[i]);
|
814
|
+
}
|
815
|
+
/* don't destroy the individual terms, only the HashSet */
|
816
|
+
hs_destroy(terms);
|
817
|
+
free(dfs);
|
818
|
+
|
819
|
+
cdfsea = cdfsea_create(df_map, ((MultiSearcher *)self->data)->max_doc);
|
820
|
+
|
821
|
+
w = q_weight(rq, cdfsea);
|
822
|
+
q_deref(rq);
|
823
|
+
cdfsea->close(cdfsea);
|
824
|
+
|
825
|
+
return w;
|
826
|
+
}
|
827
|
+
|
828
|
+
struct MultiSearchEachArg {
|
829
|
+
int start;
|
830
|
+
void *arg;
|
831
|
+
void (*fn)(Searcher *, int, float, void *);
|
832
|
+
};
|
833
|
+
|
834
|
+
void msea_search_each_i(Searcher *self, int doc_num, float score, void *arg)
|
835
|
+
{
|
836
|
+
struct MultiSearchEachArg *mse_arg = (struct MultiSearchEachArg *)arg;
|
837
|
+
|
838
|
+
mse_arg->fn(self, doc_num + mse_arg->start, score, mse_arg->arg);
|
839
|
+
}
|
840
|
+
|
841
|
+
static void msea_search_each_w(Searcher *self, Weight *w, Filter *filter,
|
842
|
+
void (*fn)(Searcher *, int, float, void *), void *arg)
|
843
|
+
{
|
844
|
+
int i;
|
845
|
+
struct MultiSearchEachArg mse_arg;
|
846
|
+
MultiSearcher *msea = (MultiSearcher *)self->data;
|
847
|
+
Searcher *s;
|
848
|
+
|
849
|
+
mse_arg.fn = fn;
|
850
|
+
mse_arg.arg = arg;
|
851
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
852
|
+
s = msea->searchers[i];
|
853
|
+
mse_arg.start = msea->starts[i];
|
854
|
+
s->search_each_w(s, w, filter, &msea_search_each_i, &mse_arg);
|
855
|
+
}
|
856
|
+
}
|
518
857
|
|
858
|
+
static void msea_search_each(Searcher *self, Query *query, Filter *filter,
|
859
|
+
void (*fn)(Searcher *, int, float, void *), void *arg)
|
860
|
+
{
|
861
|
+
Weight *w = q_weight(query, self);
|
862
|
+
msea_search_each_w(self, w, filter, fn, arg);
|
863
|
+
w->destroy(w);
|
864
|
+
}
|
865
|
+
|
866
|
+
struct MultiSearchArg {
|
867
|
+
int total_hits, max_size;
|
868
|
+
float min_score;
|
869
|
+
PriorityQueue *hq;
|
870
|
+
void (*hq_insert)(PriorityQueue *pq, Hit *hit);
|
871
|
+
};
|
872
|
+
|
873
|
+
void msea_search_i(Searcher *self, int doc_num, float score, void *arg)
|
874
|
+
{
|
875
|
+
struct MultiSearchArg *ms_arg = (struct MultiSearchArg *)arg;
|
876
|
+
Hit hit;
|
877
|
+
|
878
|
+
ms_arg->total_hits++;
|
879
|
+
hit.doc = doc_num;
|
880
|
+
hit.score = score;
|
881
|
+
ms_arg->hq_insert(ms_arg->hq, &hit);
|
882
|
+
}
|
883
|
+
|
884
|
+
static TopDocs *msea_search(Searcher *self, Query *query, int first_doc,
|
885
|
+
int num_docs, Filter *filter, Sort *sort)
|
886
|
+
{
|
887
|
+
int max_size = first_doc + num_docs;
|
888
|
+
int i;
|
889
|
+
Weight *weight;
|
890
|
+
Hit **score_docs = NULL;
|
891
|
+
BitVector *bits = (filter ? filter->get_bv(filter, self->ir) : NULL);
|
892
|
+
Hit *(*hq_pop)(PriorityQueue *pq);
|
893
|
+
void (*hq_insert)(PriorityQueue *pq, Hit *hit);
|
894
|
+
void (*hq_destroy)(PriorityQueue *self);
|
895
|
+
PriorityQueue *hq;
|
896
|
+
struct MultiSearchArg ms_arg;
|
897
|
+
|
898
|
+
|
899
|
+
if (num_docs <= 0)
|
900
|
+
RAISE(ARG_ERROR, NUM_DOCS_ARG_ERROR_MSG);
|
901
|
+
|
902
|
+
if (first_doc < 0)
|
903
|
+
RAISE(ARG_ERROR, FIRST_DOC_ARG_ERROR_MSG);
|
904
|
+
|
905
|
+
weight = q_weight(query, self);
|
906
|
+
if (sort) {
|
907
|
+
hq = fshq_pq_create(max_size, sort, self->ir);
|
908
|
+
hq_pop = &fshq_pq_pop;
|
909
|
+
hq_insert = &fshq_pq_insert;
|
910
|
+
hq_destroy = &fshq_pq_destroy;
|
911
|
+
} else {
|
912
|
+
hq = pq_create(max_size, &hit_less_than);
|
913
|
+
hq_pop = &hit_pq_pop;
|
914
|
+
hq_insert = &hit_pq_insert;
|
915
|
+
hq_destroy = &pq_destroy;
|
916
|
+
}
|
917
|
+
|
918
|
+
|
919
|
+
ms_arg.hq = hq;
|
920
|
+
ms_arg.total_hits = 0;
|
921
|
+
ms_arg.max_size = max_size;
|
922
|
+
ms_arg.min_score = 0.0;
|
923
|
+
ms_arg.hq_insert = hq_insert;
|
924
|
+
|
925
|
+
msea_search_each_w(self, weight, filter, msea_search_i, &ms_arg);
|
926
|
+
|
927
|
+
weight->destroy(weight);
|
928
|
+
|
929
|
+
if (hq->count > first_doc) {
|
930
|
+
if ((hq->count - first_doc) < num_docs) {
|
931
|
+
num_docs = hq->count - first_doc;
|
932
|
+
}
|
933
|
+
score_docs = ALLOC_N(Hit *, num_docs);
|
934
|
+
for (i = num_docs - 1; i >= 0; i--) {
|
935
|
+
score_docs[i] = hq_pop(hq);
|
936
|
+
//hit = score_docs[i] = pq_pop(hq);
|
937
|
+
//printf("hit = %d-->%f\n", hit->doc, hit->score);
|
938
|
+
}
|
939
|
+
} else {
|
940
|
+
num_docs = 0;
|
941
|
+
}
|
942
|
+
pq_clear(hq);
|
943
|
+
hq_destroy(hq);
|
944
|
+
|
945
|
+
if (bits) bv_destroy(bits);
|
946
|
+
return td_create(ms_arg.total_hits, num_docs, score_docs);
|
947
|
+
}
|
948
|
+
|
949
|
+
static Query *msea_rewrite(Searcher *self, Query *original)
|
950
|
+
{
|
951
|
+
int i;
|
952
|
+
Searcher *s;
|
953
|
+
MultiSearcher *msea = (MultiSearcher *)self->data;
|
954
|
+
Query **queries = ALLOC_N(Query *, msea->s_cnt), *rewritten;
|
955
|
+
|
956
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
957
|
+
s = msea->searchers[i];
|
958
|
+
queries[i] = s->rewrite(s, original);
|
959
|
+
}
|
960
|
+
rewritten = q_combine(queries, msea->s_cnt);
|
961
|
+
|
962
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
963
|
+
q_deref(queries[i]);
|
964
|
+
}
|
965
|
+
free(queries);
|
966
|
+
return rewritten;
|
967
|
+
}
|
968
|
+
|
969
|
+
static Explanation *msea_explain(Searcher *self, Query *query, int doc_num)
|
970
|
+
{
|
971
|
+
MultiSearcher *msea = (MultiSearcher *)self->data;
|
972
|
+
int i = msea_get_searcher_index(self, doc_num);
|
973
|
+
Weight *w = q_weight(query, self);
|
974
|
+
Searcher *s = msea->searchers[i];
|
975
|
+
Explanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
|
976
|
+
w->destroy(w);
|
977
|
+
return e;
|
978
|
+
}
|
979
|
+
|
980
|
+
static Explanation *msea_explain_w(Searcher *self, Weight *w, int doc_num)
|
981
|
+
{
|
982
|
+
MultiSearcher *msea = (MultiSearcher *)self->data;
|
983
|
+
int i = msea_get_searcher_index(self, doc_num);
|
984
|
+
Searcher *s = msea->searchers[i];
|
985
|
+
Explanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
|
986
|
+
return e;
|
987
|
+
}
|
988
|
+
|
989
|
+
static Similarity *msea_get_similarity(Searcher *self)
|
990
|
+
{
|
991
|
+
return self->similarity;
|
992
|
+
}
|
993
|
+
|
994
|
+
static void msea_close(Searcher *self)
|
995
|
+
{
|
996
|
+
int i;
|
997
|
+
Searcher *s;
|
998
|
+
MultiSearcher *msea = (MultiSearcher *)self->data;
|
999
|
+
if (msea->close_subs) {
|
1000
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
1001
|
+
s = msea->searchers[i];
|
1002
|
+
s->close(s);
|
1003
|
+
}
|
1004
|
+
free(msea->searchers);
|
1005
|
+
}
|
1006
|
+
free(msea->starts);
|
1007
|
+
free(msea);
|
1008
|
+
free(self);
|
1009
|
+
}
|
1010
|
+
|
1011
|
+
Searcher *msea_create(Searcher **searchers, int s_cnt, bool close_subs)
|
1012
|
+
{
|
1013
|
+
int i, max_doc = 0, *starts;
|
1014
|
+
Searcher *self = ALLOC(Searcher);
|
1015
|
+
|
1016
|
+
MultiSearcher *msea = ALLOC(MultiSearcher);
|
1017
|
+
|
1018
|
+
starts = ALLOC_N(int, s_cnt + 1);
|
1019
|
+
for (i = 0; i < s_cnt; i++) {
|
1020
|
+
starts[i] = max_doc;
|
1021
|
+
max_doc += searchers[i]->max_doc(searchers[i]);
|
1022
|
+
}
|
1023
|
+
starts[i] = max_doc;
|
1024
|
+
|
1025
|
+
msea->s_cnt = s_cnt;
|
1026
|
+
msea->searchers = searchers;
|
1027
|
+
msea->starts = starts;
|
1028
|
+
msea->max_doc = max_doc;
|
1029
|
+
msea->close_subs = close_subs;
|
1030
|
+
self->data = msea;
|
1031
|
+
|
1032
|
+
self->ir = (IndexReader *)NULL;
|
1033
|
+
self->similarity = sim_create_default();
|
1034
|
+
self->doc_freq = &msea_doc_freq;
|
1035
|
+
self->doc_freqs = &ss_doc_freqs;
|
1036
|
+
self->get_doc = &msea_get_doc;
|
1037
|
+
self->max_doc = &msea_max_doc;
|
1038
|
+
self->create_weight = &msea_create_weight;
|
1039
|
+
self->search = &msea_search;
|
1040
|
+
self->search_each = &msea_search_each;
|
1041
|
+
self->search_each_w = &msea_search_each_w;
|
1042
|
+
self->rewrite = &msea_rewrite;
|
1043
|
+
self->explain = &msea_explain;
|
1044
|
+
self->explain_w = &msea_explain_w;
|
1045
|
+
self->get_similarity = &msea_get_similarity;
|
1046
|
+
self->close = &msea_close;
|
1047
|
+
return self;
|
1048
|
+
}
|