ferret 0.3.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +9 -0
- data/Rakefile +51 -25
- data/ext/analysis.c +553 -0
- data/ext/analysis.h +76 -0
- data/ext/array.c +83 -0
- data/ext/array.h +19 -0
- data/ext/bitvector.c +164 -0
- data/ext/bitvector.h +29 -0
- data/ext/compound_io.c +335 -0
- data/ext/document.c +336 -0
- data/ext/document.h +87 -0
- data/ext/ferret.c +88 -47
- data/ext/ferret.h +43 -109
- data/ext/field.c +395 -0
- data/ext/filter.c +103 -0
- data/ext/fs_store.c +352 -0
- data/ext/global.c +219 -0
- data/ext/global.h +73 -0
- data/ext/hash.c +446 -0
- data/ext/hash.h +80 -0
- data/ext/hashset.c +141 -0
- data/ext/hashset.h +37 -0
- data/ext/helper.c +11 -0
- data/ext/helper.h +5 -0
- data/ext/inc/lang.h +41 -0
- data/ext/ind.c +389 -0
- data/ext/index.h +884 -0
- data/ext/index_io.c +269 -415
- data/ext/index_rw.c +2543 -0
- data/ext/lang.c +31 -0
- data/ext/lang.h +41 -0
- data/ext/priorityqueue.c +228 -0
- data/ext/priorityqueue.h +44 -0
- data/ext/q_boolean.c +1331 -0
- data/ext/q_const_score.c +154 -0
- data/ext/q_fuzzy.c +287 -0
- data/ext/q_match_all.c +142 -0
- data/ext/q_multi_phrase.c +343 -0
- data/ext/q_parser.c +2180 -0
- data/ext/q_phrase.c +657 -0
- data/ext/q_prefix.c +75 -0
- data/ext/q_range.c +247 -0
- data/ext/q_span.c +1566 -0
- data/ext/q_term.c +308 -0
- data/ext/q_wildcard.c +146 -0
- data/ext/r_analysis.c +255 -0
- data/ext/r_doc.c +578 -0
- data/ext/r_index_io.c +996 -0
- data/ext/r_qparser.c +158 -0
- data/ext/r_search.c +2321 -0
- data/ext/r_store.c +263 -0
- data/ext/r_term.c +219 -0
- data/ext/ram_store.c +447 -0
- data/ext/search.c +524 -0
- data/ext/search.h +1065 -0
- data/ext/similarity.c +143 -39
- data/ext/sort.c +661 -0
- data/ext/store.c +35 -0
- data/ext/store.h +152 -0
- data/ext/term.c +704 -143
- data/ext/termdocs.c +599 -0
- data/ext/vector.c +594 -0
- data/lib/ferret.rb +9 -10
- data/lib/ferret/analysis/analyzers.rb +2 -2
- data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
- data/lib/ferret/analysis/token.rb +14 -14
- data/lib/ferret/analysis/token_filters.rb +3 -3
- data/lib/ferret/document/field.rb +16 -17
- data/lib/ferret/index/document_writer.rb +4 -4
- data/lib/ferret/index/index.rb +39 -23
- data/lib/ferret/index/index_writer.rb +2 -2
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
- data/lib/ferret/index/segment_term_vector.rb +4 -4
- data/lib/ferret/index/term.rb +5 -1
- data/lib/ferret/index/term_vector_offset_info.rb +6 -6
- data/lib/ferret/index/term_vectors_io.rb +5 -5
- data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
- data/lib/ferret/search.rb +1 -1
- data/lib/ferret/search/boolean_query.rb +2 -1
- data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
- data/lib/ferret/search/fuzzy_query.rb +2 -1
- data/lib/ferret/search/index_searcher.rb +3 -0
- data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
- data/lib/ferret/search/multi_phrase_query.rb +6 -5
- data/lib/ferret/search/phrase_query.rb +3 -6
- data/lib/ferret/search/prefix_query.rb +4 -4
- data/lib/ferret/search/sort.rb +3 -1
- data/lib/ferret/search/sort_field.rb +9 -9
- data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
- data/lib/ferret/search/spans/span_near_query.rb +1 -1
- data/lib/ferret/search/spans/span_weight.rb +1 -1
- data/lib/ferret/search/spans/spans_enum.rb +7 -7
- data/lib/ferret/store/fs_store.rb +10 -6
- data/lib/ferret/store/ram_store.rb +3 -3
- data/lib/rferret.rb +36 -0
- data/test/functional/thread_safety_index_test.rb +2 -2
- data/test/test_helper.rb +16 -2
- data/test/unit/analysis/c_token.rb +25 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
- data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
- data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
- data/test/unit/document/c_field.rb +98 -0
- data/test/unit/document/tc_field.rb +0 -66
- data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
- data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
- data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
- data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
- data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
- data/test/unit/index/tc_segment_term_vector.rb +2 -2
- data/test/unit/index/tc_term_vectors_io.rb +4 -4
- data/test/unit/query_parser/c_query_parser.rb +138 -0
- data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
- data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
- data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
- data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
- data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
- data/test/unit/search/c_sort_field.rb +27 -0
- data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +7 -20
- data/test/unit/store/c_fs_store.rb +76 -0
- data/test/unit/store/c_ram_store.rb +35 -0
- data/test/unit/store/m_store.rb +34 -0
- data/test/unit/store/m_store_lock.rb +68 -0
- data/test/unit/store/tc_fs_store.rb +0 -53
- data/test/unit/store/tc_ram_store.rb +0 -20
- data/test/unit/store/tm_store.rb +0 -30
- data/test/unit/store/tm_store_lock.rb +0 -66
- metadata +84 -31
- data/ext/Makefile +0 -140
- data/ext/ferret_ext.so +0 -0
- data/ext/priority_queue.c +0 -232
- data/ext/ram_directory.c +0 -321
- data/ext/segment_merge_queue.c +0 -37
- data/ext/segment_term_enum.c +0 -326
- data/ext/string_helper.c +0 -42
- data/ext/tags +0 -344
- data/ext/term_buffer.c +0 -230
- data/ext/term_infos_reader.c +0 -54
- data/ext/terminfo.c +0 -160
- data/ext/token.c +0 -93
- data/ext/util.c +0 -12
data/ext/lang.c
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
#include <stdarg.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <errno.h>
|
5
|
+
#include "global.h"
|
6
|
+
|
7
|
+
/*
 * Build an error message and raise it as a Ruby exception.
 *
 * The message has the form
 *   "<progname>: Error occured at <file>:line\n<formatted text>[ <strerror>]\n"
 * where the "<progname>: " prefix is only present when progname() returns
 * non-NULL, and the strerror(errno) suffix is only appended when +fmt+ ends
 * with a ':' character.
 *
 * file      - source file name reported in the message (normally __FILE__)
 * line_num  - source line reported in the message (normally __LINE__)
 * etype     - Ruby exception class handed to rb_raise()
 * fmt, ...  - printf-style format and arguments for the message body
 *
 * Every write is bounded by the size of the local buffer, so an over-long
 * message is truncated instead of overrunning the stack. Control is handed to
 * rb_raise() at the end, so callers should not expect this function to return.
 */
void ft_raise(char *file, int line_num, VALUE etype, const char *fmt, ...)
{
  va_list args;
  char buf[MAX_ERROR_LEN];
  size_t len = 0;
  /* Capture errno first: the formatting calls below are allowed to change it. */
  const int saved_errno = errno;
  const size_t fmt_len = strlen(fmt);
  const char *prog = progname();

  buf[0] = '\0';
  if (prog != NULL) {
    snprintf(buf, sizeof(buf), "%s: ", prog);
    len = strlen(buf);
  }

  /* len is always < sizeof(buf) because each write is NUL-terminated. */
  snprintf(buf + len, sizeof(buf) - len,
           "Error occured at <%s>:%d\n", file, line_num);
  len += strlen(buf + len);

  va_start(args, fmt);
  vsnprintf(buf + len, sizeof(buf) - len, fmt, args);
  va_end(args);
  len += strlen(buf + len);

  if (fmt_len > 0 && fmt[fmt_len - 1] == ':') {
    snprintf(buf + len, sizeof(buf) - len, " %s", strerror(saved_errno));
    len += strlen(buf + len);
  }
  snprintf(buf + len, sizeof(buf) - len, "\n");

  /* Pass the text as an argument: it may contain '%' characters. */
  rb_raise(etype, "%s", buf);
}
|
data/ext/lang.h
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
#ifndef FRT_LANG_H
|
2
|
+
#define FRT_LANG_H
|
3
|
+
|
4
|
+
#include <ruby.h>
|
5
|
+
|
6
|
+
#define FERRET_EXT
|
7
|
+
|
8
|
+
#define MAX_ERROR_LEN 2048
|
9
|
+
#define eprintf(...) ft_raise(__FILE__, __LINE__, __VA_ARGS__)
|
10
|
+
extern void ft_raise(char *file, int line_num, VALUE etype, const char *fmt, ...);
|
11
|
+
extern void weprintf(const char *fmt, ...);
|
12
|
+
extern char *progname(void);
|
13
|
+
extern void setprogname(const char *str);
|
14
|
+
|
15
|
+
extern VALUE cQueryParseException;
|
16
|
+
|
17
|
+
#define ERROR rb_eException
|
18
|
+
#define IO_ERROR rb_eIOError
|
19
|
+
#define ARG_ERROR rb_eArgError
|
20
|
+
#define EOF_ERROR rb_eEOFError
|
21
|
+
#define UNSUPPORTED_ERROR rb_eNotImpError
|
22
|
+
#define STATE_ERROR rb_eException
|
23
|
+
#define PARSE_ERROR cQueryParseException
|
24
|
+
#define MEM_ERROR rb_eNoMemError
|
25
|
+
|
26
|
+
typedef void * mutex_t;
|
27
|
+
typedef void * thread_key_t;
|
28
|
+
#define MUTEX_INITIALIZER NULL
|
29
|
+
#define MUTEX_RECURSIVE_INITIALIZER NULL
|
30
|
+
#define mutex_init(a, b)
|
31
|
+
#define mutex_lock(a)
|
32
|
+
#define mutex_trylock(a)
|
33
|
+
#define mutex_unlock(a)
|
34
|
+
#define mutex_destroy(a)
|
35
|
+
#define thread_key_create(a, b)
|
36
|
+
#define thread_key_delete(a)
|
37
|
+
#define thread_setspecific(a, b)
|
38
|
+
#define thread_getspecific(a) NULL
|
39
|
+
#define thread_exit(a)
|
40
|
+
|
41
|
+
#endif
|
data/ext/priorityqueue.c
ADDED
@@ -0,0 +1,228 @@
|
|
1
|
+
#include <priorityqueue.h>
|
2
|
+
|
3
|
+
/*
 * Allocate an empty priority queue that can hold up to max_size elements,
 * ordered by the less_than callback.
 *
 * The heap array gets max_size + 1 slots because elements are stored from
 * index 1 upwards. Elements are released with free() unless the caller
 * replaces the free_elem field afterwards.
 */
PriorityQueue *pq_create(int max_size, bool (*less_than)(void *p1, void *p2))
{
  PriorityQueue *queue = ALLOC(PriorityQueue);

  queue->heap = ALLOC_N(void *, (max_size + 1));
  queue->size = max_size;
  queue->count = 0;
  queue->free_elem = &free;
  queue->lt = less_than;

  return queue;
}
|
13
|
+
|
14
|
+
void pq_destroy(void *p)
|
15
|
+
{
|
16
|
+
PriorityQueue *pq = (PriorityQueue *)p;
|
17
|
+
free(pq->heap);
|
18
|
+
free(p);
|
19
|
+
}
|
20
|
+
|
21
|
+
/*
 * Sift the last element of the heap (index pq->count) up towards the root
 * until its parent is no longer greater than it according to pq->lt.
 */
void pq_up(PriorityQueue *pq)
{
  void **heap = pq->heap;
  int child = pq->count;
  int parent = child >> 1;
  void *moving = heap[child];

  /* Pull parents down into the hole until the right slot is found. */
  for (; (parent > 0) && pq->lt(moving, heap[parent]); parent >>= 1) {
    heap[child] = heap[parent];
    child = parent;
  }
  heap[child] = moving;
}
|
36
|
+
|
37
|
+
/*
 * Sift the root element (index 1) down the heap until neither of its
 * children is less than it according to pq->lt.
 */
void pq_down(PriorityQueue *pq)
{
  const int last = pq->count;
  void **heap = pq->heap;
  void *moving = heap[1];       /* element being re-positioned */
  int hole = 1;                 /* slot currently without an owner */
  int child = 2;                /* candidate child to move into the hole */

  /* Choose the smaller of the two children of the root. */
  if ((child + 1 <= last) && pq->lt(heap[child + 1], heap[child])) {
    child++;
  }

  while ((child <= last) && pq->lt(heap[child], moving)) {
    heap[hole] = heap[child];   /* shift the child up */
    hole = child;
    child = hole << 1;
    if ((child + 1 <= last) && pq->lt(heap[child + 1], heap[child])) {
      child++;
    }
  }
  heap[hole] = moving;
}
|
59
|
+
|
60
|
+
/*
 * Append elem as the new last heap entry and restore heap order. No capacity
 * check is performed here; pq_insert() is the bounded entry point.
 */
void pq_push(PriorityQueue *pq, void *elem)
{
  const int slot = pq->count + 1;

  pq->heap[slot] = elem;
  pq->count = slot;
  pq_up(pq);
}
|
66
|
+
|
67
|
+
/* Return the element stored at the root of the heap without removing it. */
void *pq_top(PriorityQueue *pq)
{
  void **heap = pq->heap;

  return heap[1];
}
|
71
|
+
|
72
|
+
/*
 * Remove and return the root element of the heap, or NULL when the queue is
 * empty. Ownership of the returned element passes to the caller.
 */
void *pq_pop(PriorityQueue *pq)
{
  const int last = pq->count;
  void *result;

  if (last <= 0) {
    return NULL;
  }

  result = pq->heap[1];
  pq->heap[1] = pq->heap[last];   /* the last element becomes the new root */
  pq->heap[last] = NULL;
  pq->count = last - 1;
  pq_down(pq);                    /* restore heap order */
  return result;
}
|
85
|
+
|
86
|
+
/*
 * Release every element in the queue through pq->free_elem and reset the
 * queue to empty. The heap array itself is kept for reuse.
 */
void pq_clear(PriorityQueue *pq)
{
  int idx = 1;

  while (idx <= pq->count) {
    pq->free_elem(pq->heap[idx]);
    pq->heap[idx] = NULL;
    idx++;
  }
  pq->count = 0;
}
|
95
|
+
|
96
|
+
/*
 * Bounded insert. While there is room the element is simply pushed. Once the
 * queue is full the element replaces the current top if the top is less than
 * it (the displaced top is released with pq->free_elem); otherwise the new
 * element itself is released.
 *
 * Returns true when elem ended up in the queue, false when it was discarded.
 */
int pq_insert(PriorityQueue *pq, void *elem)
{
  if (pq->count < pq->size) {
    pq_push(pq, elem);
    return true;
  }

  if (pq->count > 0 && pq->lt(pq_top(pq), elem)) {
    pq->free_elem(pq->heap[1]);
    pq->heap[1] = elem;
    pq_down(pq);
    return true;
  }

  pq->free_elem(elem);
  return false;
}
|
111
|
+
|
112
|
+
/*****************************************************************************
|
113
|
+
*
|
114
|
+
* PriorityQueue2
|
115
|
+
*
|
116
|
+
*****************************************************************************/
|
117
|
+
|
118
|
+
PriorityQueue2 *pq2_create(int max_size,
|
119
|
+
bool (*less_than)(PriorityQueue2 *pq, void *p1, void *p2),
|
120
|
+
void (*destroy)(void *p))
|
121
|
+
{
|
122
|
+
PriorityQueue2 *pq = ALLOC(PriorityQueue2);
|
123
|
+
pq->count = 0;
|
124
|
+
pq->size = max_size;
|
125
|
+
pq->heap = ALLOC_N(void *, (max_size + 1));
|
126
|
+
pq->lt = less_than;
|
127
|
+
pq->free_elem = &free;
|
128
|
+
pq->destroy = destroy;
|
129
|
+
return pq;
|
130
|
+
}
|
131
|
+
|
132
|
+
void pq2_destroy(void *p)
|
133
|
+
{
|
134
|
+
PriorityQueue2 *pq = (PriorityQueue2 *)p;
|
135
|
+
free(pq->heap);
|
136
|
+
free(p);
|
137
|
+
}
|
138
|
+
|
139
|
+
/*
 * Sift the last element of the heap (index pq->count) up towards the root
 * until its parent is no longer greater than it according to pq->lt.
 */
void pq2_up(PriorityQueue2 *pq)
{
  void **heap = pq->heap;
  int child = pq->count;
  int parent = child >> 1;
  void *moving = heap[child];

  /* Pull parents down into the hole until the right slot is found. */
  for (; (parent > 0) && pq->lt(pq, moving, heap[parent]); parent >>= 1) {
    heap[child] = heap[parent];
    child = parent;
  }
  heap[child] = moving;
}
|
154
|
+
|
155
|
+
/*
 * Sift the root element (index 1) down the heap until neither of its
 * children is less than it according to pq->lt.
 */
void pq2_down(PriorityQueue2 *pq)
{
  const int last = pq->count;
  void **heap = pq->heap;
  void *moving = heap[1];       /* element being re-positioned */
  int hole = 1;                 /* slot currently without an owner */
  int child = 2;                /* candidate child to move into the hole */

  /* Choose the smaller of the two children of the root. */
  if ((child + 1 <= last) && pq->lt(pq, heap[child + 1], heap[child])) {
    child++;
  }

  while ((child <= last) && pq->lt(pq, heap[child], moving)) {
    heap[hole] = heap[child];   /* shift the child up */
    hole = child;
    child = hole << 1;
    if ((child + 1 <= last) && pq->lt(pq, heap[child + 1], heap[child])) {
      child++;
    }
  }
  heap[hole] = moving;
}
|
177
|
+
|
178
|
+
/*
 * Append elem as the new last heap entry and restore heap order. No capacity
 * check is performed here; pq2_insert() is the bounded entry point.
 */
void pq2_push(PriorityQueue2 *pq, void *elem)
{
  const int slot = pq->count + 1;

  pq->heap[slot] = elem;
  pq->count = slot;
  pq2_up(pq);
}
|
184
|
+
|
185
|
+
/* Return the element stored at the root of the heap without removing it. */
void *pq2_top(PriorityQueue2 *pq)
{
  void **heap = pq->heap;

  return heap[1];
}
|
189
|
+
|
190
|
+
/*
 * Remove and return the root element of the heap, or NULL when the queue is
 * empty. Ownership of the returned element passes to the caller.
 */
void *pq2_pop(PriorityQueue2 *pq)
{
  const int last = pq->count;
  void *result;

  if (last <= 0) {
    return NULL;
  }

  result = pq->heap[1];
  pq->heap[1] = pq->heap[last];   /* the last element becomes the new root */
  pq->heap[last] = NULL;
  pq->count = last - 1;
  pq2_down(pq);                   /* restore heap order */
  return result;
}
|
203
|
+
|
204
|
+
/*
 * Release every element in the queue through pq->free_elem and reset the
 * queue to empty. The heap array itself is kept for reuse.
 */
void pq2_clear(PriorityQueue2 *pq)
{
  int idx = 1;

  while (idx <= pq->count) {
    pq->free_elem(pq->heap[idx]);
    pq->heap[idx] = NULL;
    idx++;
  }
  pq->count = 0;
}
|
213
|
+
|
214
|
+
/*
 * Bounded insert. While there is room the element is simply pushed. Once the
 * queue is full the element replaces the current top if the top is less than
 * it (the displaced top is released with pq->free_elem); otherwise the new
 * element itself is released.
 *
 * Returns true when elem ended up in the queue, false when it was discarded.
 */
int pq2_insert(PriorityQueue2 *pq, void *elem)
{
  if (pq->count < pq->size) {
    pq2_push(pq, elem);
    return true;
  }

  if (pq->count > 0 && pq->lt(pq, pq2_top(pq), elem)) {
    pq->free_elem(pq->heap[1]);
    pq->heap[1] = elem;
    pq2_down(pq);
    return true;
  }

  pq->free_elem(elem);
  return false;
}
|
data/ext/priorityqueue.h
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
#ifndef FRT_PRIORITYQUEUE_H
|
2
|
+
#define FRT_PRIORITYQUEUE_H
|
3
|
+
|
4
|
+
#include "global.h"
|
5
|
+
|
6
|
+
/* Binary heap of opaque element pointers ordered by a caller-supplied predicate. */
typedef struct PriorityQueue PriorityQueue;

struct PriorityQueue {
  int count;                        /* number of elements currently stored */
  int size;                         /* maximum number of elements */
  void **heap;                      /* element slots; index 1 is the top */
  bool (*lt)(void *p1, void *p2);   /* true when p1 orders before p2 */
  void (*free_elem)(void *p1);      /* releases a single element */
};
|
13
|
+
|
14
|
+
PriorityQueue *pq_create(int max_size, bool (*less_than)(void *p1, void *p2));
|
15
|
+
void pq_destroy(void *p);
|
16
|
+
void pq_push(PriorityQueue *pq, void *elem);
|
17
|
+
void *pq_top(PriorityQueue *pq);
|
18
|
+
void *pq_pop(PriorityQueue *pq);
|
19
|
+
void pq_down(PriorityQueue *pq);
|
20
|
+
void pq_clear(PriorityQueue *pq);
|
21
|
+
int pq_insert(PriorityQueue *pq, void *elem);
|
22
|
+
#define pq_full(pq) ((pq)->count == (pq)->size)
|
23
|
+
|
24
|
+
/*
 * Variant of PriorityQueue whose comparison callback also receives the queue,
 * so the ordering can consult per-queue state.
 */
typedef struct PriorityQueue2 PriorityQueue2;

struct PriorityQueue2 {
  int count;                                            /* number of elements currently stored */
  int size;                                             /* maximum number of elements */
  void **heap;                                          /* element slots; index 1 is the top */
  void *data;                                           /* caller-owned payload; presumably read by lt -- confirm with users */
  bool (*lt)(PriorityQueue2 *pq, void *p1, void *p2);   /* true when p1 orders before p2 */
  void (*free_elem)(void *p);                           /* releases a single element */
  void (*destroy)(void *p);                             /* destructor callback supplied at creation */
};
|
33
|
+
|
34
|
+
PriorityQueue2 *pq2_create(int max_size,
|
35
|
+
bool (*less_than)(PriorityQueue2 *pq, void *p1, void *p2),
|
36
|
+
void (*destroy)(void *p));
|
37
|
+
void pq2_destroy(void *p);
|
38
|
+
void pq2_push(PriorityQueue2 *pq, void *elem);
|
39
|
+
void *pq2_top(PriorityQueue2 *pq);
|
40
|
+
void *pq2_pop(PriorityQueue2 *pq);
|
41
|
+
void pq2_down(PriorityQueue2 *pq);
|
42
|
+
void pq2_clear(PriorityQueue2 *pq);
|
43
|
+
int pq2_insert(PriorityQueue2 *pq, void *elem);
|
44
|
+
#endif
|
data/ext/q_boolean.c
ADDED
@@ -0,0 +1,1331 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include "search.h"
|
3
|
+
|
4
|
+
/***************************************************************************
|
5
|
+
*
|
6
|
+
* BooleanWeight
|
7
|
+
*
|
8
|
+
***************************************************************************/
|
9
|
+
|
10
|
+
/*
 * Add up sum_of_squared_weights() over every sub-weight whose clause is not
 * prohibited, then scale the total by the square of this weight's value.
 */
float bw_sum_of_squared_weights(Weight *self)
{
  BooleanWeight *bw = (BooleanWeight *)self->data;
  BooleanQuery *bq = (BooleanQuery *)self->query->data;
  float total = 0.0;
  int idx;

  for (idx = 0; idx < bw->w_cnt; idx++) {
    Weight *sub_weight;

    if (bq->clauses[idx]->is_prohibited) {
      continue;                     /* prohibited clauses do not contribute */
    }
    sub_weight = bw->weights[idx];
    total += sub_weight->sum_of_squared_weights(sub_weight);
  }

  total *= self->value * self->value;   /* apply this query's boost */
  return total;
}
|
30
|
+
|
31
|
+
/*
 * Multiply the normalization factor by this weight's value and pass the
 * result on to every sub-weight whose clause is not prohibited.
 */
void bw_normalize(Weight *self, float normalization_factor)
{
  BooleanWeight *bw = (BooleanWeight *)self->data;
  BooleanQuery *bq = (BooleanQuery *)self->query->data;
  int idx;

  normalization_factor *= self->value;  /* fold in the query boost */

  for (idx = 0; idx < bw->w_cnt; idx++) {
    Weight *sub_weight;

    if (bq->clauses[idx]->is_prohibited) {
      continue;
    }
    sub_weight = bw->weights[idx];
    sub_weight->normalize(sub_weight, normalization_factor);
  }
}
|
46
|
+
|
47
|
+
/*
 * Build a boolean scorer containing one sub-scorer per clause.
 *
 * A clause whose weight yields no scorer is skipped, unless the clause is
 * required: then nothing can match, the partially built scorer is destroyed
 * and NULL is returned.
 */
Scorer *bw_scorer(Weight *self, IndexReader *ir)
{
  Scorer *bsc = bsc_create(self->similarity);
  BooleanWeight *bw = (BooleanWeight *)self->data;
  BooleanQuery *bq = (BooleanQuery *)self->query->data;
  int idx;

  for (idx = 0; idx < bw->w_cnt; idx++) {
    BooleanClause *clause = bq->clauses[idx];
    Weight *sub_weight = bw->weights[idx];
    Scorer *sub_scorer = sub_weight->scorer(sub_weight, ir);

    if (sub_scorer) {
      bsc_add_scorer(bsc, sub_scorer, clause->occur);
      continue;
    }
    if (clause->is_required) {
      bsc->destroy(bsc);
      return NULL;
    }
  }

  return bsc;
}
|
70
|
+
|
71
|
+
/*
 * Return a newly allocated string of the form "BooleanWeight(<value>)".
 * The caller owns the returned string.
 */
char *bw_to_s(Weight *self)
{
  char value_str[32];
  size_t value_len;

  dbl_to_s(value_str, self->value);
  value_len = strlen(value_str);
  return epstrdup("BooleanWeight(%s)", value_len, value_str);
}
|
77
|
+
|
78
|
+
void bw_destroy(void *p)
|
79
|
+
{
|
80
|
+
Weight *weight = (Weight *)p;
|
81
|
+
BooleanWeight *bw = (BooleanWeight *)weight->data;
|
82
|
+
free(bw->weights);
|
83
|
+
free(bw);
|
84
|
+
free(weight);
|
85
|
+
}
|
86
|
+
|
87
|
+
/*
 * Explain the score of document doc_num for this boolean weight.
 *
 * Each clause is explained in turn. Matching non-prohibited clauses are
 * collected under a "sum of:" node. A matching prohibited clause, or a
 * required clause that did not match, short-circuits to a zero-valued
 * explanation. When exactly one clause matched the wrapper node is dropped,
 * and when the coord factor is not 1.0 the result is wrapped in a
 * "product of:" node together with the coord explanation.
 */
Explanation *bw_explain(Weight *self, IndexReader *ir, int doc_num)
{
  BooleanWeight *bw = (BooleanWeight *)self->data;
  BooleanQuery *bq = (BooleanQuery *)self->query->data;
  Explanation *sum_expl = expl_create(0.0, estrdup("sum of:"));
  Explanation *result;
  float coord_factor = 0.0;
  float sum = 0.0;
  int matched = 0;      /* non-prohibited clauses that matched */
  int max_coord = 0;    /* all non-prohibited clauses */
  int idx;

  for (idx = 0; idx < bw->w_cnt; idx++) {
    Weight *sub_weight = bw->weights[idx];
    BooleanClause *clause = bq->clauses[idx];
    Explanation *sub_expl = sub_weight->explain(sub_weight, ir, doc_num);
    const bool hit = (sub_expl->value > 0.0);

    if (!clause->is_prohibited) {
      max_coord++;
    }

    if (hit && !clause->is_prohibited) {
      expl_add_detail(sum_expl, sub_expl);
      sum += sub_expl->value;
      matched++;
      continue;
    }

    /* Every remaining path discards the sub-explanation. */
    expl_destoy(sub_expl);
    if (hit) {
      /* a prohibited clause matched */
      expl_destoy(sum_expl);
      return expl_create(0.0, estrdup("match prohibited"));
    }
    if (clause->is_required) {
      /* a required clause did not match */
      expl_destoy(sum_expl);
      return expl_create(0.0, estrdup("match required"));
    }
  }
  sum_expl->value = sum;

  if (matched == 1) {
    /* Only one clause matched: return it directly instead of the wrapper. */
    Explanation *wrapper = sum_expl;
    wrapper->dcnt = 0;              /* keep the detail alive when the wrapper dies */
    sum_expl = wrapper->details[0];
    expl_destoy(wrapper);
  }

  coord_factor = sim_coord(self->similarity, matched, max_coord);
  if (coord_factor == 1.0) {
    return sum_expl;                /* coord is a no-op, no product wrapper needed */
  }

  result = expl_create(sum * coord_factor, estrdup("product of:"));
  expl_add_detail(result, sum_expl);
  expl_add_detail(result, expl_create(coord_factor,
        epstrdup("coord(%d/%d)", 40, matched, max_coord)));
  return result;
}
|
145
|
+
|
146
|
+
/*
 * Create the Weight for a boolean query: a zeroed Weight wired to the bw_*
 * callbacks, carrying a BooleanWeight with one sub-weight (from q_weight())
 * per clause of the query.
 */
Weight *bw_create(Query *query, Searcher *searcher)
{
  BooleanQuery *bq = (BooleanQuery *)query->data;
  BooleanWeight *bw = ALLOC(BooleanWeight);
  Weight *self = ALLOC(Weight);
  int idx;

  ZEROSET(self, Weight, 1);

  /* state taken from the query */
  self->data = bw;
  self->query = query;
  self->value = query->boost;
  self->similarity = query->get_similarity(query, searcher);

  /* behaviour */
  self->get_query = &w_get_query;
  self->get_value = &w_get_value;
  self->sum_of_squared_weights = &bw_sum_of_squared_weights;
  self->normalize = &bw_normalize;
  self->scorer = &bw_scorer;
  self->explain = &bw_explain;
  self->to_s = &bw_to_s;
  self->destroy = &bw_destroy;

  /* one sub-weight per clause, in clause order */
  bw->w_cnt = bq->clause_cnt;
  bw->weights = ALLOC_N(Weight *, bw->w_cnt);
  for (idx = 0; idx < bw->w_cnt; idx++) {
    bw->weights[idx] = q_weight(bq->clauses[idx]->query, searcher);
  }

  return self;
}
|
175
|
+
|
176
|
+
/***************************************************************************
|
177
|
+
*
|
178
|
+
* BooleanClause
|
179
|
+
*
|
180
|
+
***************************************************************************/
|
181
|
+
|
182
|
+
/*
 * Set the occurrence type of a clause and derive its is_required and
 * is_prohibited flags from it.
 *
 *   BC_SHOULD   -> neither required nor prohibited
 *   BC_MUST     -> required
 *   BC_MUST_NOT -> prohibited
 *
 * Any other value is reported through eprintf() as an ARG_ERROR. The occur
 * field has already been overwritten at that point and the flags are left
 * untouched.
 */
void bc_set_occur(BooleanClause *self, unsigned int occur)
{
  self->occur = occur;
  switch (occur) {
    case BC_SHOULD:
      self->is_prohibited = false;
      self->is_required = false;
      break;
    case BC_MUST:
      self->is_prohibited = false;
      self->is_required = true;
      break;
    case BC_MUST_NOT:
      self->is_prohibited = true;
      self->is_required = false;
      break;
    default:
      /* occur is unsigned, so it must be formatted with %u rather than %d */
      eprintf(ARG_ERROR, "Invalid value %u for BooleanClause Type", occur);
  }
}
|
202
|
+
|
203
|
+
/*
 * Allocate a clause wrapping query with the given occurrence type. The
 * required/prohibited flags are filled in by bc_set_occur().
 */
BooleanClause *bc_create(Query *query, unsigned int occur)
{
  BooleanClause *clause = ALLOC(BooleanClause);

  clause->query = query;
  bc_set_occur(clause, occur);

  return clause;
}
|
210
|
+
|
211
|
+
/***************************************************************************
|
212
|
+
*
|
213
|
+
* BooleanQuery
|
214
|
+
*
|
215
|
+
***************************************************************************/
|
216
|
+
|
217
|
+
/*
 * Rewrite a boolean query.
 *
 * A query with exactly one non-prohibited clause is replaced by the rewritten
 * form of that clause, with this query's boost folded into it. Otherwise
 * every clause's rewritten query is stored in clause->rewritten and the
 * boolean query itself is returned.
 */
Query *bq_rewrite(Query *self, IndexReader *ir)
{
  BooleanQuery *bq = (BooleanQuery *)self->data;
  int idx;

  /* Single-clause optimisation: hand back the clause's own query. */
  if (bq->clause_cnt == 1 && !bq->clauses[0]->is_prohibited) {
    BooleanClause *only = bq->clauses[0];
    Query *rewritten = only->query->rewrite(only->query, ir);

    if (self->boost != 1.0) {
      /*
       * original_boost starts out as 0.0. A non-zero value means the query
       * has been boosted by an earlier rewrite, so the boost is recomputed
       * from the saved original rather than compounded.
       */
      if ((rewritten == only->query) && rewritten->original_boost) {
        rewritten->boost = rewritten->original_boost * self->boost;
      } else {
        rewritten->original_boost = rewritten->boost;
        rewritten->boost *= self->boost;
      }
    }
    return rewritten;
  }

  for (idx = 0; idx < bq->clause_cnt; idx++) {
    BooleanClause *clause = bq->clauses[idx];
    clause->rewritten = clause->query->rewrite(clause->query, ir);
  }
  return self;
}
|
251
|
+
|
252
|
+
/* Ask each clause's query, in order, to add its terms to the terms array. */
void bq_extract_terms(Query *self, Array *terms)
{
  BooleanQuery *bq = (BooleanQuery *)self->data;
  int idx = 0;

  while (idx < bq->clause_cnt) {
    Query *sub_query = bq->clauses[idx]->query;
    sub_query->extract_terms(sub_query, terms);
    idx++;
  }
}
|
262
|
+
|
263
|
+
/*
 * Render the boolean query as a newly allocated string owned by the caller.
 *
 * Clauses are separated by single spaces. Prohibited clauses are prefixed
 * with '-', required ones with '+', and nested boolean queries are wrapped in
 * parentheses. When the boost is not 1.0 the whole expression is wrapped as
 * "(...)^<boost>".
 */
char *bq_to_s(Query *self, char *field)
{
  BooleanQuery *bq = (BooleanQuery *)self->data;
  const bool boosted = (self->boost != 1.0);
  int capa = QUERY_STRING_START_SIZE;
  char *out = ALLOC_N(char, capa);
  int pos = 0;
  int idx;

  if (boosted) {
    out[pos++] = '(';
  }

  for (idx = 0; idx < bq->clause_cnt; idx++) {
    BooleanClause *clause = bq->clauses[idx];
    char *clause_str = clause->query->to_s(clause->query, field);
    const int clause_len = strlen(clause_str);
    const bool nested = (clause->query->type == BOOLEAN_QUERY);

    /* Room for the clause plus separator, sign, two parens and the NUL. */
    while ((capa - pos) < (clause_len + 5)) {
      capa *= 2;
      REALLOC_N(out, char, capa);
    }

    if (idx > 0) {
      out[pos++] = ' ';
    }
    if (clause->is_prohibited) {
      out[pos++] = '-';
    } else if (clause->is_required) {
      out[pos++] = '+';
    }

    if (nested) {
      out[pos++] = '(';
    }
    memcpy(out + pos, clause_str, sizeof(char) * clause_len);
    pos += clause_len;
    if (nested) {
      out[pos++] = ')';
    }

    free(clause_str);
  }

  if (boosted) {
    char dbuf[32];
    char *boost_str;
    int boost_len;

    dbl_to_s(dbuf, self->boost);
    boost_str = epstrdup(")^%s", strlen(dbuf), dbuf);
    boost_len = strlen(boost_str);
    REALLOC_N(out, char, pos + boost_len + 1);
    memcpy(out + pos, boost_str, sizeof(char) * boost_len);
    pos += boost_len;
    free(boost_str);
  }

  out[pos] = 0;
  return out;
}
|
327
|
+
|
328
|
+
void bq_destroy(void *p)
|
329
|
+
{
|
330
|
+
Query *self = (Query *)p;
|
331
|
+
BooleanQuery *bq = (BooleanQuery *)self->data;
|
332
|
+
BooleanClause *clause;
|
333
|
+
int i;
|
334
|
+
for (i = 0; i < bq->clause_cnt; i++) {
|
335
|
+
clause = bq->clauses[i];
|
336
|
+
if (self->destroy_all) clause->query->destroy(clause->query);
|
337
|
+
free(clause);
|
338
|
+
}
|
339
|
+
free(bq->clauses);
|
340
|
+
if (bq->similarity) {
|
341
|
+
bq->similarity->destroy(bq->similarity);
|
342
|
+
}
|
343
|
+
free(bq);
|
344
|
+
q_destroy(self);
|
345
|
+
}
|
346
|
+
|
347
|
+
/*
 * Coord callback used when coordination is disabled: every argument is
 * ignored and the factor is always 1.0.
 */
float bq_coord_disabled(Similarity *sim, int overlap, int max_overlap)
{
  (void)sim;
  (void)overlap;
  (void)max_overlap;

  return 1.0;
}
|
351
|
+
|
352
|
+
/*
 * Return the Similarity used by a coord-disabled boolean query.
 *
 * On first use the Similarity from q_get_similarity() is copied, its coord
 * callback is replaced by bq_coord_disabled and its destroy callback by
 * free(). The copy is cached on the query and returned on later calls.
 */
Similarity *bq_get_similarity(Query *self, Searcher *searcher)
{
  BooleanQuery *bq = (BooleanQuery *)self->data;

  if (bq->similarity == NULL) {
    Similarity *base = q_get_similarity(self, searcher);
    Similarity *copy = ALLOC(Similarity);

    memcpy(copy, base, sizeof(Similarity));
    copy->coord = &bq_coord_disabled;
    copy->destroy = &free;
    bq->similarity = copy;
  }

  return bq->similarity;
}
|
365
|
+
|
366
|
+
/*
 * Create an empty boolean query.
 *
 * When coord_disabled is true the query installs bq_get_similarity as its
 * get_similarity callback; otherwise the callback set up by q_create() is
 * kept. The clause array starts with BOOLEAN_CLAUSES_START_CAPA slots and the
 * clause limit is DEFAULT_MAX_CLAUSE_COUNT.
 */
Query *bq_create(bool coord_disabled)
{
  Query *self = q_create();
  BooleanQuery *bq = ALLOC(BooleanQuery);

  /* boolean-specific payload */
  bq->coord_disabled = coord_disabled;
  bq->similarity = NULL;
  bq->max_clause_cnt = DEFAULT_MAX_CLAUSE_COUNT;
  bq->clause_cnt = 0;
  bq->clause_capa = BOOLEAN_CLAUSES_START_CAPA;
  bq->clauses = ALLOC_N(BooleanClause *, BOOLEAN_CLAUSES_START_CAPA);

  /* generic query interface */
  self->data = bq;
  self->type = BOOLEAN_QUERY;
  self->create_weight = &bw_create;
  self->rewrite = &bq_rewrite;
  self->extract_terms = &bq_extract_terms;
  self->to_s = &bq_to_s;
  self->destroy = &bq_destroy;
  if (coord_disabled) {
    self->get_similarity = &bq_get_similarity;
  }

  return self;
}
|
389
|
+
|
390
|
+
/*
 * Append sub_query to the boolean query as a new clause with the given
 * occurrence type (BC_SHOULD, BC_MUST or BC_MUST_NOT).
 *
 * A STATE_ERROR is reported through eprintf() when the query already holds
 * max_clause_cnt clauses. The limit is checked before anything is allocated,
 * so a rejected call neither leaks a clause nor grows the clause array, and
 * the query never holds more than max_clause_cnt clauses.
 */
void bq_add_query(Query *self, Query *sub_query, unsigned int occur)
{
  BooleanQuery *bq = (BooleanQuery *)self->data;
  BooleanClause *bc;

  if (bq->clause_cnt >= bq->max_clause_cnt) {
    eprintf(STATE_ERROR, "Too many clauses.");
  }

  bc = bc_create(sub_query, occur);

  /* Double the clause array when it is full. */
  if (bq->clause_cnt >= bq->clause_capa) {
    bq->clause_capa *= 2;
    REALLOC_N(bq->clauses, BooleanClause *, bq->clause_capa);
  }
  bq->clauses[bq->clause_cnt] = bc;
  bq->clause_cnt++;
}
|
404
|
+
|
405
|
+
/***************************************************************************
|
406
|
+
*
|
407
|
+
* BooleanScorer
|
408
|
+
*
|
409
|
+
***************************************************************************/
|
410
|
+
|
411
|
+
/***************************************************************************
|
412
|
+
* Coordinator
|
413
|
+
***************************************************************************/
|
414
|
+
|
415
|
+
/*
 * Allocate a zero-initialised Coordinator bound to the given Similarity.
 * The coord_factors table is filled in later by coord_init().
 */
Coordinator *coord_create(Similarity *similarity)
{
  Coordinator *coord = ALLOC(Coordinator);

  ZEROSET(coord, Coordinator, 1);
  coord->similarity = similarity;

  return coord;
}
|
422
|
+
|
423
|
+
/*
 * Allocate and fill the coord factor table: one entry for every overlap count
 * from 0 to max_coord inclusive. Returns self.
 */
Coordinator *coord_init(Coordinator *self)
{
  int overlap = 0;

  self->coord_factors = ALLOC_N(float, self->max_coord + 1);

  while (overlap <= self->max_coord) {
    self->coord_factors[overlap] =
      sim_coord(self->similarity, overlap, self->max_coord);
    overlap++;
  }

  return self;
}
|
434
|
+
|
435
|
+
/***************************************************************************
|
436
|
+
* DisjunctionSumScorer
|
437
|
+
***************************************************************************/
|
438
|
+
|
439
|
+
/* Return the cumulative score stored for the current document. */
float dssc_score(Scorer *self)
{
  return ((DisjunctionSumScorer *)self->data)->cum_score;
}
|
444
|
+
|
445
|
+
/*
 * Create the scorer queue, ordered by scorer_doc_less_than, and insert every
 * sub-scorer whose next() call succeeds.
 */
void dssc_init_scorer_queue(DisjunctionSumScorer *dssc)
{
  PriorityQueue *queue = pq_create(dssc->ss_cnt, &scorer_doc_less_than);
  int idx;

  dssc->scorer_queue = queue;

  for (idx = 0; idx < dssc->ss_cnt; idx++) {
    Scorer *sub_scorer = dssc->sub_scorers[idx];

    if (!sub_scorer->next(sub_scorer)) {
      continue;                 /* next() failed: scorer stays out of the queue */
    }
    pq_insert(queue, sub_scorer);
  }
}
|
459
|
+
|
460
|
+
/*
 * Advance all sub-scorers past the current document and find the next
 * document matched by at least min_num_matches of them.
 *
 * On success self->doc, cum_score and num_matches describe the new match
 * and true is returned. Returns false once too few sub-scorers remain in
 * the queue for any further match.
 *
 * The caller must ensure the queue is not empty on entry.
 */
bool dssc_advance_after_current(Scorer *self)
{
  DisjunctionSumScorer *dssc = (DisjunctionSumScorer *)self->data;
  PriorityQueue *queue = dssc->scorer_queue;
  Scorer *head;

  for (;;) {
    /* the scorer at the head of the queue defines the candidate document */
    head = (Scorer *)pq_top(queue);
    self->doc = head->doc;
    dssc->cum_score = head->score(head);
    dssc->num_matches = 1;

    /* move every sub-scorer positioned on the candidate past it */
    for (;;) {
      if (head->next(head)) {
        pq_down(queue);
      } else {
        pq_pop(queue);
        if (queue->count < (dssc->min_num_matches - dssc->num_matches)) {
          /* not enough sub-scorers left for this document or any later one */
          return false;
        }
        if (queue->count == 0) {
          /* nothing left to advance; fall through to the final check */
          break;
        }
      }

      head = pq_top(queue);
      if (head->doc != self->doc) {
        /* all remaining sub-scorers are beyond the candidate document */
        break;
      }
      dssc->cum_score += head->score(head);
      dssc->num_matches++;
    }

    if (dssc->num_matches >= dssc->min_num_matches) {
      return true;
    }
    if (queue->count < dssc->min_num_matches) {
      return false;
    }
  }
}
|
500
|
+
|
501
|
+
/*
 * Move to the next matching document. The sub-scorer queue is built
 * lazily on the first call. Returns false when fewer than
 * min_num_matches sub-scorers are left in the queue.
 */
bool dssc_next(Scorer *self)
{
  DisjunctionSumScorer *dssc = (DisjunctionSumScorer *)self->data;

  if (!dssc->scorer_queue) {
    dssc_init_scorer_queue(dssc);
  }

  if (dssc->scorer_queue->count < dssc->min_num_matches) {
    return false;
  }
  return dssc_advance_after_current(self);
}
|
515
|
+
|
516
|
+
/*
 * Skip to the first match whose document number is >= doc_num. A target
 * at or before the current document is bumped to self->doc + 1, so the
 * scorer always moves forward.
 *
 * Sub-scorers that cannot reach the target are dropped from the queue;
 * false is returned once fewer than min_num_matches remain.
 */
bool dssc_skip_to(Scorer *self, int doc_num)
{
  DisjunctionSumScorer *dssc = (DisjunctionSumScorer *)self->data;
  PriorityQueue *queue;
  Scorer *head;
  int target;

  if (dssc->scorer_queue == NULL) {
    dssc_init_scorer_queue(dssc);
  }
  queue = dssc->scorer_queue;

  if (queue->count < dssc->min_num_matches) {
    return false;
  }

  target = (doc_num > self->doc) ? doc_num : self->doc + 1;

  for (;;) {
    head = pq_top(queue);
    if (head->doc >= target) {
      return dssc_advance_after_current(self);
    }
    if (head->skip_to(head, target)) {
      pq_down(queue);
      continue;
    }
    /* head is exhausted: drop it and check whether a match is still possible */
    pq_pop(queue);
    if (queue->count < dssc->min_num_matches) {
      return false;
    }
  }
}
|
547
|
+
|
548
|
+
/*
 * Build an explanation headed "At least N of:" (N = min_num_matches)
 * with one detail entry per sub-scorer, each obtained from that
 * sub-scorer's own explain() for +doc_num+.
 */
Explanation *dssc_explain(Scorer *self, int doc_num)
{
  DisjunctionSumScorer *dssc = (DisjunctionSumScorer *)self->data;
  Explanation *expl = expl_create(0.0,
      epstrdup("At least %d of:", 20, dssc->min_num_matches));
  int idx;

  for (idx = 0; idx < dssc->ss_cnt; idx++) {
    Scorer *child = dssc->sub_scorers[idx];
    expl_add_detail(expl, child->explain(child, doc_num));
  }

  return expl;
}
|
561
|
+
|
562
|
+
void dssc_destroy(void *p)
|
563
|
+
{
|
564
|
+
Scorer *self = (Scorer *)p;
|
565
|
+
DisjunctionSumScorer *dssc = (DisjunctionSumScorer *)self->data;
|
566
|
+
int i;
|
567
|
+
for (i = 0; i < dssc->ss_cnt; i++) {
|
568
|
+
dssc->sub_scorers[i]->destroy(dssc->sub_scorers[i]);
|
569
|
+
}
|
570
|
+
pq_destroy(dssc->scorer_queue);
|
571
|
+
scorer_destroy(self);
|
572
|
+
}
|
573
|
+
|
574
|
+
/*
 * Create a scorer that matches documents matched by at least
 * +min_num_matches+ of the +ss_cnt+ scorers in +sub_scorers+.
 *
 * The sub_scorers pointer is stored as-is (not copied). An ARG_ERROR is
 * reported through eprintf() when min_num_matches is not positive or
 * when fewer than two sub-scorers are supplied. The priority queue is
 * left NULL and is built on the first next()/skip_to().
 */
Scorer *disjunction_sum_scorer_create(Scorer **sub_scorers, int ss_cnt,
                                      int min_num_matches)
{
  Scorer *self = scorer_create(NULL);
  DisjunctionSumScorer *dssc = ALLOC(DisjunctionSumScorer);

  self->data = dssc;
  dssc->ss_cnt = ss_cnt;

  /* no current match yet: doc, score and match count start at -1 */
  self->doc = -1;
  dssc->cum_score = -1.0;
  dssc->num_matches = -1;
  dssc->coordinator = NULL;

  if (min_num_matches <= 0) {
    eprintf(ARG_ERROR, "Minimum nr of matches must be positive");
  }
  if (ss_cnt <= 1) {
    eprintf(ARG_ERROR, "There must be at least 2 sub_scorers");
  }

  dssc->sub_scorers = sub_scorers;
  dssc->min_num_matches = min_num_matches;
  dssc->scorer_queue = NULL;

  self->destroy = &dssc_destroy;
  self->explain = &dssc_explain;
  self->skip_to = &dssc_skip_to;
  self->next = &dssc_next;
  self->score = &dssc_score;

  return self;
}
|
610
|
+
|
611
|
+
/*
 * Counting variant of dssc_score(): adds the number of sub-scorers that
 * matched the current document to the shared coordinator's num_matches,
 * then returns the cumulative score.
 */
float cdssc_score(Scorer *self)
{
  DisjunctionSumScorer *dssc = (DisjunctionSumScorer *)self->data;
  Coordinator *coord = dssc->coordinator;

  coord->num_matches += dssc->num_matches;
  return dssc->cum_score;
}
|
617
|
+
|
618
|
+
/*
 * Create a DisjunctionSumScorer that reports its match count to
 * +coordinator+ whenever it is scored (score callback is cdssc_score).
 */
Scorer *counting_disjunction_sum_scorer_create(Coordinator *coordinator,
    Scorer **sub_scorers, int ss_cnt, int min_num_matches)
{
  Scorer *self =
    disjunction_sum_scorer_create(sub_scorers, ss_cnt, min_num_matches);

  ((DisjunctionSumScorer *)self->data)->coordinator = coordinator;
  self->score = &cdssc_score;

  return self;
}
|
628
|
+
|
629
|
+
/***************************************************************************
|
630
|
+
* ConjunctionScorer
|
631
|
+
***************************************************************************/
|
632
|
+
|
633
|
+
/*
 * Sort the sub-scorers with scorer_doc_cmp and reset the first/last
 * cursors to the two ends of the sorted array.
 */
void csc_sort_scorers(ConjunctionScorer *csc)
{
  const int count = csc->ss_cnt;

  qsort(csc->sub_scorers, count, sizeof(Scorer *), &scorer_doc_cmp);
  csc->last = count - 1;
  csc->first = 0;
}
|
639
|
+
|
640
|
+
/*
 * One-time initialisation of a ConjunctionScorer.
 *
 * Computes the coord factor for "all sub-scorers matched" and, when
 * +init_scorers+ is set, moves each sub-scorer to its first document.
 * Iteration stops at the first exhausted sub-scorer (csc->more becomes
 * false); only if all of them have a document are they sorted.
 */
void csc_init(Scorer *self, bool init_scorers)
{
  ConjunctionScorer *csc = (ConjunctionScorer *)self->data;
  int idx;

  csc->coord = sim_coord(self->similarity, csc->ss_cnt, csc->ss_cnt);
  csc->more = (csc->ss_cnt > 0);

  if (init_scorers) {
    for (idx = 0; csc->more && idx < csc->ss_cnt; idx++) {
      Scorer *child = csc->sub_scorers[idx];
      csc->more = child->next(child);
    }
    if (csc->more) {
      csc_sort_scorers(csc);
    }
  }

  csc->first_time = false;
}
|
663
|
+
|
664
|
+
/*
 * Append +scorer+ to the conjunction's sub-scorer array. RECAPA is
 * invoked first on the ss_cnt/ss_capa/sub_scorers fields (presumably to
 * grow the array when it is full -- confirm against the macro).
 */
void csc_add_scorer(ConjunctionScorer *csc, Scorer *scorer)
{
  RECAPA(csc, ss_cnt, ss_capa, sub_scorers, Scorer *);
  csc->sub_scorers[csc->ss_cnt] = scorer;
  csc->ss_cnt++;
}
|
669
|
+
|
670
|
+
/*
 * Score of the current document: the sum of all sub-scorer scores
 * multiplied by the conjunction's coord factor.
 */
float csc_score(Scorer *self)
{
  ConjunctionScorer *csc = (ConjunctionScorer *)self->data;
  float total = 0.0;
  int idx = 0;

  while (idx < csc->ss_cnt) {
    Scorer *child = csc->sub_scorers[idx];
    total += child->score(child);
    idx++;
  }

  total *= csc->coord;
  return total;
}
|
683
|
+
|
684
|
+
/*
 * Find the next document on which all sub-scorers agree.
 *
 * The sub-scorer array is used as a ring: while the scorer at `first`
 * is behind the one at `last`, it is skipped up to last's document and
 * then becomes the new `last`. The loop ends when first and last are on
 * the same document or a sub-scorer is exhausted (csc->more false).
 * self->doc is set to the document of the scorer at `first`.
 */
bool csc_do_next(Scorer *self)
{
  ConjunctionScorer *csc = (ConjunctionScorer *)self->data;
  Scorer *lead = csc->sub_scorers[csc->first];
  Scorer *tail = csc->sub_scorers[csc->last];

  while (csc->more && (lead->doc < tail->doc)) {
    /* bring the lagging scorer up to the furthest one */
    csc->more = lead->skip_to(lead, tail->doc);

    /* rotate: the scorer just advanced is now the furthest */
    csc->last = csc->first;
    tail = lead;
    csc->first = (csc->first + 1) % csc->ss_cnt;
    lead = csc->sub_scorers[csc->first];
  }

  self->doc = lead->doc;
  return csc->more;
}
|
702
|
+
|
703
|
+
/*
 * Advance to the next document matched by every sub-scorer. The first
 * call initialises the sub-scorers; later calls step the scorer at
 * `last` forward to trigger further scanning in csc_do_next().
 */
bool csc_next(Scorer *self)
{
  ConjunctionScorer *csc = (ConjunctionScorer *)self->data;

  if (csc->first_time) {
    csc_init(self, true);
  } else if (csc->more) {
    Scorer *furthest = csc->sub_scorers[csc->last];
    csc->more = furthest->next(furthest);
  }

  return csc_do_next(self);
}
|
715
|
+
|
716
|
+
/*
 * Skip every sub-scorer to +doc_num+ (stopping at the first one that is
 * exhausted), re-sort them if all are still live, and then look for the
 * next common document with csc_do_next().
 */
bool csc_skip_to(Scorer *self, int doc_num)
{
  ConjunctionScorer *csc = (ConjunctionScorer *)self->data;
  int idx;

  if (csc->first_time) {
    csc_init(self, true);
  }

  for (idx = 0; idx < csc->ss_cnt && csc->more; idx++) {
    Scorer *child = csc->sub_scorers[idx];
    csc->more = child->skip_to(child, doc_num);
  }

  if (csc->more) {
    /* documents changed, so the ordering must be rebuilt */
    csc_sort_scorers(csc);
  }

  return csc_do_next(self);
}
|
735
|
+
|
736
|
+
void csc_destroy(void *p)
|
737
|
+
{
|
738
|
+
Scorer *self = (Scorer *)p;
|
739
|
+
ConjunctionScorer *csc = (ConjunctionScorer *)self->data;
|
740
|
+
int i;
|
741
|
+
for (i = 0; i < csc->ss_cnt; i++) {
|
742
|
+
csc->sub_scorers[i]->destroy(csc->sub_scorers[i]);
|
743
|
+
}
|
744
|
+
free(csc->sub_scorers);
|
745
|
+
scorer_destroy(self);
|
746
|
+
}
|
747
|
+
|
748
|
+
/*
 * Create an empty ConjunctionScorer using +similarity+. Sub-scorers are
 * added afterwards (see csc_add_scorer()).
 *
 * NOTE(review): no explain callback is installed here; whatever
 * scorer_create() leaves in self->explain is what callers will get.
 */
Scorer *conjunction_scorer_create(Similarity *similarity)
{
  Scorer *self = scorer_create(similarity);
  ConjunctionScorer *csc = ALLOC(ConjunctionScorer);

  ZEROSET(csc, ConjunctionScorer, 1);
  csc->coordinator = NULL;
  csc->more = true;
  csc->first_time = true;
  self->data = csc;

  self->destroy = &csc_destroy;
  self->skip_to = &csc_skip_to;
  self->next = &csc_next;
  self->score = &csc_score;

  return self;
}
|
765
|
+
|
766
|
+
/*
 * Counting variant of csc_score(). The first time a given document is
 * scored (doc greater than last_scored_doc), the number of sub-scorers
 * is added to the coordinator's num_matches; repeated scoring of the
 * same document does not count again.
 */
float ccsc_score(Scorer *self)
{
  ConjunctionScorer *csc = (ConjunctionScorer *)self->data;
  int doc = self->doc;

  if (doc > csc->last_scored_doc) {
    csc->last_scored_doc = doc;
    csc->coordinator->num_matches += csc->ss_cnt;
  }

  return csc_score(self);
}
|
778
|
+
|
779
|
+
/*
 * Create a ConjunctionScorer over a private copy of the +ss_cnt+
 * pointers in +sub_scorers+, reporting matches to +coordinator+.
 *
 * The scorer is built with sim_create_default() as its similarity.
 * NOTE(review): nothing in this function records who releases that
 * similarity -- confirm ownership.
 */
Scorer *counting_conjunction_sum_scorer_create(Coordinator *coordinator,
    Scorer **sub_scorers, int ss_cnt)
{
  Scorer *self = conjunction_scorer_create(sim_create_default());
  ConjunctionScorer *csc = (ConjunctionScorer *)self->data;
  Scorer **copy = ALLOC_N(Scorer *, ss_cnt);

  memcpy(copy, sub_scorers, sizeof(Scorer *) * ss_cnt);

  csc->sub_scorers = copy;
  csc->ss_cnt = ss_cnt;
  csc->ss_capa = ss_cnt;
  csc->last_scored_doc = -1;
  csc->coordinator = coordinator;

  self->score = &ccsc_score;

  return self;
}
|
794
|
+
|
795
|
+
/***************************************************************************
|
796
|
+
* SingleMatchScorer
|
797
|
+
***************************************************************************/
|
798
|
+
|
799
|
+
/*
 * Count one match on the coordinator and return the wrapped scorer's
 * score.
 */
float smsc_score(Scorer *self)
{
  SingleMatchScorer *smsc = (SingleMatchScorer *)self->data;
  Scorer *inner = smsc->scorer;

  smsc->coordinator->num_matches++;
  return inner->score(inner);
}
|
805
|
+
|
806
|
+
/*
 * Forward next() to the wrapped scorer; on success mirror its document
 * number into self->doc.
 */
bool smsc_next(Scorer *self)
{
  Scorer *inner = ((SingleMatchScorer *)self->data)->scorer;

  if (!inner->next(inner)) {
    return false;
  }
  self->doc = inner->doc;
  return true;
}
|
815
|
+
|
816
|
+
/*
 * Forward skip_to() to the wrapped scorer; on success mirror its
 * document number into self->doc.
 */
bool smsc_skip_to(Scorer *self, int doc_num)
{
  Scorer *inner = ((SingleMatchScorer *)self->data)->scorer;

  if (!inner->skip_to(inner, doc_num)) {
    return false;
  }
  self->doc = inner->doc;
  return true;
}
|
825
|
+
|
826
|
+
/*
 * The explanation of a SingleMatchScorer is simply that of the scorer
 * it wraps.
 */
Explanation *smsc_explain(Scorer *self, int doc_num)
{
  SingleMatchScorer *smsc = (SingleMatchScorer *)self->data;

  return smsc->scorer->explain(smsc->scorer, doc_num);
}
|
831
|
+
|
832
|
+
void smsc_destroy(void *p)
|
833
|
+
{
|
834
|
+
Scorer *self = (Scorer *)p;
|
835
|
+
Scorer *scorer = ((SingleMatchScorer *)self->data)->scorer;
|
836
|
+
scorer->destroy(scorer);
|
837
|
+
scorer_destroy(self);
|
838
|
+
}
|
839
|
+
|
840
|
+
/*
 * Wrap +scorer+ so that each call to score() also increments
 * +coordinator+'s num_matches. The wrapper reuses the wrapped scorer's
 * similarity and destroys the wrapped scorer when it is destroyed.
 */
Scorer *single_match_scorer_create(Coordinator *coordinator, Scorer *scorer)
{
  SingleMatchScorer *smsc = ALLOC(SingleMatchScorer);
  Scorer *self = scorer_create(scorer->similarity);

  smsc->scorer = scorer;
  smsc->coordinator = coordinator;
  self->data = smsc;

  self->destroy = &smsc_destroy;
  self->explain = &smsc_explain;
  self->skip_to = &smsc_skip_to;
  self->next = &smsc_next;
  self->score = &smsc_score;

  return self;
}
|
855
|
+
|
856
|
+
/***************************************************************************
|
857
|
+
* ReqOptSumScorer
|
858
|
+
***************************************************************************/
|
859
|
+
|
860
|
+
/*
 * Score of the current required document, plus the optional scorer's
 * score when the optional scorer is positioned on the same document.
 *
 * The optional scorer is advanced lazily: on the first call it is
 * skipped to the current document, afterwards only when it lags behind.
 * When it cannot be advanced, SCORER_NULLIFY is applied to
 * rossc->opt_scorer and only the required score is returned from then
 * on.
 */
float rossc_score(Scorer *self)
{
  ReqOptSumScorer *rossc = (ReqOptSumScorer *)self->data;
  Scorer *required = rossc->req_scorer;
  Scorer *optional = rossc->opt_scorer;
  int cur_doc = required->doc;
  float req_score = required->score(required);

  if (rossc->first_time_opt) {
    rossc->first_time_opt = false;
    if (!optional->skip_to(optional, cur_doc)) {
      SCORER_NULLIFY(rossc->opt_scorer);
      return req_score;
    }
  } else {
    if (optional == NULL) {
      return req_score;
    }
    if ((optional->doc < cur_doc) && !optional->skip_to(optional, cur_doc)) {
      SCORER_NULLIFY(rossc->opt_scorer);
      return req_score;
    }
  }

  /* here the optional scorer is live and positioned at or after cur_doc */
  if (optional->doc == cur_doc) {
    return req_score + optional->score(optional);
  }
  return req_score;
}
|
886
|
+
|
887
|
+
/*
 * Iteration is driven by the required scorer alone: advance it and, on
 * success, copy its document number into self->doc.
 */
bool rossc_next(Scorer *self)
{
  Scorer *required = ((ReqOptSumScorer *)self->data)->req_scorer;

  if (!required->next(required)) {
    return false;
  }
  self->doc = required->doc;
  return true;
}
|
896
|
+
|
897
|
+
/*
 * Skip the required scorer to +doc_num+ and, on success, copy its
 * document number into self->doc. The optional scorer is not touched.
 */
bool rossc_skip_to(Scorer *self, int doc_num)
{
  Scorer *required = ((ReqOptSumScorer *)self->data)->req_scorer;

  if (!required->skip_to(required, doc_num)) {
    return false;
  }
  self->doc = required->doc;
  return true;
}
|
906
|
+
|
907
|
+
/*
 * Build a "required, optional:" explanation whose value is the scorer's
 * current score, with the required scorer's explanation as first detail
 * and the optional scorer's explanation as second detail when an
 * optional scorer is still attached.
 *
 * rossc->opt_scorer must be read AFTER self->score(self) has run:
 * rossc_score() applies SCORER_NULLIFY to that field when the optional
 * scorer is exhausted, so a pointer cached beforehand may be stale, and
 * the field may already be NULL from earlier scoring.
 */
Explanation *rossc_explain(Scorer *self, int doc_num)
{
  ReqOptSumScorer *rossc = (ReqOptSumScorer *)self->data;
  Scorer *req_scorer = rossc->req_scorer;
  Scorer *opt_scorer;

  Explanation *e = expl_create(self->score(self), estrdup("required, optional:"));
  expl_add_detail(e, req_scorer->explain(req_scorer, doc_num));

  opt_scorer = rossc->opt_scorer;
  if (opt_scorer != NULL) {
    expl_add_detail(e, opt_scorer->explain(opt_scorer, doc_num));
  }
  return e;
}
|
918
|
+
|
919
|
+
void rossc_destroy(void *p)
|
920
|
+
{
|
921
|
+
Scorer *self = (Scorer *)p;
|
922
|
+
ReqOptSumScorer *rossc = (ReqOptSumScorer *)self->data;
|
923
|
+
if (rossc->req_scorer) rossc->req_scorer->destroy(rossc->req_scorer);
|
924
|
+
if (rossc->opt_scorer) rossc->opt_scorer->destroy(rossc->opt_scorer);
|
925
|
+
scorer_destroy(self);
|
926
|
+
}
|
927
|
+
|
928
|
+
|
929
|
+
/*
 * Create a scorer that matches exactly the documents of +req_scorer+
 * and adds +opt_scorer+'s score on documents where both match. Both
 * sub-scorers are destroyed together with the returned scorer.
 */
Scorer *req_opt_sum_scorer_create(Scorer *req_scorer, Scorer *opt_scorer)
{
  ReqOptSumScorer *rossc = ALLOC(ReqOptSumScorer);
  Scorer *self = scorer_create(NULL);

  rossc->first_time_opt = true;
  rossc->opt_scorer = opt_scorer;
  rossc->req_scorer = req_scorer;
  self->data = rossc;

  self->destroy = &rossc_destroy;
  self->explain = &rossc_explain;
  self->skip_to = &rossc_skip_to;
  self->next = &rossc_next;
  self->score = &rossc_score;

  return self;
}
|
946
|
+
|
947
|
+
/***************************************************************************
|
948
|
+
* ReqExclScorer
|
949
|
+
***************************************************************************/
|
950
|
+
|
951
|
+
/*
 * Starting from the required scorer's current document, advance until a
 * document is found that the exclusion scorer does not match.
 *
 * Both scorers must be non-NULL and positioned on a document on entry.
 * Returns true with self->doc set to the accepted document. When the
 * exclusion scorer runs out it is released with SCORER_NULLIFY and the
 * current required document is accepted. When the required scorer runs
 * out it is released with SCORER_NULLIFY and false is returned.
 */
bool rxsc_to_non_excluded(Scorer *self)
{
  ReqExclScorer *rxsc = (ReqExclScorer *)self->data;
  Scorer *required = rxsc->req_scorer;
  Scorer *excluded = rxsc->excl_scorer;
  int excl_doc = excluded->doc;
  int req_doc;

  do {
    req_doc = required->doc;    /* candidate; may turn out to be excluded */

    if (req_doc < excl_doc) {
      /* required is still before the next exclusion: not excluded */
      self->doc = req_doc;
      return true;
    }

    if (req_doc > excl_doc) {
      if (!excluded->skip_to(excluded, req_doc)) {
        /* exclusion scorer exhausted: nothing can be excluded any more */
        SCORER_NULLIFY(rxsc->excl_scorer);
        self->doc = req_doc;
        return true;
      }
      excl_doc = excluded->doc;
      if (excl_doc > req_doc) {
        self->doc = req_doc;
        return true;
      }
    }
    /* req_doc == excl_doc: excluded, try the next required document */
  } while (required->next(required));

  SCORER_NULLIFY(rxsc->req_scorer);   /* required exhausted, nothing left */
  return false;
}
|
980
|
+
|
981
|
+
/*
 * Advance to the next required document that is not excluded.
 *
 * On the first call the exclusion scorer is moved to its first
 * document; if it has none it is released with SCORER_NULLIFY. An
 * exhausted required scorer is likewise released and false is returned
 * from then on.
 */
bool rxsc_next(Scorer *self)
{
  ReqExclScorer *rxsc = (ReqExclScorer *)self->data;
  Scorer *required = rxsc->req_scorer;
  Scorer *excluded = rxsc->excl_scorer;

  if (rxsc->first_time) {
    if (!excluded->next(excluded)) {
      SCORER_NULLIFY(rxsc->excl_scorer);    /* exhausted at start */
      excluded = NULL;
    }
    rxsc->first_time = false;
  }

  if (required == NULL) {
    return false;
  }

  if (!required->next(required)) {
    SCORER_NULLIFY(rxsc->req_scorer);       /* exhausted, nothing left */
    return false;
  }

  if (excluded != NULL) {
    return rxsc_to_non_excluded(self);
  }

  /* no exclusions remain, so the required document is the match */
  self->doc = required->doc;
  return true;
}
|
1007
|
+
|
1008
|
+
/*
 * Skip to the first non-excluded required document at or after
 * +doc_num+.
 *
 * On the first call the exclusion scorer is skipped to +doc_num+ too
 * and released with SCORER_NULLIFY if it has nothing there. Without an
 * exclusion scorer the required scorer's result is used directly;
 * otherwise an exhausted required scorer is released and false is
 * returned.
 */
bool rxsc_skip_to(Scorer *self, int doc_num)
{
  ReqExclScorer *rxsc = (ReqExclScorer *)self->data;
  Scorer *required = rxsc->req_scorer;
  Scorer *excluded = rxsc->excl_scorer;

  if (rxsc->first_time) {
    rxsc->first_time = false;
    if (!excluded->skip_to(excluded, doc_num)) {
      SCORER_NULLIFY(rxsc->excl_scorer);    /* exhausted */
      excluded = NULL;
    }
  }

  if (required == NULL) {
    return false;
  }

  if (excluded == NULL) {
    if (!required->skip_to(required, doc_num)) {
      return false;
    }
    self->doc = required->doc;
    return true;
  }

  if (!required->skip_to(required, doc_num)) {
    SCORER_NULLIFY(rxsc->req_scorer);
    return false;
  }

  return rxsc_to_non_excluded(self);
}
|
1037
|
+
|
1038
|
+
/*
 * The score of a non-excluded document is the required scorer's score.
 * The required scorer must still be attached when this is called.
 */
float rxsc_score(Scorer *self)
{
  ReqExclScorer *rxsc = (ReqExclScorer *)self->data;

  return rxsc->req_scorer->score(rxsc->req_scorer);
}
|
1043
|
+
|
1044
|
+
/*
 * Explain whether +doc_num+ is excluded.
 *
 * Returns an "excluded:" explanation when the exclusion scorer can be
 * skipped onto exactly +doc_num+. Otherwise returns a "not excluded:"
 * explanation carrying the required scorer's explanation as detail.
 *
 * Iteration (rxsc_next/rxsc_skip_to/rxsc_to_non_excluded) sets either
 * sub-scorer field to NULL once that scorer is exhausted, so both are
 * checked before use: a missing exclusion scorer means nothing is
 * excluded, and a missing required scorer means there is no detail to
 * attach.
 */
Explanation *rxsc_explain(Scorer *self, int doc_num)
{
  ReqExclScorer *rxsc = (ReqExclScorer *)self->data;
  Scorer *req_scorer = rxsc->req_scorer;
  Scorer *excl_scorer = rxsc->excl_scorer;
  Explanation *e;

  if (excl_scorer != NULL
      && excl_scorer->skip_to(excl_scorer, doc_num)
      && excl_scorer->doc == doc_num) {
    e = expl_create(0.0, estrdup("excluded:"));
  } else {
    e = expl_create(0.0, estrdup("not excluded:"));
    if (req_scorer != NULL) {
      expl_add_detail(e, req_scorer->explain(req_scorer, doc_num));
    }
  }
  return e;
}
|
1059
|
+
|
1060
|
+
void rxsc_destroy(void *p)
|
1061
|
+
{
|
1062
|
+
Scorer *self = (Scorer *)p;
|
1063
|
+
ReqExclScorer *rxsc = (ReqExclScorer *)self->data;
|
1064
|
+
if (rxsc->req_scorer) rxsc->req_scorer->destroy(rxsc->req_scorer);
|
1065
|
+
if (rxsc->excl_scorer) rxsc->excl_scorer->destroy(rxsc->excl_scorer);
|
1066
|
+
scorer_destroy(self);
|
1067
|
+
}
|
1068
|
+
|
1069
|
+
/*
 * Create a scorer that matches the documents of +req_scorer+ except
 * those also matched by +excl_scorer+. Both sub-scorers are destroyed
 * together with the returned scorer.
 */
Scorer *req_excl_scorer_create(Scorer *req_scorer, Scorer *excl_scorer)
{
  ReqExclScorer *rxsc = ALLOC(ReqExclScorer);
  Scorer *self = scorer_create(NULL);

  rxsc->first_time = true;
  rxsc->excl_scorer = excl_scorer;
  rxsc->req_scorer = req_scorer;
  self->data = rxsc;

  self->destroy = &rxsc_destroy;
  self->explain = &rxsc_explain;
  self->skip_to = &rxsc_skip_to;
  self->next = &rxsc_next;
  self->score = &rxsc_score;

  return self;
}
|
1086
|
+
|
1087
|
+
/***************************************************************************
|
1088
|
+
* NonMatchScorer
|
1089
|
+
***************************************************************************/
|
1090
|
+
|
1091
|
+
/*
 * A non-matching scorer has no documents; its score is always zero.
 */
float nmsc_score(Scorer *self)
{
  (void)self;   /* unused */
  return 0.0;
}
|
1095
|
+
|
1096
|
+
/*
 * A non-matching scorer never has a next document.
 */
bool nmsc_next(Scorer *self)
{
  (void)self;   /* unused */
  return false;
}
|
1100
|
+
|
1101
|
+
/*
 * A non-matching scorer can never skip to any document.
 */
bool nmsc_skip_to(Scorer *self, int doc_num)
{
  (void)self;     /* unused */
  (void)doc_num;  /* unused */
  return false;
}
|
1105
|
+
|
1106
|
+
/*
 * Explanation for a scorer that matches nothing: value 0.0 with a fixed
 * description, regardless of the document asked about.
 */
Explanation *nmsc_explain(Scorer *self, int doc_num)
{
  (void)self;     /* unused */
  (void)doc_num;  /* unused */
  return expl_create(0.0, estrdup("No documents matched"));
}
|
1110
|
+
|
1111
|
+
Scorer *non_matching_scorer_create()
|
1112
|
+
{
|
1113
|
+
Scorer *self = scorer_create(NULL);
|
1114
|
+
self->score = &nmsc_score;
|
1115
|
+
self->next = &nmsc_next;
|
1116
|
+
self->skip_to = &nmsc_skip_to;
|
1117
|
+
self->explain = &nmsc_explain;
|
1118
|
+
|
1119
|
+
return self;
|
1120
|
+
}
|
1121
|
+
|
1122
|
+
|
1123
|
+
/***************************************************************************
|
1124
|
+
* BooleanScorer
|
1125
|
+
***************************************************************************/
|
1126
|
+
|
1127
|
+
/*
 * Final assembly step: combine +req_scorer+ and +opt_scorer+ into a
 * ReqOptSumScorer, first wrapping the required side in a ReqExclScorer
 * when the BooleanScorer has prohibited scorers (a single prohibited
 * scorer is used directly, several are merged with a
 * DisjunctionSumScorer requiring one match).
 */
Scorer *counting_sum_scorer_create3(BooleanScorer *bsc, Scorer *req_scorer,
                                    Scorer *opt_scorer)
{
  Scorer *required = req_scorer;

  if (bsc->ps_cnt == 1) {
    /* exactly one prohibited scorer */
    required = req_excl_scorer_create(req_scorer, bsc->prohibited_scorers[0]);
  } else if (bsc->ps_cnt != 0) {
    /* several prohibited scorers */
    required = req_excl_scorer_create(req_scorer,
        disjunction_sum_scorer_create(bsc->prohibited_scorers, bsc->ps_cnt, 1));
  }

  return req_opt_sum_scorer_create(required, opt_scorer);
}
|
1143
|
+
|
1144
|
+
/*
 * Second assembly step: +req_scorer+ is already built; fold in the
 * optional scorers.
 *
 * With no optional scorers the result is the required scorer itself,
 * wrapped in a ReqExclScorer if there are prohibited scorers. Otherwise
 * the optional side becomes a SingleMatchScorer (one optional) or a
 * counting DisjunctionSumScorer (several) and assembly continues in
 * counting_sum_scorer_create3().
 */
Scorer *counting_sum_scorer_create2(BooleanScorer *bsc, Scorer *req_scorer,
                                    Scorer **optional_scorers, int os_cnt)
{
  Scorer *opt_scorer;

  if (os_cnt == 0) {
    if (bsc->ps_cnt == 0) {
      return req_scorer;
    }
    if (bsc->ps_cnt == 1) {
      return req_excl_scorer_create(req_scorer, bsc->prohibited_scorers[0]);
    }
    /* no optional, more than one prohibited */
    return req_excl_scorer_create(req_scorer,
        disjunction_sum_scorer_create(bsc->prohibited_scorers, bsc->ps_cnt, 1));
  }

  if (os_cnt == 1) {
    opt_scorer = single_match_scorer_create(bsc->coordinator,
                                            optional_scorers[0]);
  } else {
    opt_scorer = counting_disjunction_sum_scorer_create(bsc->coordinator,
                                                        optional_scorers,
                                                        os_cnt, 1);
  }

  return counting_sum_scorer_create3(bsc, req_scorer, opt_scorer);
}
|
1170
|
+
|
1171
|
+
/*
 * First assembly step: build the scorer tree for a BooleanScorer from
 * its required, optional and prohibited scorers.
 *
 * - No required and no optional scorers: the prohibited scorers are
 *   destroyed here and a non-matching scorer is returned.
 * - No required scorers: the optional scorers take the required role
 *   (at least one of them must match) and no optional scorers are left.
 * - Otherwise the required side is a SingleMatchScorer (one required)
 *   or a counting ConjunctionScorer (several).
 */
Scorer *counting_sum_scorer_create(BooleanScorer *bsc)
{
  Scorer *req_scorer;

  if (bsc->rs_cnt == 0) {
    if (bsc->os_cnt == 0) {
      /* only prohibited scorers: release them, nothing can ever match */
      int idx;
      for (idx = 0; idx < bsc->ps_cnt; idx++) {
        bsc->prohibited_scorers[idx]->destroy(bsc->prohibited_scorers[idx]);
      }
      return non_matching_scorer_create();
    }

    if (bsc->os_cnt == 1) {
      /* the only optional scorer is required */
      req_scorer = single_match_scorer_create(bsc->coordinator,
                                              bsc->optional_scorers[0]);
    } else {
      /* at least one of the optional scorers is required */
      req_scorer = counting_disjunction_sum_scorer_create(bsc->coordinator,
          bsc->optional_scorers, bsc->os_cnt, 1);
    }
    /* no optional scorers left */
    return counting_sum_scorer_create2(bsc, req_scorer, NULL, 0);
  }

  if (bsc->rs_cnt == 1) {
    req_scorer = single_match_scorer_create(bsc->coordinator,
                                            bsc->required_scorers[0]);
  } else {
    req_scorer = counting_conjunction_sum_scorer_create(bsc->coordinator,
        bsc->required_scorers, bsc->rs_cnt);
  }

  return counting_sum_scorer_create2(bsc, req_scorer,
                                     bsc->optional_scorers, bsc->os_cnt);
}
|
1206
|
+
|
1207
|
+
/*
 * Finish setting up a BooleanScorer once all clauses have been added:
 * build the coordinator's factor table, then assemble the scorer tree
 * and store it in bsc->counting_sum_scorer.
 */
void bsc_init_counting_sum_scorer(BooleanScorer *bsc)
{
  Scorer *tree;

  coord_init(bsc->coordinator);
  tree = counting_sum_scorer_create(bsc);
  bsc->counting_sum_scorer = tree;
}
|
1212
|
+
|
1213
|
+
/*
 * Add +scorer+ to the BooleanScorer +self+ in the role given by +occur+:
 *
 *   BC_MUST     -> required_scorers
 *   BC_SHOULD   -> optional_scorers
 *   BC_MUST_NOT -> prohibited_scorers
 *
 * Required and optional scorers each raise the coordinator's max_coord
 * by one; prohibited scorers do not. Any other value of +occur+ is
 * reported through eprintf(ARG_ERROR, ...) and leaves the scorer lists
 * and max_coord untouched, so the coordination table size always equals
 * the number of scorers actually stored.
 */
void bsc_add_scorer(Scorer *self, Scorer *scorer, unsigned int occur)
{
  BooleanScorer *bsc = (BooleanScorer *)self->data;

  switch (occur) {
    case BC_MUST:
      RECAPA(bsc, rs_cnt, rs_capa, required_scorers, Scorer *);
      bsc->required_scorers[bsc->rs_cnt++] = scorer;
      bsc->coordinator->max_coord++;
      break;
    case BC_SHOULD:
      RECAPA(bsc, os_cnt, os_capa, optional_scorers, Scorer *);
      bsc->optional_scorers[bsc->os_cnt++] = scorer;
      bsc->coordinator->max_coord++;
      break;
    case BC_MUST_NOT:
      RECAPA(bsc, ps_cnt, ps_capa, prohibited_scorers, Scorer *);
      bsc->prohibited_scorers[bsc->ps_cnt++] = scorer;
      break;
    default:
      eprintf(ARG_ERROR, "Unknown value for occur <%d>\n", occur);
  }
}
|
1237
|
+
|
1238
|
+
/*
 * Score the current document: reset the coordinator's match counter,
 * let the scorer tree compute the summed score (which also counts the
 * matching clauses), then scale by the coord factor for that count.
 *
 * The scorer tree must already exist, i.e. next() or skip_to() must
 * have been called first.
 */
float bsc_score(Scorer *self)
{
  BooleanScorer *bsc = (BooleanScorer *)self->data;
  Coordinator *coord = bsc->coordinator;
  Scorer *tree = bsc->counting_sum_scorer;
  float sum;

  coord->num_matches = 0;
  sum = tree->score(tree);

  return sum * coord->coord_factors[coord->num_matches];
}
|
1247
|
+
|
1248
|
+
/*
 * Advance to the next matching document. The scorer tree is assembled
 * lazily on first use; on success self->doc mirrors the tree's
 * document.
 */
bool bsc_next(Scorer *self)
{
  BooleanScorer *bsc = (BooleanScorer *)self->data;
  Scorer *tree;

  if (bsc->counting_sum_scorer == NULL) {
    bsc_init_counting_sum_scorer(bsc);
  }
  tree = bsc->counting_sum_scorer;

  if (!tree->next(tree)) {
    return false;
  }
  self->doc = tree->doc;
  return true;
}
|
1262
|
+
|
1263
|
+
/*
 * Skip to the first matching document at or after +doc_num+. The scorer
 * tree is assembled lazily on first use; on success self->doc mirrors
 * the tree's document.
 */
bool bsc_skip_to(Scorer *self, int doc_num)
{
  BooleanScorer *bsc = (BooleanScorer *)self->data;
  Scorer *tree;

  if (bsc->counting_sum_scorer == NULL) {
    bsc_init_counting_sum_scorer(bsc);
  }
  tree = bsc->counting_sum_scorer;

  if (!tree->skip_to(tree, doc_num)) {
    return false;
  }
  self->doc = tree->doc;
  return true;
}
|
1277
|
+
|
1278
|
+
void bsc_destroy(void *p)
|
1279
|
+
{
|
1280
|
+
Scorer *self = (Scorer *)p;
|
1281
|
+
BooleanScorer *bsc = (BooleanScorer *)self->data;
|
1282
|
+
Coordinator *coord = bsc->coordinator;
|
1283
|
+
|
1284
|
+
free(coord->coord_factors);
|
1285
|
+
free(coord);
|
1286
|
+
|
1287
|
+
if (bsc->counting_sum_scorer) {
|
1288
|
+
bsc->counting_sum_scorer->destroy(bsc->counting_sum_scorer);
|
1289
|
+
} else {
|
1290
|
+
int i;
|
1291
|
+
for (i = 0; i < bsc->rs_cnt; i++) {
|
1292
|
+
bsc->required_scorers[i]->destroy(bsc->required_scorers[i]);
|
1293
|
+
}
|
1294
|
+
|
1295
|
+
for (i = 0; i < bsc->os_cnt; i++) {
|
1296
|
+
bsc->optional_scorers[i]->destroy(bsc->optional_scorers[i]);
|
1297
|
+
}
|
1298
|
+
|
1299
|
+
for (i = 0; i < bsc->ps_cnt; i++) {
|
1300
|
+
bsc->prohibited_scorers[i]->destroy(bsc->prohibited_scorers[i]);
|
1301
|
+
}
|
1302
|
+
}
|
1303
|
+
free(bsc->required_scorers);
|
1304
|
+
free(bsc->optional_scorers);
|
1305
|
+
free(bsc->prohibited_scorers);
|
1306
|
+
scorer_destroy(self);
|
1307
|
+
}
|
1308
|
+
|
1309
|
+
/*
 * BooleanScorer does not produce a real explanation; a placeholder
 * with value 0.0 is returned for every document.
 */
Explanation *bsc_explain(Scorer *self, int doc_num)
{
  (void)self;     /* unused */
  (void)doc_num;  /* unused */
  return expl_create(0.0, estrdup("This explanation is not supported"));
}
|
1313
|
+
|
1314
|
+
/*
 * Create an empty BooleanScorer using +similarity+. Clause scorers are
 * added with bsc_add_scorer(); the scorer tree is left NULL and is
 * assembled on the first next()/skip_to().
 */
Scorer *bsc_create(Similarity *similarity)
{
  BooleanScorer *bsc = ALLOC(BooleanScorer);
  Scorer *self = scorer_create(similarity);

  ZEROSET(bsc, BooleanScorer, 1);
  bsc->counting_sum_scorer = NULL;
  bsc->coordinator = coord_create(similarity);
  self->data = bsc;

  self->destroy = &bsc_destroy;
  self->explain = &bsc_explain;
  self->skip_to = &bsc_skip_to;
  self->next = &bsc_next;
  self->score = &bsc_score;

  return self;
}
|
1330
|
+
|
1331
|
+
|