ferret 0.9.1 → 0.9.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README +6 -5
- data/Rakefile +34 -13
- data/TODO +1 -0
- data/TUTORIAL +1 -1
- data/ext/analysis.c +87 -70
- data/ext/analysis.h +18 -6
- data/ext/array.c +1 -2
- data/ext/array.h +1 -1
- data/ext/bitvector.c +10 -6
- data/ext/bitvector.h +2 -2
- data/ext/compound_io.c +30 -27
- data/ext/document.c +15 -15
- data/ext/document.h +5 -5
- data/ext/except.c +2 -0
- data/ext/except.h +25 -23
- data/ext/extconf.rb +1 -0
- data/ext/ferret.c +10 -8
- data/ext/ferret.h +9 -8
- data/ext/field.c +29 -25
- data/ext/filter.c +52 -14
- data/ext/frtio.h +13 -0
- data/ext/fs_store.c +115 -170
- data/ext/global.c +9 -8
- data/ext/global.h +17 -13
- data/ext/hash.c +13 -19
- data/ext/hash.h +11 -11
- data/ext/hashset.c +5 -7
- data/ext/hashset.h +9 -8
- data/ext/helper.c +1 -1
- data/ext/helper.h +2 -1
- data/ext/inc/except.h +25 -23
- data/ext/inc/lang.h +11 -1
- data/ext/ind.c +33 -21
- data/ext/index.h +44 -39
- data/ext/index_io.c +61 -57
- data/ext/index_rw.c +418 -361
- data/ext/lang.c +10 -0
- data/ext/lang.h +11 -1
- data/ext/nix_io.c +135 -0
- data/ext/priorityqueue.c +16 -16
- data/ext/priorityqueue.h +9 -6
- data/ext/q_boolean.c +128 -76
- data/ext/q_const_score.c +20 -20
- data/ext/q_filtered_query.c +20 -20
- data/ext/q_fuzzy.c +37 -23
- data/ext/q_match_all.c +15 -19
- data/ext/q_multi_phrase.c +87 -46
- data/ext/q_parser.c +247 -119
- data/ext/q_phrase.c +86 -52
- data/ext/q_prefix.c +25 -14
- data/ext/q_range.c +59 -14
- data/ext/q_span.c +263 -172
- data/ext/q_term.c +62 -51
- data/ext/q_wildcard.c +24 -13
- data/ext/r_analysis.c +328 -80
- data/ext/r_doc.c +11 -6
- data/ext/r_index_io.c +40 -32
- data/ext/r_qparser.c +15 -14
- data/ext/r_search.c +270 -152
- data/ext/r_store.c +32 -17
- data/ext/ram_store.c +38 -22
- data/ext/search.c +617 -87
- data/ext/search.h +227 -163
- data/ext/similarity.c +54 -45
- data/ext/similarity.h +3 -3
- data/ext/sort.c +132 -53
- data/ext/store.c +21 -2
- data/ext/store.h +14 -14
- data/ext/tags +4322 -232
- data/ext/term.c +140 -109
- data/ext/termdocs.c +74 -60
- data/ext/vector.c +181 -152
- data/ext/w32_io.c +150 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
- data/lib/ferret/document/field.rb +1 -1
- data/lib/ferret/index/field_infos.rb +1 -1
- data/lib/ferret/index/term.rb +1 -1
- data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
- data/lib/ferret/search.rb +1 -0
- data/lib/ferret/search/boolean_query.rb +0 -4
- data/lib/ferret/search/index_searcher.rb +21 -8
- data/lib/ferret/search/multi_phrase_query.rb +7 -0
- data/lib/ferret/search/multi_searcher.rb +261 -0
- data/lib/ferret/search/phrase_query.rb +1 -1
- data/lib/ferret/search/query.rb +34 -5
- data/lib/ferret/search/sort.rb +7 -3
- data/lib/ferret/search/sort_field.rb +8 -4
- data/lib/ferret/store/fs_store.rb +13 -6
- data/lib/ferret/store/index_io.rb +0 -14
- data/lib/ferret/store/ram_store.rb +3 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/analysis/ctc_analyzer.rb +131 -0
- data/test/unit/analysis/ctc_tokenstream.rb +98 -9
- data/test/unit/index/tc_index.rb +40 -1
- data/test/unit/index/tc_term.rb +7 -0
- data/test/unit/index/th_doc.rb +8 -0
- data/test/unit/query_parser/tc_query_parser.rb +6 -4
- data/test/unit/search/rtc_sort_field.rb +6 -6
- data/test/unit/search/tc_index_searcher.rb +8 -0
- data/test/unit/search/tc_multi_searcher.rb +275 -0
- data/test/unit/search/tc_multi_searcher2.rb +126 -0
- data/test/unit/search/tc_search_and_sort.rb +66 -0
- metadata +31 -26
- data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/q_const_score.c
CHANGED
@@ -12,11 +12,6 @@ char *csw_to_s(Weight *self)
|
|
12
12
|
return strfmt("ConstantScoreWeight(%f)", self->value);
|
13
13
|
}
|
14
14
|
|
15
|
-
void csw_destroy(void *p)
|
16
|
-
{
|
17
|
-
free(p);
|
18
|
-
}
|
19
|
-
|
20
15
|
Explanation *csw_explain(Weight *self, IndexReader *ir, int doc_num)
|
21
16
|
{
|
22
17
|
Filter *filter = (Filter *)self->query->data;
|
@@ -40,21 +35,14 @@ Explanation *csw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
40
35
|
|
41
36
|
Weight *csw_create(Query *query, Searcher *searcher)
|
42
37
|
{
|
43
|
-
Weight *self =
|
44
|
-
ZEROSET(self, Weight, 1);
|
45
|
-
self->get_query = &w_get_query;
|
46
|
-
self->get_value = &w_get_value;
|
47
|
-
self->normalize = &w_normalize;
|
38
|
+
Weight *self = w_create(query);
|
48
39
|
self->scorer = &cssc_create;
|
49
40
|
self->explain = &csw_explain;
|
50
41
|
self->to_s = &csw_to_s;
|
51
|
-
self->destroy = &csw_destroy;
|
52
42
|
self->sum_of_squared_weights = &w_sum_of_squared_weights;
|
53
43
|
|
54
44
|
self->similarity = query->get_similarity(query, searcher);
|
55
45
|
self->idf = 1.0;
|
56
|
-
self->query = query;
|
57
|
-
self->value = 0.0;
|
58
46
|
|
59
47
|
return self;
|
60
48
|
}
|
@@ -79,24 +67,36 @@ char *csq_to_s(Query *self, char *field)
|
|
79
67
|
return buffer;;
|
80
68
|
}
|
81
69
|
|
82
|
-
void csq_destroy(
|
70
|
+
void csq_destroy(Query *self)
|
83
71
|
{
|
84
|
-
Query *self = (Query *)p;
|
85
72
|
if (self->destroy_all) {
|
86
73
|
Filter *filter = (Filter *)self->data;
|
87
74
|
filter->destroy(filter);
|
88
75
|
}
|
89
|
-
|
76
|
+
q_destroy_i(self);
|
77
|
+
}
|
78
|
+
|
79
|
+
static uint csq_hash(Query *self)
|
80
|
+
{
|
81
|
+
return filt_hash((Filter *)self->data);
|
82
|
+
}
|
83
|
+
|
84
|
+
static int csq_eq(Query *self, Query *o)
|
85
|
+
{
|
86
|
+
return filt_eq((Filter *)self->data, (Filter *)o->data);
|
90
87
|
}
|
91
88
|
|
92
89
|
Query *csq_create(Filter *filter)
|
93
90
|
{
|
94
91
|
Query *self = q_create();
|
95
|
-
self->type = CONSTANT_QUERY;
|
96
92
|
self->data = filter;
|
97
|
-
|
93
|
+
|
94
|
+
self->type = CONSTANT_QUERY;
|
98
95
|
self->to_s = &csq_to_s;
|
99
|
-
self->
|
96
|
+
self->hash = &csq_hash;
|
97
|
+
self->eq = &csq_eq;
|
98
|
+
self->destroy_i = &csq_destroy;
|
99
|
+
self->create_weight_i = &csw_create;
|
100
100
|
|
101
101
|
return self;
|
102
102
|
}
|
@@ -143,6 +143,6 @@ Scorer *cssc_create(Weight *weight, IndexReader *ir)
|
|
143
143
|
self->next = &cssc_next;
|
144
144
|
self->skip_to = &cssc_skip_to;
|
145
145
|
self->explain = &cssc_explain;
|
146
|
-
self->destroy = &
|
146
|
+
self->destroy = &scorer_destroy_i;
|
147
147
|
return self;
|
148
148
|
}
|
data/ext/q_filtered_query.c
CHANGED
@@ -14,11 +14,6 @@ char *fqw_to_s(Weight *self)
|
|
14
14
|
return strfmt("FilteredQueryWeight(%f)", self->value);
|
15
15
|
}
|
16
16
|
|
17
|
-
void fqw_destroy(void *p)
|
18
|
-
{
|
19
|
-
free(p);
|
20
|
-
}
|
21
|
-
|
22
17
|
float fqw_sum_of_squared_weights(Weight *self)
|
23
18
|
{
|
24
19
|
Weight *sw = (Weight *)self->data;
|
@@ -28,7 +23,7 @@ float fqw_sum_of_squared_weights(Weight *self)
|
|
28
23
|
void fqw_normalize(Weight *self, float normalization_factor)
|
29
24
|
{
|
30
25
|
Weight *sw = (Weight *)self->data;
|
31
|
-
|
26
|
+
sw->normalize(sw, normalization_factor);
|
32
27
|
}
|
33
28
|
|
34
29
|
float fqw_get_value(Weight *self)
|
@@ -52,13 +47,19 @@ Scorer *fqw_scorer(Weight *self, IndexReader *ir)
|
|
52
47
|
return fqsc_create(scorer, filter->get_bv(filter, ir), self->similarity);
|
53
48
|
}
|
54
49
|
|
50
|
+
void fqw_destroy(Weight *self)
|
51
|
+
{
|
52
|
+
Weight *sw = (Weight *)self->data;
|
53
|
+
sw->destroy(sw);
|
54
|
+
w_destroy(self);
|
55
|
+
}
|
56
|
+
|
55
57
|
Weight *fqw_create(Query *query, Weight *sub_weight, Similarity *sim)
|
56
58
|
{
|
57
|
-
Weight *self =
|
58
|
-
|
59
|
+
Weight *self = w_create(query);
|
60
|
+
|
59
61
|
self->data = sub_weight;
|
60
62
|
|
61
|
-
self->get_query = &w_get_query;
|
62
63
|
self->get_value = &fqw_get_value;
|
63
64
|
self->normalize = &fqw_normalize;
|
64
65
|
self->scorer = &fqw_scorer;
|
@@ -69,7 +70,6 @@ Weight *fqw_create(Query *query, Weight *sub_weight, Similarity *sim)
|
|
69
70
|
|
70
71
|
self->similarity = sim;
|
71
72
|
self->idf = 1.0;
|
72
|
-
self->query = query;
|
73
73
|
self->value = sub_weight->value;
|
74
74
|
|
75
75
|
return self;
|
@@ -99,16 +99,15 @@ char *fq_to_s(Query *self, char *field)
|
|
99
99
|
return buffer;;
|
100
100
|
}
|
101
101
|
|
102
|
-
void fq_destroy(
|
102
|
+
void fq_destroy(Query *self)
|
103
103
|
{
|
104
|
-
Query *self = (Query *)p;
|
105
104
|
if (self->destroy_all) {
|
106
105
|
FilteredQuery *fq = (FilteredQuery *)self->data;
|
107
106
|
fq->filter->destroy(fq->filter);
|
108
|
-
|
107
|
+
q_deref(fq->query);
|
109
108
|
}
|
110
109
|
free(self->data);
|
111
|
-
|
110
|
+
q_destroy_i(self);
|
112
111
|
}
|
113
112
|
|
114
113
|
Weight *fq_create_weight(Query *self, Searcher *searcher)
|
@@ -121,14 +120,16 @@ Weight *fq_create_weight(Query *self, Searcher *searcher)
|
|
121
120
|
Query *fq_create(Query *query, Filter *filter)
|
122
121
|
{
|
123
122
|
Query *self = q_create();
|
123
|
+
|
124
124
|
FilteredQuery *fq = ALLOC(FilteredQuery);
|
125
125
|
fq->query = query;
|
126
126
|
fq->filter = filter;
|
127
|
-
self->type = FILTERED_QUERY;
|
128
127
|
self->data = fq;
|
129
|
-
|
128
|
+
|
129
|
+
self->type = FILTERED_QUERY;
|
130
130
|
self->to_s = &fq_to_s;
|
131
|
-
self->
|
131
|
+
self->destroy_i = &fq_destroy;
|
132
|
+
self->create_weight_i = &fq_create_weight;
|
132
133
|
|
133
134
|
return self;
|
134
135
|
}
|
@@ -180,13 +181,12 @@ Explanation *fqsc_explain(Scorer *self, int doc_num)
|
|
180
181
|
return sub_sc->explain(sub_sc, doc_num);
|
181
182
|
}
|
182
183
|
|
183
|
-
void fqsc_destroy(
|
184
|
+
void fqsc_destroy(Scorer *self)
|
184
185
|
{
|
185
|
-
Scorer *self = (Scorer *)p;
|
186
186
|
FilteredQueryScorer *fqsc = (FilteredQueryScorer *)self->data;
|
187
187
|
bv_destroy(fqsc->bv);
|
188
188
|
fqsc->sub_scorer->destroy(fqsc->sub_scorer);
|
189
|
-
|
189
|
+
scorer_destroy_i(self);
|
190
190
|
}
|
191
191
|
|
192
192
|
Scorer *fqsc_create(Scorer *scorer, BitVector *bv, Similarity *sim)
|
data/ext/q_fuzzy.c
CHANGED
@@ -24,7 +24,7 @@ void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
|
|
24
24
|
}
|
25
25
|
}
|
26
26
|
|
27
|
-
|
27
|
+
int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
|
28
28
|
{
|
29
29
|
return (m < TYPICAL_LONGEST_WORD) ? fuzq->max_distances[m]
|
30
30
|
: fuzq_calculate_max_distance(fuzq, m);
|
@@ -34,7 +34,7 @@ float fuzq_score(FuzzyQuery *fuzq, char *target)
|
|
34
34
|
{
|
35
35
|
int i, j;
|
36
36
|
int max_distance;
|
37
|
-
int m = strlen(target);
|
37
|
+
int m = (int)strlen(target);
|
38
38
|
int n = fuzq->text_len;
|
39
39
|
int *d = fuzq->da;
|
40
40
|
char *text = fuzq->text;
|
@@ -82,7 +82,7 @@ float fuzq_score(FuzzyQuery *fuzq, char *target)
|
|
82
82
|
} else {
|
83
83
|
d[i + m*j] = min3(d[i-1 + m*j]+1, d[i + m*(j-1)]+1, d[i-1 + m*(j-1)]);
|
84
84
|
}
|
85
|
-
best_pos_ed_dist =
|
85
|
+
best_pos_ed_dist = min2(best_pos_ed_dist, d[i + m*j]);
|
86
86
|
}
|
87
87
|
//printf("(bped = %d, i = %d, md = %d)", best_pos_ed_dist, i, max_distance);
|
88
88
|
|
@@ -104,7 +104,7 @@ float fuzq_score(FuzzyQuery *fuzq, char *target)
|
|
104
104
|
* number of characters in the shorter word. but this was the formula that
|
105
105
|
* was previously used in FuzzyTermEnum, so it has not been changed (even
|
106
106
|
* though min_sim must be greater than 0.0) */
|
107
|
-
return 1.0f - ((float)d[n + m*m] / (float) (fuzq->pre_len +
|
107
|
+
return 1.0f - ((float)d[n + m*m] / (float) (fuzq->pre_len + min2(n, m)));
|
108
108
|
}
|
109
109
|
|
110
110
|
/****************************************************************************
|
@@ -118,8 +118,8 @@ char *fuzq_to_s(Query *self, char *field)
|
|
118
118
|
char *buffer, *bptr;
|
119
119
|
FuzzyQuery *fuzq = (FuzzyQuery *)self->data;
|
120
120
|
Term *term = fuzq->term;
|
121
|
-
int tlen = strlen(term->text);
|
122
|
-
int flen = strlen(term->field);
|
121
|
+
int tlen = (int)strlen(term->text);
|
122
|
+
int flen = (int)strlen(term->field);
|
123
123
|
bptr = buffer = ALLOC_N(char, tlen + flen + 35);
|
124
124
|
|
125
125
|
if (strcmp(term->field, field) != 0) {
|
@@ -155,11 +155,10 @@ bool scored_term_less_than(void *p1, void *p2)
|
|
155
155
|
return (st1->score < st2->score);
|
156
156
|
}
|
157
157
|
|
158
|
-
void scored_term_destroy(
|
158
|
+
void scored_term_destroy(ScoredTerm *self)
|
159
159
|
{
|
160
|
-
|
161
|
-
|
162
|
-
free(st);
|
160
|
+
term_destroy(self->term);
|
161
|
+
free(self);
|
163
162
|
}
|
164
163
|
|
165
164
|
ScoredTerm *scored_term_create(Term *term, float score)
|
@@ -181,7 +180,7 @@ Query *fuzq_rewrite(Query *self, IndexReader *ir)
|
|
181
180
|
char *field = term->field;
|
182
181
|
Term prefix_term;
|
183
182
|
prefix_term.field = field;
|
184
|
-
if (fuzq->pre_len >= strlen(text)) {
|
183
|
+
if (fuzq->pre_len >= (int)strlen(text)) {
|
185
184
|
q = tq_create(term_clone(term));
|
186
185
|
} else {
|
187
186
|
PriorityQueue *term_pq;
|
@@ -195,7 +194,7 @@ Query *fuzq_rewrite(Query *self, IndexReader *ir)
|
|
195
194
|
|
196
195
|
term_pq = pq_create(((BooleanQuery *)q->data)->max_clause_cnt,
|
197
196
|
&scored_term_less_than);
|
198
|
-
term_pq->free_elem = &scored_term_destroy;
|
197
|
+
term_pq->free_elem = (free_ft)&scored_term_destroy;
|
199
198
|
|
200
199
|
prefix_term.field = field;
|
201
200
|
prefix_term.text = (char *)EMPTY_STRING;
|
@@ -207,9 +206,9 @@ Query *fuzq_rewrite(Query *self, IndexReader *ir)
|
|
207
206
|
}
|
208
207
|
te = ir->terms_from(ir, &prefix_term);
|
209
208
|
|
210
|
-
fuzq->scale_factor = 1.0 / (1.0 - fuzq->min_sim);
|
209
|
+
fuzq->scale_factor = (float)(1.0 / (1.0 - fuzq->min_sim));
|
211
210
|
fuzq->text = fuzq->term->text + pre_len;
|
212
|
-
fuzq->text_len = strlen(fuzq->text);
|
211
|
+
fuzq->text_len = (int)strlen(fuzq->text);
|
213
212
|
fuzq_initialize_max_distances(fuzq);
|
214
213
|
|
215
214
|
if (te) {
|
@@ -246,36 +245,51 @@ Query *fuzq_rewrite(Query *self, IndexReader *ir)
|
|
246
245
|
pq_destroy(term_pq);
|
247
246
|
}
|
248
247
|
|
249
|
-
|
250
|
-
return self->rewritten = q;
|
248
|
+
return q;
|
251
249
|
}
|
252
250
|
|
253
|
-
void fuzq_destroy(
|
251
|
+
void fuzq_destroy(Query *self)
|
254
252
|
{
|
255
|
-
Query *self = (Query *)p;
|
256
253
|
FuzzyQuery *fuzq = (FuzzyQuery *)self->data;
|
257
254
|
if (self->destroy_all) term_destroy((Term *)fuzq->term);
|
258
255
|
free(fuzq->da);
|
259
256
|
free(fuzq);
|
260
|
-
|
257
|
+
q_destroy_i(self);
|
258
|
+
}
|
259
|
+
|
260
|
+
static uint fuzq_hash(Query *self)
|
261
|
+
{
|
262
|
+
FuzzyQuery *fq = (FuzzyQuery *)self->data;
|
263
|
+
return term_hash(fq->term) ^ *((int *)&fq->min_sim) ^ fq->pre_len;
|
264
|
+
}
|
265
|
+
|
266
|
+
static int fuzq_eq(Query *self, Query *o)
|
267
|
+
{
|
268
|
+
FuzzyQuery *fq1 = (FuzzyQuery *)self->data;
|
269
|
+
FuzzyQuery *fq2 = (FuzzyQuery *)o->data;
|
270
|
+
return term_eq(fq1->term, fq2->term) &&
|
271
|
+
(fq1->pre_len == fq2->pre_len) &&
|
272
|
+
(fq1->min_sim == fq2->min_sim);
|
261
273
|
}
|
262
274
|
|
263
275
|
Query *fuzq_create(Term *term)
|
264
276
|
{
|
265
277
|
Query *self = q_create();
|
278
|
+
|
266
279
|
FuzzyQuery *fq = ALLOC(FuzzyQuery);
|
267
280
|
ZEROSET(fq, FuzzyQuery, 1);
|
268
|
-
|
269
281
|
fq->term = term;
|
270
282
|
fq->pre_len = DEF_PRE_LEN;
|
271
283
|
fq->min_sim = DEF_MIN_SIM;
|
272
284
|
self->data = fq;
|
285
|
+
|
273
286
|
self->type = FUZZY_QUERY;
|
274
|
-
self->create_weight = NULL;
|
275
287
|
self->to_s = &fuzq_to_s;
|
288
|
+
self->hash = &fuzq_hash;
|
289
|
+
self->eq = &fuzq_eq;
|
276
290
|
self->rewrite = &fuzq_rewrite;
|
277
|
-
self->
|
278
|
-
self->
|
291
|
+
self->destroy_i = &fuzq_destroy;
|
292
|
+
self->create_weight_i = &q_create_weight_unsup;
|
279
293
|
|
280
294
|
return self;
|
281
295
|
}
|
data/ext/q_match_all.c
CHANGED
@@ -12,11 +12,6 @@ char *maw_to_s(Weight *self)
|
|
12
12
|
return strfmt("MatchAllWeight(%f)", self->value);
|
13
13
|
}
|
14
14
|
|
15
|
-
void maw_destroy(void *p)
|
16
|
-
{
|
17
|
-
free(p);
|
18
|
-
}
|
19
|
-
|
20
15
|
Explanation *maw_explain(Weight *self, IndexReader *ir, int doc_num)
|
21
16
|
{
|
22
17
|
Explanation *expl;
|
@@ -34,21 +29,15 @@ Explanation *maw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
34
29
|
|
35
30
|
Weight *maw_create(Query *query, Searcher *searcher)
|
36
31
|
{
|
37
|
-
Weight *self =
|
38
|
-
|
39
|
-
self->get_query = &w_get_query;
|
40
|
-
self->get_value = &w_get_value;
|
41
|
-
self->normalize = &w_normalize;
|
32
|
+
Weight *self = w_create(query);
|
33
|
+
|
42
34
|
self->scorer = &masc_create;
|
43
35
|
self->explain = &maw_explain;
|
44
36
|
self->to_s = &maw_to_s;
|
45
|
-
self->destroy = &maw_destroy;
|
46
37
|
self->sum_of_squared_weights = &w_sum_of_squared_weights;
|
47
38
|
|
48
39
|
self->similarity = query->get_similarity(query, searcher);
|
49
40
|
self->idf = 1.0;
|
50
|
-
self->query = query;
|
51
|
-
self->value = 0.0;
|
52
41
|
|
53
42
|
return self;
|
54
43
|
}
|
@@ -68,19 +57,26 @@ char *maq_to_s(Query *self, char *field)
|
|
68
57
|
}
|
69
58
|
}
|
70
59
|
|
71
|
-
|
60
|
+
static uint maq_hash(Query *self)
|
72
61
|
{
|
73
|
-
|
74
|
-
|
62
|
+
return 0;
|
63
|
+
}
|
64
|
+
|
65
|
+
static int maq_eq(Query *self, Query *o)
|
66
|
+
{
|
67
|
+
return true;
|
75
68
|
}
|
76
69
|
|
77
70
|
Query *maq_create()
|
78
71
|
{
|
79
72
|
Query *self = q_create();
|
73
|
+
|
80
74
|
self->type = MATCH_ALL_QUERY;
|
81
|
-
self->create_weight = &maw_create;
|
82
75
|
self->to_s = &maq_to_s;
|
83
|
-
self->
|
76
|
+
self->hash = &maq_hash;
|
77
|
+
self->eq = &maq_eq;
|
78
|
+
self->destroy_i = &q_destroy_i;
|
79
|
+
self->create_weight_i = &maw_create;
|
84
80
|
|
85
81
|
return self;
|
86
82
|
}
|
@@ -133,6 +129,6 @@ Scorer *masc_create(Weight *weight, IndexReader *ir)
|
|
133
129
|
self->next = &masc_next;
|
134
130
|
self->skip_to = &masc_skip_to;
|
135
131
|
self->explain = &masc_explain;
|
136
|
-
self->destroy = &
|
132
|
+
self->destroy = &scorer_destroy_i;
|
137
133
|
return self;
|
138
134
|
}
|
data/ext/q_multi_phrase.c
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
#include <string.h>
|
2
1
|
#include "search.h"
|
2
|
+
#include <string.h>
|
3
3
|
|
4
4
|
static char * const FIELD_CHANGE_ERROR_MSG = "All phrase terms must be in the same field.";
|
5
5
|
|
@@ -19,9 +19,13 @@ Scorer *mphw_scorer(Weight *self, IndexReader *ir)
|
|
19
19
|
Scorer *phsc;
|
20
20
|
MultiPhraseQuery *mphq = (MultiPhraseQuery *)self->query->data;
|
21
21
|
int i;
|
22
|
-
|
22
|
+
TermDocEnum **tps;
|
23
|
+
|
24
|
+
if (mphq->t_cnt == 0) {
|
25
|
+
return NULL; /* optimize zero-term case */
|
26
|
+
}
|
23
27
|
|
24
|
-
|
28
|
+
tps = ALLOC_N(TermDocEnum *, mphq->t_cnt);
|
25
29
|
|
26
30
|
for (i = 0; i < mphq->t_cnt; i++) {
|
27
31
|
if (mphq->pt_cnt[i] == 1) {
|
@@ -56,6 +60,17 @@ Scorer *mphw_scorer(Weight *self, IndexReader *ir)
|
|
56
60
|
|
57
61
|
Explanation *mphw_explain(Weight *self, IndexReader *ir, int doc_num)
|
58
62
|
{
|
63
|
+
Explanation *idf_expl1;
|
64
|
+
Explanation *idf_expl2;
|
65
|
+
Explanation *query_expl;
|
66
|
+
Explanation *qnorm_expl;
|
67
|
+
Explanation *field_expl;
|
68
|
+
Explanation *tf_expl;
|
69
|
+
Scorer *scorer;
|
70
|
+
uchar *field_norms;
|
71
|
+
float field_norm;
|
72
|
+
Explanation *field_norm_expl;
|
73
|
+
|
59
74
|
char *query_str = self->query->to_s(self->query, "");
|
60
75
|
MultiPhraseQuery *mphq = (MultiPhraseQuery *)self->query->data;
|
61
76
|
int i, j;
|
@@ -67,7 +82,7 @@ Explanation *mphw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
67
82
|
|
68
83
|
for (i = 0; i < mphq->t_cnt; i++) {
|
69
84
|
for (j = 0; j < mphq->pt_cnt[i]; j++) {
|
70
|
-
len += strlen(mphq->terms[i][j]->text) + 30;
|
85
|
+
len += (int)strlen(mphq->terms[i][j]->text) + 30;
|
71
86
|
}
|
72
87
|
}
|
73
88
|
doc_freqs = ALLOC_N(char, len);
|
@@ -75,20 +90,20 @@ Explanation *mphw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
75
90
|
for (j = 0; j < mphq->pt_cnt[i]; j++) {
|
76
91
|
Term *term = mphq->terms[i][j];
|
77
92
|
sprintf(doc_freqs + pos, "%s=%d, ", term->text, ir->doc_freq(ir, term));
|
78
|
-
pos += strlen(doc_freqs + pos);
|
93
|
+
pos += (int)strlen(doc_freqs + pos);
|
79
94
|
}
|
80
95
|
}
|
81
96
|
pos -= 2; // remove ", " from the end
|
82
97
|
doc_freqs[pos] = 0;
|
83
98
|
|
84
|
-
|
99
|
+
idf_expl1 = expl_create(self->idf,
|
85
100
|
strfmt("idf(%s:<%s>)", mphq->field, doc_freqs));
|
86
|
-
|
101
|
+
idf_expl2 = expl_create(self->idf,
|
87
102
|
strfmt("idf(%s:<%s>)", mphq->field, doc_freqs));
|
88
103
|
free(doc_freqs);
|
89
104
|
|
90
|
-
|
91
|
-
|
105
|
+
/* explain query weight */
|
106
|
+
query_expl = expl_create(0.0,
|
92
107
|
strfmt("query_weight(%s), product of:", query_str));
|
93
108
|
|
94
109
|
if (self->query->boost != 1.0) {
|
@@ -96,36 +111,36 @@ Explanation *mphw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
96
111
|
}
|
97
112
|
expl_add_detail(query_expl, idf_expl1);
|
98
113
|
|
99
|
-
|
114
|
+
qnorm_expl = expl_create(self->qnorm, estrdup("query_norm"));
|
100
115
|
expl_add_detail(query_expl, qnorm_expl);
|
101
116
|
|
102
117
|
query_expl->value = self->query->boost * self->idf * self->qnorm;
|
103
118
|
|
104
119
|
expl_add_detail(expl, query_expl);
|
105
120
|
|
106
|
-
|
107
|
-
|
121
|
+
/* explain field weight */
|
122
|
+
field_expl = expl_create(0.0,
|
108
123
|
strfmt("field_weight(%s in %d), product of:", query_str, doc_num));
|
109
124
|
free(query_str);
|
110
125
|
|
111
|
-
|
112
|
-
|
126
|
+
scorer = self->scorer(self, ir);
|
127
|
+
tf_expl = scorer->explain(scorer, doc_num);
|
113
128
|
scorer->destroy(scorer);
|
114
129
|
expl_add_detail(field_expl, tf_expl);
|
115
130
|
expl_add_detail(field_expl, idf_expl2);
|
116
131
|
|
117
|
-
|
118
|
-
|
132
|
+
field_norms = ir->get_norms(ir, mphq->field);
|
133
|
+
field_norm = (field_norms != NULL)
|
119
134
|
? sim_decode_norm(self->similarity, field_norms[doc_num])
|
120
|
-
: 0.0;
|
121
|
-
|
135
|
+
: (float)0.0;
|
136
|
+
field_norm_expl = expl_create(field_norm,
|
122
137
|
strfmt("field_norm(field=%s, doc=%d)", mphq->field, doc_num));
|
123
138
|
|
124
139
|
expl_add_detail(field_expl, field_norm_expl);
|
125
140
|
|
126
141
|
field_expl->value = tf_expl->value * self->idf * field_norm;
|
127
142
|
|
128
|
-
|
143
|
+
/* combine them */
|
129
144
|
if (query_expl->value == 1.0) {
|
130
145
|
expl_destoy(expl);
|
131
146
|
return field_expl;
|
@@ -136,29 +151,23 @@ Explanation *mphw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
136
151
|
}
|
137
152
|
}
|
138
153
|
|
139
|
-
|
140
|
-
|
141
|
-
|
142
154
|
Weight *mphw_create(Query *query, Searcher *searcher)
|
143
155
|
{
|
156
|
+
Weight *self = w_create(query);
|
157
|
+
|
144
158
|
MultiPhraseQuery *mphq = (MultiPhraseQuery *)query->data;
|
145
|
-
Weight *self = ALLOC(Weight);
|
146
159
|
int i, j;
|
147
|
-
|
148
|
-
self->get_query = &w_get_query;
|
149
|
-
self->get_value = &w_get_value;
|
150
|
-
self->normalize = &w_normalize;
|
160
|
+
|
151
161
|
self->scorer = &mphw_scorer;
|
152
162
|
self->explain = &mphw_explain;
|
153
163
|
self->to_s = &mphw_to_s;
|
154
|
-
self->destroy = &free;
|
155
164
|
self->sum_of_squared_weights = &w_sum_of_squared_weights;
|
156
165
|
|
157
166
|
self->similarity = query->get_similarity(query, searcher);
|
158
167
|
self->query = query;
|
159
168
|
self->value = query->boost;
|
160
|
-
|
161
169
|
self->idf = 0.0;
|
170
|
+
|
162
171
|
for (i = 0; i < mphq->t_cnt; i++) {
|
163
172
|
for (j = 0; j < mphq->pt_cnt[i]; j++) {
|
164
173
|
self->idf += sim_idf_term(self->similarity, mphq->terms[i][j], searcher);
|
@@ -209,10 +218,8 @@ void mphq_add_terms(Query *self, Term **terms, int t_cnt, int pos_inc)
|
|
209
218
|
mphq->t_cnt++;
|
210
219
|
}
|
211
220
|
|
212
|
-
void mphq_destroy(
|
221
|
+
void mphq_destroy(Query *self)
|
213
222
|
{
|
214
|
-
Query *self = (Query *)p;
|
215
|
-
|
216
223
|
GET_MPHQ;
|
217
224
|
int i, j;
|
218
225
|
if (self->destroy_all) {
|
@@ -228,16 +235,16 @@ void mphq_destroy(void *p)
|
|
228
235
|
free(mphq->pt_cnt);
|
229
236
|
free(mphq);
|
230
237
|
|
231
|
-
|
238
|
+
q_destroy_i(self);
|
232
239
|
}
|
233
240
|
|
234
|
-
void mphq_extract_terms(Query *self,
|
241
|
+
void mphq_extract_terms(Query *self, HashSet *terms)
|
235
242
|
{
|
236
243
|
GET_MPHQ;
|
237
244
|
int i, j;
|
238
245
|
for (i = 0; i < mphq->t_cnt; i++) {
|
239
246
|
for (j = 0; j < mphq->pt_cnt[i]; j++) {
|
240
|
-
|
247
|
+
hs_add(terms, term_clone(mphq->terms[i][j]));
|
241
248
|
}
|
242
249
|
}
|
243
250
|
}
|
@@ -248,10 +255,10 @@ char *mphq_to_s(Query *self, char *field)
|
|
248
255
|
int i, j, buf_index = 0, len = 0, pos, last_pos = -1;
|
249
256
|
char *buffer;
|
250
257
|
if (!mphq->t_cnt) return NULL;
|
251
|
-
len = strlen(mphq->field) + 1;
|
258
|
+
len = (int)strlen(mphq->field) + 1;
|
252
259
|
for (i = 0; i < mphq->t_cnt; i++) {
|
253
260
|
for (j = 0; j < mphq->pt_cnt[i]; j++) {
|
254
|
-
len += strlen(mphq->terms[i][j]->text) + 1;
|
261
|
+
len += (int)strlen(mphq->terms[i][j]->text) + 1;
|
255
262
|
}
|
256
263
|
}
|
257
264
|
|
@@ -261,7 +268,7 @@ char *mphq_to_s(Query *self, char *field)
|
|
261
268
|
buffer = ALLOC_N(char, len);
|
262
269
|
|
263
270
|
if (strcmp(field, mphq->field) != 0) {
|
264
|
-
len = strlen(mphq->field);
|
271
|
+
len = (int)strlen(mphq->field);
|
265
272
|
memcpy(buffer, mphq->field, len);
|
266
273
|
buffer[len] = ':';
|
267
274
|
buf_index += len + 1;
|
@@ -278,7 +285,7 @@ char *mphq_to_s(Query *self, char *field)
|
|
278
285
|
|
279
286
|
for (j = 0; j < mphq->pt_cnt[i]; j++) {
|
280
287
|
Term *term = mphq->terms[i][j];
|
281
|
-
len = strlen(term->text);
|
288
|
+
len = (int)strlen(term->text);
|
282
289
|
memcpy(buffer + buf_index, term->text, len);
|
283
290
|
buf_index += len;
|
284
291
|
buffer[buf_index++] = '|';
|
@@ -291,7 +298,7 @@ char *mphq_to_s(Query *self, char *field)
|
|
291
298
|
buffer[buf_index] = 0;
|
292
299
|
if (mphq->slop != 0) {
|
293
300
|
sprintf(buffer + buf_index, "~%d", mphq->slop);
|
294
|
-
buf_index += strlen(buffer + buf_index);
|
301
|
+
buf_index += (int)strlen(buffer + buf_index);
|
295
302
|
}
|
296
303
|
if (self->boost != 1.0) {
|
297
304
|
buffer[buf_index] = '^';
|
@@ -311,16 +318,48 @@ Query *mphq_rewrite(Query *self, IndexReader *ir)
|
|
311
318
|
bq_add_query(bq, tq_create(term_clone(terms[i])), BC_SHOULD);
|
312
319
|
}
|
313
320
|
bq->boost = self->boost;
|
314
|
-
|
315
|
-
return self->rewritten = bq;
|
321
|
+
return bq;
|
316
322
|
} else {
|
323
|
+
self->ref_cnt++;
|
317
324
|
return self;
|
318
325
|
}
|
319
326
|
}
|
320
327
|
|
328
|
+
static uint mphq_hash(Query *self)
|
329
|
+
{
|
330
|
+
int i, j;
|
331
|
+
uint hash = 0;
|
332
|
+
MultiPhraseQuery *mphq = (MultiPhraseQuery *)self->data;
|
333
|
+
for (i = 0; i < mphq->t_cnt; i++) {
|
334
|
+
for (j = 0; j < mphq->pt_cnt[i]; j++) {
|
335
|
+
hash ^= (term_hash(mphq->terms[i][j]) ^ mphq->positions[i]);
|
336
|
+
}
|
337
|
+
hash <<= 1;
|
338
|
+
}
|
339
|
+
return (hash ^ mphq->slop);
|
340
|
+
}
|
341
|
+
|
342
|
+
static int mphq_eq(Query *self, Query *o)
|
343
|
+
{
|
344
|
+
int i, j;
|
345
|
+
MultiPhraseQuery *mphq1 = (MultiPhraseQuery *)self->data;
|
346
|
+
MultiPhraseQuery *mphq2 = (MultiPhraseQuery *)o->data;
|
347
|
+
if (mphq1->slop != mphq2->slop) return false;
|
348
|
+
for (i = 0; i < mphq1->t_cnt; i++) {
|
349
|
+
if ((mphq1->pt_cnt[i] != mphq2->pt_cnt[i]) ||
|
350
|
+
(mphq1->positions[i] != mphq2->positions[i])) return false;
|
351
|
+
|
352
|
+
for (j = 0; j < mphq1->pt_cnt[i]; j++) {
|
353
|
+
if (!term_eq(mphq1->terms[i][j], mphq2->terms[i][j])) return false;
|
354
|
+
}
|
355
|
+
}
|
356
|
+
return true;
|
357
|
+
}
|
358
|
+
|
321
359
|
Query *mphq_create()
|
322
360
|
{
|
323
361
|
Query *self = q_create();
|
362
|
+
|
324
363
|
MultiPhraseQuery *mphq = ALLOC(MultiPhraseQuery);
|
325
364
|
ZEROSET(mphq, MultiPhraseQuery, 1);
|
326
365
|
mphq->t_capa = PHQ_INIT_CAPA;
|
@@ -329,11 +368,13 @@ Query *mphq_create()
|
|
329
368
|
mphq->pt_cnt = ALLOC_N(int, PHQ_INIT_CAPA);
|
330
369
|
self->data = mphq;
|
331
370
|
|
332
|
-
self->
|
371
|
+
self->type = MULTI_PHRASE_QUERY;
|
372
|
+
self->rewrite = &mphq_rewrite;
|
333
373
|
self->extract_terms = &mphq_extract_terms;
|
334
374
|
self->to_s = &mphq_to_s;
|
335
|
-
self->
|
336
|
-
self->
|
337
|
-
self->
|
375
|
+
self->hash = &mphq_hash;
|
376
|
+
self->eq = &mphq_eq;
|
377
|
+
self->destroy_i = &mphq_destroy;
|
378
|
+
self->create_weight_i = &mphw_create;
|
338
379
|
return self;
|
339
380
|
}
|