ferret 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +6 -5
- data/Rakefile +34 -13
- data/TODO +1 -0
- data/TUTORIAL +1 -1
- data/ext/analysis.c +87 -70
- data/ext/analysis.h +18 -6
- data/ext/array.c +1 -2
- data/ext/array.h +1 -1
- data/ext/bitvector.c +10 -6
- data/ext/bitvector.h +2 -2
- data/ext/compound_io.c +30 -27
- data/ext/document.c +15 -15
- data/ext/document.h +5 -5
- data/ext/except.c +2 -0
- data/ext/except.h +25 -23
- data/ext/extconf.rb +1 -0
- data/ext/ferret.c +10 -8
- data/ext/ferret.h +9 -8
- data/ext/field.c +29 -25
- data/ext/filter.c +52 -14
- data/ext/frtio.h +13 -0
- data/ext/fs_store.c +115 -170
- data/ext/global.c +9 -8
- data/ext/global.h +17 -13
- data/ext/hash.c +13 -19
- data/ext/hash.h +11 -11
- data/ext/hashset.c +5 -7
- data/ext/hashset.h +9 -8
- data/ext/helper.c +1 -1
- data/ext/helper.h +2 -1
- data/ext/inc/except.h +25 -23
- data/ext/inc/lang.h +11 -1
- data/ext/ind.c +33 -21
- data/ext/index.h +44 -39
- data/ext/index_io.c +61 -57
- data/ext/index_rw.c +418 -361
- data/ext/lang.c +10 -0
- data/ext/lang.h +11 -1
- data/ext/nix_io.c +135 -0
- data/ext/priorityqueue.c +16 -16
- data/ext/priorityqueue.h +9 -6
- data/ext/q_boolean.c +128 -76
- data/ext/q_const_score.c +20 -20
- data/ext/q_filtered_query.c +20 -20
- data/ext/q_fuzzy.c +37 -23
- data/ext/q_match_all.c +15 -19
- data/ext/q_multi_phrase.c +87 -46
- data/ext/q_parser.c +247 -119
- data/ext/q_phrase.c +86 -52
- data/ext/q_prefix.c +25 -14
- data/ext/q_range.c +59 -14
- data/ext/q_span.c +263 -172
- data/ext/q_term.c +62 -51
- data/ext/q_wildcard.c +24 -13
- data/ext/r_analysis.c +328 -80
- data/ext/r_doc.c +11 -6
- data/ext/r_index_io.c +40 -32
- data/ext/r_qparser.c +15 -14
- data/ext/r_search.c +270 -152
- data/ext/r_store.c +32 -17
- data/ext/ram_store.c +38 -22
- data/ext/search.c +617 -87
- data/ext/search.h +227 -163
- data/ext/similarity.c +54 -45
- data/ext/similarity.h +3 -3
- data/ext/sort.c +132 -53
- data/ext/store.c +21 -2
- data/ext/store.h +14 -14
- data/ext/tags +4322 -232
- data/ext/term.c +140 -109
- data/ext/termdocs.c +74 -60
- data/ext/vector.c +181 -152
- data/ext/w32_io.c +150 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
- data/lib/ferret/document/field.rb +1 -1
- data/lib/ferret/index/field_infos.rb +1 -1
- data/lib/ferret/index/term.rb +1 -1
- data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
- data/lib/ferret/search.rb +1 -0
- data/lib/ferret/search/boolean_query.rb +0 -4
- data/lib/ferret/search/index_searcher.rb +21 -8
- data/lib/ferret/search/multi_phrase_query.rb +7 -0
- data/lib/ferret/search/multi_searcher.rb +261 -0
- data/lib/ferret/search/phrase_query.rb +1 -1
- data/lib/ferret/search/query.rb +34 -5
- data/lib/ferret/search/sort.rb +7 -3
- data/lib/ferret/search/sort_field.rb +8 -4
- data/lib/ferret/store/fs_store.rb +13 -6
- data/lib/ferret/store/index_io.rb +0 -14
- data/lib/ferret/store/ram_store.rb +3 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/analysis/ctc_analyzer.rb +131 -0
- data/test/unit/analysis/ctc_tokenstream.rb +98 -9
- data/test/unit/index/tc_index.rb +40 -1
- data/test/unit/index/tc_term.rb +7 -0
- data/test/unit/index/th_doc.rb +8 -0
- data/test/unit/query_parser/tc_query_parser.rb +6 -4
- data/test/unit/search/rtc_sort_field.rb +6 -6
- data/test/unit/search/tc_index_searcher.rb +8 -0
- data/test/unit/search/tc_multi_searcher.rb +275 -0
- data/test/unit/search/tc_multi_searcher2.rb +126 -0
- data/test/unit/search/tc_search_and_sort.rb +66 -0
- metadata +31 -26
- data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/q_const_score.c
CHANGED
@@ -12,11 +12,6 @@ char *csw_to_s(Weight *self)
|
|
12
12
|
return strfmt("ConstantScoreWeight(%f)", self->value);
|
13
13
|
}
|
14
14
|
|
15
|
-
void csw_destroy(void *p)
|
16
|
-
{
|
17
|
-
free(p);
|
18
|
-
}
|
19
|
-
|
20
15
|
Explanation *csw_explain(Weight *self, IndexReader *ir, int doc_num)
|
21
16
|
{
|
22
17
|
Filter *filter = (Filter *)self->query->data;
|
@@ -40,21 +35,14 @@ Explanation *csw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
40
35
|
|
41
36
|
Weight *csw_create(Query *query, Searcher *searcher)
|
42
37
|
{
|
43
|
-
Weight *self =
|
44
|
-
ZEROSET(self, Weight, 1);
|
45
|
-
self->get_query = &w_get_query;
|
46
|
-
self->get_value = &w_get_value;
|
47
|
-
self->normalize = &w_normalize;
|
38
|
+
Weight *self = w_create(query);
|
48
39
|
self->scorer = &cssc_create;
|
49
40
|
self->explain = &csw_explain;
|
50
41
|
self->to_s = &csw_to_s;
|
51
|
-
self->destroy = &csw_destroy;
|
52
42
|
self->sum_of_squared_weights = &w_sum_of_squared_weights;
|
53
43
|
|
54
44
|
self->similarity = query->get_similarity(query, searcher);
|
55
45
|
self->idf = 1.0;
|
56
|
-
self->query = query;
|
57
|
-
self->value = 0.0;
|
58
46
|
|
59
47
|
return self;
|
60
48
|
}
|
@@ -79,24 +67,36 @@ char *csq_to_s(Query *self, char *field)
|
|
79
67
|
return buffer;;
|
80
68
|
}
|
81
69
|
|
82
|
-
void csq_destroy(
|
70
|
+
void csq_destroy(Query *self)
|
83
71
|
{
|
84
|
-
Query *self = (Query *)p;
|
85
72
|
if (self->destroy_all) {
|
86
73
|
Filter *filter = (Filter *)self->data;
|
87
74
|
filter->destroy(filter);
|
88
75
|
}
|
89
|
-
|
76
|
+
q_destroy_i(self);
|
77
|
+
}
|
78
|
+
|
79
|
+
static uint csq_hash(Query *self)
|
80
|
+
{
|
81
|
+
return filt_hash((Filter *)self->data);
|
82
|
+
}
|
83
|
+
|
84
|
+
static int csq_eq(Query *self, Query *o)
|
85
|
+
{
|
86
|
+
return filt_eq((Filter *)self->data, (Filter *)o->data);
|
90
87
|
}
|
91
88
|
|
92
89
|
Query *csq_create(Filter *filter)
|
93
90
|
{
|
94
91
|
Query *self = q_create();
|
95
|
-
self->type = CONSTANT_QUERY;
|
96
92
|
self->data = filter;
|
97
|
-
|
93
|
+
|
94
|
+
self->type = CONSTANT_QUERY;
|
98
95
|
self->to_s = &csq_to_s;
|
99
|
-
self->
|
96
|
+
self->hash = &csq_hash;
|
97
|
+
self->eq = &csq_eq;
|
98
|
+
self->destroy_i = &csq_destroy;
|
99
|
+
self->create_weight_i = &csw_create;
|
100
100
|
|
101
101
|
return self;
|
102
102
|
}
|
@@ -143,6 +143,6 @@ Scorer *cssc_create(Weight *weight, IndexReader *ir)
|
|
143
143
|
self->next = &cssc_next;
|
144
144
|
self->skip_to = &cssc_skip_to;
|
145
145
|
self->explain = &cssc_explain;
|
146
|
-
self->destroy = &
|
146
|
+
self->destroy = &scorer_destroy_i;
|
147
147
|
return self;
|
148
148
|
}
|
data/ext/q_filtered_query.c
CHANGED
@@ -14,11 +14,6 @@ char *fqw_to_s(Weight *self)
|
|
14
14
|
return strfmt("FilteredQueryWeight(%f)", self->value);
|
15
15
|
}
|
16
16
|
|
17
|
-
void fqw_destroy(void *p)
|
18
|
-
{
|
19
|
-
free(p);
|
20
|
-
}
|
21
|
-
|
22
17
|
float fqw_sum_of_squared_weights(Weight *self)
|
23
18
|
{
|
24
19
|
Weight *sw = (Weight *)self->data;
|
@@ -28,7 +23,7 @@ float fqw_sum_of_squared_weights(Weight *self)
|
|
28
23
|
void fqw_normalize(Weight *self, float normalization_factor)
|
29
24
|
{
|
30
25
|
Weight *sw = (Weight *)self->data;
|
31
|
-
|
26
|
+
sw->normalize(sw, normalization_factor);
|
32
27
|
}
|
33
28
|
|
34
29
|
float fqw_get_value(Weight *self)
|
@@ -52,13 +47,19 @@ Scorer *fqw_scorer(Weight *self, IndexReader *ir)
|
|
52
47
|
return fqsc_create(scorer, filter->get_bv(filter, ir), self->similarity);
|
53
48
|
}
|
54
49
|
|
50
|
+
void fqw_destroy(Weight *self)
|
51
|
+
{
|
52
|
+
Weight *sw = (Weight *)self->data;
|
53
|
+
sw->destroy(sw);
|
54
|
+
w_destroy(self);
|
55
|
+
}
|
56
|
+
|
55
57
|
Weight *fqw_create(Query *query, Weight *sub_weight, Similarity *sim)
|
56
58
|
{
|
57
|
-
Weight *self =
|
58
|
-
|
59
|
+
Weight *self = w_create(query);
|
60
|
+
|
59
61
|
self->data = sub_weight;
|
60
62
|
|
61
|
-
self->get_query = &w_get_query;
|
62
63
|
self->get_value = &fqw_get_value;
|
63
64
|
self->normalize = &fqw_normalize;
|
64
65
|
self->scorer = &fqw_scorer;
|
@@ -69,7 +70,6 @@ Weight *fqw_create(Query *query, Weight *sub_weight, Similarity *sim)
|
|
69
70
|
|
70
71
|
self->similarity = sim;
|
71
72
|
self->idf = 1.0;
|
72
|
-
self->query = query;
|
73
73
|
self->value = sub_weight->value;
|
74
74
|
|
75
75
|
return self;
|
@@ -99,16 +99,15 @@ char *fq_to_s(Query *self, char *field)
|
|
99
99
|
return buffer;;
|
100
100
|
}
|
101
101
|
|
102
|
-
void fq_destroy(
|
102
|
+
void fq_destroy(Query *self)
|
103
103
|
{
|
104
|
-
Query *self = (Query *)p;
|
105
104
|
if (self->destroy_all) {
|
106
105
|
FilteredQuery *fq = (FilteredQuery *)self->data;
|
107
106
|
fq->filter->destroy(fq->filter);
|
108
|
-
|
107
|
+
q_deref(fq->query);
|
109
108
|
}
|
110
109
|
free(self->data);
|
111
|
-
|
110
|
+
q_destroy_i(self);
|
112
111
|
}
|
113
112
|
|
114
113
|
Weight *fq_create_weight(Query *self, Searcher *searcher)
|
@@ -121,14 +120,16 @@ Weight *fq_create_weight(Query *self, Searcher *searcher)
|
|
121
120
|
Query *fq_create(Query *query, Filter *filter)
|
122
121
|
{
|
123
122
|
Query *self = q_create();
|
123
|
+
|
124
124
|
FilteredQuery *fq = ALLOC(FilteredQuery);
|
125
125
|
fq->query = query;
|
126
126
|
fq->filter = filter;
|
127
|
-
self->type = FILTERED_QUERY;
|
128
127
|
self->data = fq;
|
129
|
-
|
128
|
+
|
129
|
+
self->type = FILTERED_QUERY;
|
130
130
|
self->to_s = &fq_to_s;
|
131
|
-
self->
|
131
|
+
self->destroy_i = &fq_destroy;
|
132
|
+
self->create_weight_i = &fq_create_weight;
|
132
133
|
|
133
134
|
return self;
|
134
135
|
}
|
@@ -180,13 +181,12 @@ Explanation *fqsc_explain(Scorer *self, int doc_num)
|
|
180
181
|
return sub_sc->explain(sub_sc, doc_num);
|
181
182
|
}
|
182
183
|
|
183
|
-
void fqsc_destroy(
|
184
|
+
void fqsc_destroy(Scorer *self)
|
184
185
|
{
|
185
|
-
Scorer *self = (Scorer *)p;
|
186
186
|
FilteredQueryScorer *fqsc = (FilteredQueryScorer *)self->data;
|
187
187
|
bv_destroy(fqsc->bv);
|
188
188
|
fqsc->sub_scorer->destroy(fqsc->sub_scorer);
|
189
|
-
|
189
|
+
scorer_destroy_i(self);
|
190
190
|
}
|
191
191
|
|
192
192
|
Scorer *fqsc_create(Scorer *scorer, BitVector *bv, Similarity *sim)
|
data/ext/q_fuzzy.c
CHANGED
@@ -24,7 +24,7 @@ void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
|
|
24
24
|
}
|
25
25
|
}
|
26
26
|
|
27
|
-
|
27
|
+
int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
|
28
28
|
{
|
29
29
|
return (m < TYPICAL_LONGEST_WORD) ? fuzq->max_distances[m]
|
30
30
|
: fuzq_calculate_max_distance(fuzq, m);
|
@@ -34,7 +34,7 @@ float fuzq_score(FuzzyQuery *fuzq, char *target)
|
|
34
34
|
{
|
35
35
|
int i, j;
|
36
36
|
int max_distance;
|
37
|
-
int m = strlen(target);
|
37
|
+
int m = (int)strlen(target);
|
38
38
|
int n = fuzq->text_len;
|
39
39
|
int *d = fuzq->da;
|
40
40
|
char *text = fuzq->text;
|
@@ -82,7 +82,7 @@ float fuzq_score(FuzzyQuery *fuzq, char *target)
|
|
82
82
|
} else {
|
83
83
|
d[i + m*j] = min3(d[i-1 + m*j]+1, d[i + m*(j-1)]+1, d[i-1 + m*(j-1)]);
|
84
84
|
}
|
85
|
-
best_pos_ed_dist =
|
85
|
+
best_pos_ed_dist = min2(best_pos_ed_dist, d[i + m*j]);
|
86
86
|
}
|
87
87
|
//printf("(bped = %d, i = %d, md = %d)", best_pos_ed_dist, i, max_distance);
|
88
88
|
|
@@ -104,7 +104,7 @@ float fuzq_score(FuzzyQuery *fuzq, char *target)
|
|
104
104
|
* number of characters in the shorter word. but this was the formula that
|
105
105
|
* was previously used in FuzzyTermEnum, so it has not been changed (even
|
106
106
|
* though min_sim must be greater than 0.0) */
|
107
|
-
return 1.0f - ((float)d[n + m*m] / (float) (fuzq->pre_len +
|
107
|
+
return 1.0f - ((float)d[n + m*m] / (float) (fuzq->pre_len + min2(n, m)));
|
108
108
|
}
|
109
109
|
|
110
110
|
/****************************************************************************
|
@@ -118,8 +118,8 @@ char *fuzq_to_s(Query *self, char *field)
|
|
118
118
|
char *buffer, *bptr;
|
119
119
|
FuzzyQuery *fuzq = (FuzzyQuery *)self->data;
|
120
120
|
Term *term = fuzq->term;
|
121
|
-
int tlen = strlen(term->text);
|
122
|
-
int flen = strlen(term->field);
|
121
|
+
int tlen = (int)strlen(term->text);
|
122
|
+
int flen = (int)strlen(term->field);
|
123
123
|
bptr = buffer = ALLOC_N(char, tlen + flen + 35);
|
124
124
|
|
125
125
|
if (strcmp(term->field, field) != 0) {
|
@@ -155,11 +155,10 @@ bool scored_term_less_than(void *p1, void *p2)
|
|
155
155
|
return (st1->score < st2->score);
|
156
156
|
}
|
157
157
|
|
158
|
-
void scored_term_destroy(
|
158
|
+
void scored_term_destroy(ScoredTerm *self)
|
159
159
|
{
|
160
|
-
|
161
|
-
|
162
|
-
free(st);
|
160
|
+
term_destroy(self->term);
|
161
|
+
free(self);
|
163
162
|
}
|
164
163
|
|
165
164
|
ScoredTerm *scored_term_create(Term *term, float score)
|
@@ -181,7 +180,7 @@ Query *fuzq_rewrite(Query *self, IndexReader *ir)
|
|
181
180
|
char *field = term->field;
|
182
181
|
Term prefix_term;
|
183
182
|
prefix_term.field = field;
|
184
|
-
if (fuzq->pre_len >= strlen(text)) {
|
183
|
+
if (fuzq->pre_len >= (int)strlen(text)) {
|
185
184
|
q = tq_create(term_clone(term));
|
186
185
|
} else {
|
187
186
|
PriorityQueue *term_pq;
|
@@ -195,7 +194,7 @@ Query *fuzq_rewrite(Query *self, IndexReader *ir)
|
|
195
194
|
|
196
195
|
term_pq = pq_create(((BooleanQuery *)q->data)->max_clause_cnt,
|
197
196
|
&scored_term_less_than);
|
198
|
-
term_pq->free_elem = &scored_term_destroy;
|
197
|
+
term_pq->free_elem = (free_ft)&scored_term_destroy;
|
199
198
|
|
200
199
|
prefix_term.field = field;
|
201
200
|
prefix_term.text = (char *)EMPTY_STRING;
|
@@ -207,9 +206,9 @@ Query *fuzq_rewrite(Query *self, IndexReader *ir)
|
|
207
206
|
}
|
208
207
|
te = ir->terms_from(ir, &prefix_term);
|
209
208
|
|
210
|
-
fuzq->scale_factor = 1.0 / (1.0 - fuzq->min_sim);
|
209
|
+
fuzq->scale_factor = (float)(1.0 / (1.0 - fuzq->min_sim));
|
211
210
|
fuzq->text = fuzq->term->text + pre_len;
|
212
|
-
fuzq->text_len = strlen(fuzq->text);
|
211
|
+
fuzq->text_len = (int)strlen(fuzq->text);
|
213
212
|
fuzq_initialize_max_distances(fuzq);
|
214
213
|
|
215
214
|
if (te) {
|
@@ -246,36 +245,51 @@ Query *fuzq_rewrite(Query *self, IndexReader *ir)
|
|
246
245
|
pq_destroy(term_pq);
|
247
246
|
}
|
248
247
|
|
249
|
-
|
250
|
-
return self->rewritten = q;
|
248
|
+
return q;
|
251
249
|
}
|
252
250
|
|
253
|
-
void fuzq_destroy(
|
251
|
+
void fuzq_destroy(Query *self)
|
254
252
|
{
|
255
|
-
Query *self = (Query *)p;
|
256
253
|
FuzzyQuery *fuzq = (FuzzyQuery *)self->data;
|
257
254
|
if (self->destroy_all) term_destroy((Term *)fuzq->term);
|
258
255
|
free(fuzq->da);
|
259
256
|
free(fuzq);
|
260
|
-
|
257
|
+
q_destroy_i(self);
|
258
|
+
}
|
259
|
+
|
260
|
+
static uint fuzq_hash(Query *self)
|
261
|
+
{
|
262
|
+
FuzzyQuery *fq = (FuzzyQuery *)self->data;
|
263
|
+
return term_hash(fq->term) ^ *((int *)&fq->min_sim) ^ fq->pre_len;
|
264
|
+
}
|
265
|
+
|
266
|
+
static int fuzq_eq(Query *self, Query *o)
|
267
|
+
{
|
268
|
+
FuzzyQuery *fq1 = (FuzzyQuery *)self->data;
|
269
|
+
FuzzyQuery *fq2 = (FuzzyQuery *)o->data;
|
270
|
+
return term_eq(fq1->term, fq2->term) &&
|
271
|
+
(fq1->pre_len == fq2->pre_len) &&
|
272
|
+
(fq1->min_sim == fq2->min_sim);
|
261
273
|
}
|
262
274
|
|
263
275
|
Query *fuzq_create(Term *term)
|
264
276
|
{
|
265
277
|
Query *self = q_create();
|
278
|
+
|
266
279
|
FuzzyQuery *fq = ALLOC(FuzzyQuery);
|
267
280
|
ZEROSET(fq, FuzzyQuery, 1);
|
268
|
-
|
269
281
|
fq->term = term;
|
270
282
|
fq->pre_len = DEF_PRE_LEN;
|
271
283
|
fq->min_sim = DEF_MIN_SIM;
|
272
284
|
self->data = fq;
|
285
|
+
|
273
286
|
self->type = FUZZY_QUERY;
|
274
|
-
self->create_weight = NULL;
|
275
287
|
self->to_s = &fuzq_to_s;
|
288
|
+
self->hash = &fuzq_hash;
|
289
|
+
self->eq = &fuzq_eq;
|
276
290
|
self->rewrite = &fuzq_rewrite;
|
277
|
-
self->
|
278
|
-
self->
|
291
|
+
self->destroy_i = &fuzq_destroy;
|
292
|
+
self->create_weight_i = &q_create_weight_unsup;
|
279
293
|
|
280
294
|
return self;
|
281
295
|
}
|
data/ext/q_match_all.c
CHANGED
@@ -12,11 +12,6 @@ char *maw_to_s(Weight *self)
|
|
12
12
|
return strfmt("MatchAllWeight(%f)", self->value);
|
13
13
|
}
|
14
14
|
|
15
|
-
void maw_destroy(void *p)
|
16
|
-
{
|
17
|
-
free(p);
|
18
|
-
}
|
19
|
-
|
20
15
|
Explanation *maw_explain(Weight *self, IndexReader *ir, int doc_num)
|
21
16
|
{
|
22
17
|
Explanation *expl;
|
@@ -34,21 +29,15 @@ Explanation *maw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
34
29
|
|
35
30
|
Weight *maw_create(Query *query, Searcher *searcher)
|
36
31
|
{
|
37
|
-
Weight *self =
|
38
|
-
|
39
|
-
self->get_query = &w_get_query;
|
40
|
-
self->get_value = &w_get_value;
|
41
|
-
self->normalize = &w_normalize;
|
32
|
+
Weight *self = w_create(query);
|
33
|
+
|
42
34
|
self->scorer = &masc_create;
|
43
35
|
self->explain = &maw_explain;
|
44
36
|
self->to_s = &maw_to_s;
|
45
|
-
self->destroy = &maw_destroy;
|
46
37
|
self->sum_of_squared_weights = &w_sum_of_squared_weights;
|
47
38
|
|
48
39
|
self->similarity = query->get_similarity(query, searcher);
|
49
40
|
self->idf = 1.0;
|
50
|
-
self->query = query;
|
51
|
-
self->value = 0.0;
|
52
41
|
|
53
42
|
return self;
|
54
43
|
}
|
@@ -68,19 +57,26 @@ char *maq_to_s(Query *self, char *field)
|
|
68
57
|
}
|
69
58
|
}
|
70
59
|
|
71
|
-
|
60
|
+
static uint maq_hash(Query *self)
|
72
61
|
{
|
73
|
-
|
74
|
-
|
62
|
+
return 0;
|
63
|
+
}
|
64
|
+
|
65
|
+
static int maq_eq(Query *self, Query *o)
|
66
|
+
{
|
67
|
+
return true;
|
75
68
|
}
|
76
69
|
|
77
70
|
Query *maq_create()
|
78
71
|
{
|
79
72
|
Query *self = q_create();
|
73
|
+
|
80
74
|
self->type = MATCH_ALL_QUERY;
|
81
|
-
self->create_weight = &maw_create;
|
82
75
|
self->to_s = &maq_to_s;
|
83
|
-
self->
|
76
|
+
self->hash = &maq_hash;
|
77
|
+
self->eq = &maq_eq;
|
78
|
+
self->destroy_i = &q_destroy_i;
|
79
|
+
self->create_weight_i = &maw_create;
|
84
80
|
|
85
81
|
return self;
|
86
82
|
}
|
@@ -133,6 +129,6 @@ Scorer *masc_create(Weight *weight, IndexReader *ir)
|
|
133
129
|
self->next = &masc_next;
|
134
130
|
self->skip_to = &masc_skip_to;
|
135
131
|
self->explain = &masc_explain;
|
136
|
-
self->destroy = &
|
132
|
+
self->destroy = &scorer_destroy_i;
|
137
133
|
return self;
|
138
134
|
}
|
data/ext/q_multi_phrase.c
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
#include <string.h>
|
2
1
|
#include "search.h"
|
2
|
+
#include <string.h>
|
3
3
|
|
4
4
|
static char * const FIELD_CHANGE_ERROR_MSG = "All phrase terms must be in the same field.";
|
5
5
|
|
@@ -19,9 +19,13 @@ Scorer *mphw_scorer(Weight *self, IndexReader *ir)
|
|
19
19
|
Scorer *phsc;
|
20
20
|
MultiPhraseQuery *mphq = (MultiPhraseQuery *)self->query->data;
|
21
21
|
int i;
|
22
|
-
|
22
|
+
TermDocEnum **tps;
|
23
|
+
|
24
|
+
if (mphq->t_cnt == 0) {
|
25
|
+
return NULL; /* optimize zero-term case */
|
26
|
+
}
|
23
27
|
|
24
|
-
|
28
|
+
tps = ALLOC_N(TermDocEnum *, mphq->t_cnt);
|
25
29
|
|
26
30
|
for (i = 0; i < mphq->t_cnt; i++) {
|
27
31
|
if (mphq->pt_cnt[i] == 1) {
|
@@ -56,6 +60,17 @@ Scorer *mphw_scorer(Weight *self, IndexReader *ir)
|
|
56
60
|
|
57
61
|
Explanation *mphw_explain(Weight *self, IndexReader *ir, int doc_num)
|
58
62
|
{
|
63
|
+
Explanation *idf_expl1;
|
64
|
+
Explanation *idf_expl2;
|
65
|
+
Explanation *query_expl;
|
66
|
+
Explanation *qnorm_expl;
|
67
|
+
Explanation *field_expl;
|
68
|
+
Explanation *tf_expl;
|
69
|
+
Scorer *scorer;
|
70
|
+
uchar *field_norms;
|
71
|
+
float field_norm;
|
72
|
+
Explanation *field_norm_expl;
|
73
|
+
|
59
74
|
char *query_str = self->query->to_s(self->query, "");
|
60
75
|
MultiPhraseQuery *mphq = (MultiPhraseQuery *)self->query->data;
|
61
76
|
int i, j;
|
@@ -67,7 +82,7 @@ Explanation *mphw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
67
82
|
|
68
83
|
for (i = 0; i < mphq->t_cnt; i++) {
|
69
84
|
for (j = 0; j < mphq->pt_cnt[i]; j++) {
|
70
|
-
len += strlen(mphq->terms[i][j]->text) + 30;
|
85
|
+
len += (int)strlen(mphq->terms[i][j]->text) + 30;
|
71
86
|
}
|
72
87
|
}
|
73
88
|
doc_freqs = ALLOC_N(char, len);
|
@@ -75,20 +90,20 @@ Explanation *mphw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
75
90
|
for (j = 0; j < mphq->pt_cnt[i]; j++) {
|
76
91
|
Term *term = mphq->terms[i][j];
|
77
92
|
sprintf(doc_freqs + pos, "%s=%d, ", term->text, ir->doc_freq(ir, term));
|
78
|
-
pos += strlen(doc_freqs + pos);
|
93
|
+
pos += (int)strlen(doc_freqs + pos);
|
79
94
|
}
|
80
95
|
}
|
81
96
|
pos -= 2; // remove ", " from the end
|
82
97
|
doc_freqs[pos] = 0;
|
83
98
|
|
84
|
-
|
99
|
+
idf_expl1 = expl_create(self->idf,
|
85
100
|
strfmt("idf(%s:<%s>)", mphq->field, doc_freqs));
|
86
|
-
|
101
|
+
idf_expl2 = expl_create(self->idf,
|
87
102
|
strfmt("idf(%s:<%s>)", mphq->field, doc_freqs));
|
88
103
|
free(doc_freqs);
|
89
104
|
|
90
|
-
|
91
|
-
|
105
|
+
/* explain query weight */
|
106
|
+
query_expl = expl_create(0.0,
|
92
107
|
strfmt("query_weight(%s), product of:", query_str));
|
93
108
|
|
94
109
|
if (self->query->boost != 1.0) {
|
@@ -96,36 +111,36 @@ Explanation *mphw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
96
111
|
}
|
97
112
|
expl_add_detail(query_expl, idf_expl1);
|
98
113
|
|
99
|
-
|
114
|
+
qnorm_expl = expl_create(self->qnorm, estrdup("query_norm"));
|
100
115
|
expl_add_detail(query_expl, qnorm_expl);
|
101
116
|
|
102
117
|
query_expl->value = self->query->boost * self->idf * self->qnorm;
|
103
118
|
|
104
119
|
expl_add_detail(expl, query_expl);
|
105
120
|
|
106
|
-
|
107
|
-
|
121
|
+
/* explain field weight */
|
122
|
+
field_expl = expl_create(0.0,
|
108
123
|
strfmt("field_weight(%s in %d), product of:", query_str, doc_num));
|
109
124
|
free(query_str);
|
110
125
|
|
111
|
-
|
112
|
-
|
126
|
+
scorer = self->scorer(self, ir);
|
127
|
+
tf_expl = scorer->explain(scorer, doc_num);
|
113
128
|
scorer->destroy(scorer);
|
114
129
|
expl_add_detail(field_expl, tf_expl);
|
115
130
|
expl_add_detail(field_expl, idf_expl2);
|
116
131
|
|
117
|
-
|
118
|
-
|
132
|
+
field_norms = ir->get_norms(ir, mphq->field);
|
133
|
+
field_norm = (field_norms != NULL)
|
119
134
|
? sim_decode_norm(self->similarity, field_norms[doc_num])
|
120
|
-
: 0.0;
|
121
|
-
|
135
|
+
: (float)0.0;
|
136
|
+
field_norm_expl = expl_create(field_norm,
|
122
137
|
strfmt("field_norm(field=%s, doc=%d)", mphq->field, doc_num));
|
123
138
|
|
124
139
|
expl_add_detail(field_expl, field_norm_expl);
|
125
140
|
|
126
141
|
field_expl->value = tf_expl->value * self->idf * field_norm;
|
127
142
|
|
128
|
-
|
143
|
+
/* combine them */
|
129
144
|
if (query_expl->value == 1.0) {
|
130
145
|
expl_destoy(expl);
|
131
146
|
return field_expl;
|
@@ -136,29 +151,23 @@ Explanation *mphw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
136
151
|
}
|
137
152
|
}
|
138
153
|
|
139
|
-
|
140
|
-
|
141
|
-
|
142
154
|
Weight *mphw_create(Query *query, Searcher *searcher)
|
143
155
|
{
|
156
|
+
Weight *self = w_create(query);
|
157
|
+
|
144
158
|
MultiPhraseQuery *mphq = (MultiPhraseQuery *)query->data;
|
145
|
-
Weight *self = ALLOC(Weight);
|
146
159
|
int i, j;
|
147
|
-
|
148
|
-
self->get_query = &w_get_query;
|
149
|
-
self->get_value = &w_get_value;
|
150
|
-
self->normalize = &w_normalize;
|
160
|
+
|
151
161
|
self->scorer = &mphw_scorer;
|
152
162
|
self->explain = &mphw_explain;
|
153
163
|
self->to_s = &mphw_to_s;
|
154
|
-
self->destroy = &free;
|
155
164
|
self->sum_of_squared_weights = &w_sum_of_squared_weights;
|
156
165
|
|
157
166
|
self->similarity = query->get_similarity(query, searcher);
|
158
167
|
self->query = query;
|
159
168
|
self->value = query->boost;
|
160
|
-
|
161
169
|
self->idf = 0.0;
|
170
|
+
|
162
171
|
for (i = 0; i < mphq->t_cnt; i++) {
|
163
172
|
for (j = 0; j < mphq->pt_cnt[i]; j++) {
|
164
173
|
self->idf += sim_idf_term(self->similarity, mphq->terms[i][j], searcher);
|
@@ -209,10 +218,8 @@ void mphq_add_terms(Query *self, Term **terms, int t_cnt, int pos_inc)
|
|
209
218
|
mphq->t_cnt++;
|
210
219
|
}
|
211
220
|
|
212
|
-
void mphq_destroy(
|
221
|
+
void mphq_destroy(Query *self)
|
213
222
|
{
|
214
|
-
Query *self = (Query *)p;
|
215
|
-
|
216
223
|
GET_MPHQ;
|
217
224
|
int i, j;
|
218
225
|
if (self->destroy_all) {
|
@@ -228,16 +235,16 @@ void mphq_destroy(void *p)
|
|
228
235
|
free(mphq->pt_cnt);
|
229
236
|
free(mphq);
|
230
237
|
|
231
|
-
|
238
|
+
q_destroy_i(self);
|
232
239
|
}
|
233
240
|
|
234
|
-
void mphq_extract_terms(Query *self,
|
241
|
+
void mphq_extract_terms(Query *self, HashSet *terms)
|
235
242
|
{
|
236
243
|
GET_MPHQ;
|
237
244
|
int i, j;
|
238
245
|
for (i = 0; i < mphq->t_cnt; i++) {
|
239
246
|
for (j = 0; j < mphq->pt_cnt[i]; j++) {
|
240
|
-
|
247
|
+
hs_add(terms, term_clone(mphq->terms[i][j]));
|
241
248
|
}
|
242
249
|
}
|
243
250
|
}
|
@@ -248,10 +255,10 @@ char *mphq_to_s(Query *self, char *field)
|
|
248
255
|
int i, j, buf_index = 0, len = 0, pos, last_pos = -1;
|
249
256
|
char *buffer;
|
250
257
|
if (!mphq->t_cnt) return NULL;
|
251
|
-
len = strlen(mphq->field) + 1;
|
258
|
+
len = (int)strlen(mphq->field) + 1;
|
252
259
|
for (i = 0; i < mphq->t_cnt; i++) {
|
253
260
|
for (j = 0; j < mphq->pt_cnt[i]; j++) {
|
254
|
-
len += strlen(mphq->terms[i][j]->text) + 1;
|
261
|
+
len += (int)strlen(mphq->terms[i][j]->text) + 1;
|
255
262
|
}
|
256
263
|
}
|
257
264
|
|
@@ -261,7 +268,7 @@ char *mphq_to_s(Query *self, char *field)
|
|
261
268
|
buffer = ALLOC_N(char, len);
|
262
269
|
|
263
270
|
if (strcmp(field, mphq->field) != 0) {
|
264
|
-
len = strlen(mphq->field);
|
271
|
+
len = (int)strlen(mphq->field);
|
265
272
|
memcpy(buffer, mphq->field, len);
|
266
273
|
buffer[len] = ':';
|
267
274
|
buf_index += len + 1;
|
@@ -278,7 +285,7 @@ char *mphq_to_s(Query *self, char *field)
|
|
278
285
|
|
279
286
|
for (j = 0; j < mphq->pt_cnt[i]; j++) {
|
280
287
|
Term *term = mphq->terms[i][j];
|
281
|
-
len = strlen(term->text);
|
288
|
+
len = (int)strlen(term->text);
|
282
289
|
memcpy(buffer + buf_index, term->text, len);
|
283
290
|
buf_index += len;
|
284
291
|
buffer[buf_index++] = '|';
|
@@ -291,7 +298,7 @@ char *mphq_to_s(Query *self, char *field)
|
|
291
298
|
buffer[buf_index] = 0;
|
292
299
|
if (mphq->slop != 0) {
|
293
300
|
sprintf(buffer + buf_index, "~%d", mphq->slop);
|
294
|
-
buf_index += strlen(buffer + buf_index);
|
301
|
+
buf_index += (int)strlen(buffer + buf_index);
|
295
302
|
}
|
296
303
|
if (self->boost != 1.0) {
|
297
304
|
buffer[buf_index] = '^';
|
@@ -311,16 +318,48 @@ Query *mphq_rewrite(Query *self, IndexReader *ir)
|
|
311
318
|
bq_add_query(bq, tq_create(term_clone(terms[i])), BC_SHOULD);
|
312
319
|
}
|
313
320
|
bq->boost = self->boost;
|
314
|
-
|
315
|
-
return self->rewritten = bq;
|
321
|
+
return bq;
|
316
322
|
} else {
|
323
|
+
self->ref_cnt++;
|
317
324
|
return self;
|
318
325
|
}
|
319
326
|
}
|
320
327
|
|
328
|
+
static uint mphq_hash(Query *self)
|
329
|
+
{
|
330
|
+
int i, j;
|
331
|
+
uint hash = 0;
|
332
|
+
MultiPhraseQuery *mphq = (MultiPhraseQuery *)self->data;
|
333
|
+
for (i = 0; i < mphq->t_cnt; i++) {
|
334
|
+
for (j = 0; j < mphq->pt_cnt[i]; j++) {
|
335
|
+
hash ^= (term_hash(mphq->terms[i][j]) ^ mphq->positions[i]);
|
336
|
+
}
|
337
|
+
hash <<= 1;
|
338
|
+
}
|
339
|
+
return (hash ^ mphq->slop);
|
340
|
+
}
|
341
|
+
|
342
|
+
static int mphq_eq(Query *self, Query *o)
|
343
|
+
{
|
344
|
+
int i, j;
|
345
|
+
MultiPhraseQuery *mphq1 = (MultiPhraseQuery *)self->data;
|
346
|
+
MultiPhraseQuery *mphq2 = (MultiPhraseQuery *)o->data;
|
347
|
+
if (mphq1->slop != mphq2->slop) return false;
|
348
|
+
for (i = 0; i < mphq1->t_cnt; i++) {
|
349
|
+
if ((mphq1->pt_cnt[i] != mphq2->pt_cnt[i]) ||
|
350
|
+
(mphq1->positions[i] != mphq2->positions[i])) return false;
|
351
|
+
|
352
|
+
for (j = 0; j < mphq1->pt_cnt[i]; j++) {
|
353
|
+
if (!term_eq(mphq1->terms[i][j], mphq2->terms[i][j])) return false;
|
354
|
+
}
|
355
|
+
}
|
356
|
+
return true;
|
357
|
+
}
|
358
|
+
|
321
359
|
Query *mphq_create()
|
322
360
|
{
|
323
361
|
Query *self = q_create();
|
362
|
+
|
324
363
|
MultiPhraseQuery *mphq = ALLOC(MultiPhraseQuery);
|
325
364
|
ZEROSET(mphq, MultiPhraseQuery, 1);
|
326
365
|
mphq->t_capa = PHQ_INIT_CAPA;
|
@@ -329,11 +368,13 @@ Query *mphq_create()
|
|
329
368
|
mphq->pt_cnt = ALLOC_N(int, PHQ_INIT_CAPA);
|
330
369
|
self->data = mphq;
|
331
370
|
|
332
|
-
self->
|
371
|
+
self->type = MULTI_PHRASE_QUERY;
|
372
|
+
self->rewrite = &mphq_rewrite;
|
333
373
|
self->extract_terms = &mphq_extract_terms;
|
334
374
|
self->to_s = &mphq_to_s;
|
335
|
-
self->
|
336
|
-
self->
|
337
|
-
self->
|
375
|
+
self->hash = &mphq_hash;
|
376
|
+
self->eq = &mphq_eq;
|
377
|
+
self->destroy_i = &mphq_destroy;
|
378
|
+
self->create_weight_i = &mphw_create;
|
338
379
|
return self;
|
339
380
|
}
|