ferret 0.11.6 → 0.11.8.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README +10 -22
- data/RELEASE_CHANGES +137 -0
- data/RELEASE_NOTES +60 -0
- data/Rakefile +379 -274
- data/TODO +100 -8
- data/bin/ferret-browser +0 -0
- data/ext/BZLIB_blocksort.c +1094 -0
- data/ext/BZLIB_bzlib.c +1578 -0
- data/ext/BZLIB_compress.c +672 -0
- data/ext/BZLIB_crctable.c +104 -0
- data/ext/BZLIB_decompress.c +626 -0
- data/ext/BZLIB_huffman.c +205 -0
- data/ext/BZLIB_randtable.c +84 -0
- data/ext/{api.c → STEMMER_api.c} +7 -10
- data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
- data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
- data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
- data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
- data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
- data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
- data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
- data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
- data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
- data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
- data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
- data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
- data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
- data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
- data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
- data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
- data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
- data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
- data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
- data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
- data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
- data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
- data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
- data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
- data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
- data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
- data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
- data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
- data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
- data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
- data/ext/analysis.c +276 -121
- data/ext/analysis.h +190 -143
- data/ext/api.h +3 -4
- data/ext/array.c +5 -3
- data/ext/array.h +52 -43
- data/ext/bitvector.c +38 -482
- data/ext/bitvector.h +446 -124
- data/ext/bzlib.h +282 -0
- data/ext/bzlib_private.h +503 -0
- data/ext/compound_io.c +23 -22
- data/ext/config.h +21 -11
- data/ext/document.c +43 -40
- data/ext/document.h +31 -21
- data/ext/except.c +20 -38
- data/ext/except.h +89 -76
- data/ext/extconf.rb +3 -2
- data/ext/ferret.c +49 -35
- data/ext/ferret.h +14 -11
- data/ext/field_index.c +262 -0
- data/ext/field_index.h +52 -0
- data/ext/filter.c +11 -10
- data/ext/fs_store.c +65 -47
- data/ext/global.c +245 -165
- data/ext/global.h +252 -54
- data/ext/hash.c +200 -243
- data/ext/hash.h +205 -163
- data/ext/hashset.c +118 -96
- data/ext/hashset.h +110 -82
- data/ext/header.h +19 -19
- data/ext/helper.c +11 -10
- data/ext/helper.h +14 -6
- data/ext/index.c +745 -366
- data/ext/index.h +503 -529
- data/ext/internal.h +1020 -0
- data/ext/lang.c +10 -0
- data/ext/lang.h +35 -15
- data/ext/mempool.c +5 -4
- data/ext/mempool.h +30 -22
- data/ext/modules.h +35 -7
- data/ext/multimapper.c +43 -2
- data/ext/multimapper.h +32 -23
- data/ext/posh.c +0 -0
- data/ext/posh.h +4 -38
- data/ext/priorityqueue.c +10 -12
- data/ext/priorityqueue.h +33 -21
- data/ext/q_boolean.c +22 -9
- data/ext/q_const_score.c +3 -2
- data/ext/q_filtered_query.c +15 -12
- data/ext/q_fuzzy.c +147 -135
- data/ext/q_match_all.c +3 -2
- data/ext/q_multi_term.c +28 -32
- data/ext/q_parser.c +451 -173
- data/ext/q_phrase.c +158 -79
- data/ext/q_prefix.c +16 -18
- data/ext/q_range.c +363 -31
- data/ext/q_span.c +130 -141
- data/ext/q_term.c +21 -21
- data/ext/q_wildcard.c +19 -23
- data/ext/r_analysis.c +369 -242
- data/ext/r_index.c +421 -434
- data/ext/r_qparser.c +142 -92
- data/ext/r_search.c +790 -407
- data/ext/r_store.c +44 -44
- data/ext/r_utils.c +264 -96
- data/ext/ram_store.c +29 -23
- data/ext/scanner.c +895 -0
- data/ext/scanner.h +36 -0
- data/ext/scanner_mb.c +6701 -0
- data/ext/scanner_utf8.c +4415 -0
- data/ext/search.c +210 -87
- data/ext/search.h +556 -488
- data/ext/similarity.c +17 -16
- data/ext/similarity.h +51 -44
- data/ext/sort.c +157 -354
- data/ext/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/stem_UTF_8_hungarian.h +16 -0
- data/ext/stem_UTF_8_romanian.h +16 -0
- data/ext/stem_UTF_8_turkish.h +16 -0
- data/ext/stopwords.c +287 -278
- data/ext/store.c +57 -51
- data/ext/store.h +308 -286
- data/ext/symbol.c +10 -0
- data/ext/symbol.h +23 -0
- data/ext/term_vectors.c +14 -293
- data/ext/threading.h +22 -22
- data/ext/win32.h +12 -4
- data/lib/ferret.rb +2 -1
- data/lib/ferret/browser.rb +1 -1
- data/lib/ferret/field_symbol.rb +94 -0
- data/lib/ferret/index.rb +221 -34
- data/lib/ferret/number_tools.rb +6 -6
- data/lib/ferret/version.rb +3 -0
- data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
- data/test/test_helper.rb +7 -2
- data/test/test_installed.rb +1 -0
- data/test/threading/thread_safety_index_test.rb +10 -1
- data/test/threading/thread_safety_read_write_test.rb +4 -7
- data/test/threading/thread_safety_test.rb +0 -0
- data/test/unit/analysis/tc_analyzer.rb +29 -27
- data/test/unit/analysis/tc_token_stream.rb +23 -16
- data/test/unit/index/tc_index.rb +116 -11
- data/test/unit/index/tc_index_reader.rb +27 -27
- data/test/unit/index/tc_index_writer.rb +10 -0
- data/test/unit/index/th_doc.rb +38 -21
- data/test/unit/search/tc_filter.rb +31 -10
- data/test/unit/search/tc_index_searcher.rb +6 -0
- data/test/unit/search/tm_searcher.rb +53 -1
- data/test/unit/store/tc_fs_store.rb +40 -2
- data/test/unit/store/tc_ram_store.rb +0 -0
- data/test/unit/store/tm_store.rb +0 -0
- data/test/unit/store/tm_store_lock.rb +7 -6
- data/test/unit/tc_field_symbol.rb +26 -0
- data/test/unit/ts_analysis.rb +0 -0
- data/test/unit/ts_index.rb +0 -0
- data/test/unit/ts_store.rb +0 -0
- data/test/unit/ts_utils.rb +0 -0
- data/test/unit/utils/tc_number_tools.rb +0 -0
- data/test/utils/content_generator.rb +226 -0
- metadata +262 -221
- data/ext/inc/lang.h +0 -48
- data/ext/inc/threading.h +0 -31
- data/ext/stem_ISO_8859_1_english.c +0 -1156
- data/ext/stem_ISO_8859_1_french.c +0 -1276
- data/ext/stem_ISO_8859_1_italian.c +0 -1091
- data/ext/stem_ISO_8859_1_norwegian.c +0 -296
- data/ext/stem_ISO_8859_1_spanish.c +0 -1119
- data/ext/stem_ISO_8859_1_swedish.c +0 -307
- data/ext/stem_UTF_8_danish.c +0 -344
- data/ext/stem_UTF_8_english.c +0 -1176
- data/ext/stem_UTF_8_french.c +0 -1296
- data/ext/stem_UTF_8_italian.c +0 -1113
- data/ext/stem_UTF_8_norwegian.c +0 -302
- data/ext/stem_UTF_8_portuguese.c +0 -1055
- data/ext/stem_UTF_8_russian.c +0 -709
- data/ext/stem_UTF_8_spanish.c +0 -1137
- data/ext/stem_UTF_8_swedish.c +0 -313
- data/lib/ferret_version.rb +0 -3
data/ext/q_fuzzy.c
CHANGED
@@ -1,21 +1,41 @@
|
|
1
1
|
#include <string.h>
|
2
2
|
#include "search.h"
|
3
3
|
#include "helper.h"
|
4
|
+
#include "internal.h"
|
4
5
|
|
5
6
|
/****************************************************************************
|
6
7
|
*
|
7
8
|
* FuzzyStuff
|
8
9
|
*
|
9
|
-
* The main method here is the
|
10
|
-
* another term. The other methods all act in support.
|
10
|
+
* The main method here is the fuzq_score_mn method which scores a term
|
11
|
+
* against another term. The other methods all act in support.
|
12
|
+
*
|
13
|
+
* To learn more about the fuzzy scoring algorithm see;
|
14
|
+
*
|
15
|
+
* http://en.wikipedia.org/wiki/Levenshtein_distance
|
11
16
|
*
|
12
17
|
****************************************************************************/
|
13
18
|
|
14
|
-
|
19
|
+
/**
|
20
|
+
* Calculate the maximum number of allowed edits (or maximum edit distance)
|
21
|
+
* for a word to be a match.
|
22
|
+
*
|
23
|
+
* Note that fuzq->text_len and m are both the lengths of the text *after* the prefix
|
24
|
+
* so `(MIN(fuzq->text_len, m) + fuzq->pre_len)` actually gets the byte length
|
25
|
+
* of the shorter string out of the query string and the index term being
|
26
|
+
* compared.
|
27
|
+
*/
|
28
|
+
static INLINE int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m)
|
15
29
|
{
|
16
30
|
return (int)((1.0 - fuzq->min_sim) * (MIN(fuzq->text_len, m) + fuzq->pre_len));
|
17
31
|
}
|
18
32
|
|
33
|
+
/**
|
34
|
+
* The max-distance formula gets used a lot - it needs to be calculated for
|
35
|
+
* every possible match in the index - so we cache the results for all
|
36
|
+
* lengths up to the TYPICAL_LONGEST_WORD limit. For words longer than this we
|
37
|
+
* calculate the value live.
|
38
|
+
*/
|
19
39
|
static void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
|
20
40
|
{
|
21
41
|
int i;
|
@@ -24,10 +44,79 @@ static void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
|
|
24
44
|
}
|
25
45
|
}
|
26
46
|
|
47
|
+
/**
|
48
|
+
* Return the cached max-distance value if the word is within the
|
49
|
+
* TYPICAL_LONGEST_WORD limit.
|
50
|
+
*/
|
27
51
|
static INLINE int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
|
28
52
|
{
|
29
|
-
|
30
|
-
|
53
|
+
if (m < TYPICAL_LONGEST_WORD)
|
54
|
+
return fuzq->max_distances[m];
|
55
|
+
return fuzq_calculate_max_distance(fuzq, m);
|
56
|
+
}
|
57
|
+
|
58
|
+
/**
|
59
|
+
* Calculate the similarity score for the +target+ against the query.
|
60
|
+
*
|
61
|
+
* @params fuzq The Fuzzy Query
|
62
|
+
* @params target *the term to compare against minus the prefix
|
63
|
+
* @params m the string length of +target+
|
64
|
+
* @params n the string length of the query string minus length of the prefix
|
65
|
+
*/
|
66
|
+
static INLINE float fuzq_score_mn(FuzzyQuery *fuzq,
|
67
|
+
const char *target,
|
68
|
+
const int m, const int n)
|
69
|
+
{
|
70
|
+
int i, j, prune;
|
71
|
+
int *d_curr, *d_prev;
|
72
|
+
const char *text = fuzq->text;
|
73
|
+
const int max_distance = fuzq_get_max_distance(fuzq, m);
|
74
|
+
|
75
|
+
/* Just adding the characters of m to n or vice-versa results in
|
76
|
+
* too many edits for example "pre" length is 3 and "prefixes"
|
77
|
+
* length is 8. We can see that given this optimal circumstance,
|
78
|
+
* the edit distance cannot be less than 5 which is 8-3 or more
|
79
|
+
* precisely Math.abs(3-8). If our maximum edit distance is 4,
|
80
|
+
* then we can discard this word without looking at it. */
|
81
|
+
if (max_distance < ABS(m-n)) {
|
82
|
+
return 0.0f;
|
83
|
+
}
|
84
|
+
|
85
|
+
d_curr = fuzq->da;
|
86
|
+
d_prev = d_curr + n + 1;
|
87
|
+
|
88
|
+
/* init array */
|
89
|
+
for (j = 0; j <= n; j++) {
|
90
|
+
d_curr[j] = j;
|
91
|
+
}
|
92
|
+
|
93
|
+
/* start computing edit distance */
|
94
|
+
for (i = 0; i < m;) {
|
95
|
+
char s_i = target[i];
|
96
|
+
/* swap d_current into d_prev */
|
97
|
+
int *d_tmp = d_prev;
|
98
|
+
d_prev = d_curr;
|
99
|
+
d_curr = d_tmp;
|
100
|
+
prune = (d_curr[0] = ++i) > max_distance;
|
101
|
+
|
102
|
+
for (j = 0; j < n; j++) {
|
103
|
+
d_curr[j + 1] = (s_i == text[j])
|
104
|
+
? min3(d_prev[j + 1] + 1, d_curr[j] + 1, d_prev[j])
|
105
|
+
: min3(d_prev[j + 1], d_curr[j], d_prev[j]) + 1;
|
106
|
+
if (prune && d_curr[j + 1] <= max_distance) {
|
107
|
+
prune = false;
|
108
|
+
}
|
109
|
+
}
|
110
|
+
if (prune) {
|
111
|
+
return 0.0f;
|
112
|
+
}
|
113
|
+
}
|
114
|
+
|
115
|
+
/* this will return less than 0.0 when the edit distance is greater
|
116
|
+
* than the number of characters in the shorter word. but this was
|
117
|
+
* the formula that was previously used in FuzzyTermEnum, so it has
|
118
|
+
* not been changed (even though min_sim must be greater than 0.0) */
|
119
|
+
return 1.0f - ((float)d_curr[n] / (float) (fuzq->pre_len + min2(n, m)));
|
31
120
|
}
|
32
121
|
|
33
122
|
/**
|
@@ -41,76 +130,15 @@ float fuzq_score(FuzzyQuery *fuzq, const char *target)
|
|
41
130
|
const int m = (int)strlen(target);
|
42
131
|
const int n = fuzq->text_len;
|
43
132
|
|
44
|
-
if
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
}
|
49
|
-
else if (m == 0) {
|
50
|
-
return fuzq->pre_len == 0 ? 0.0f : 1.0f - ((float) n / fuzq->pre_len);
|
51
|
-
}
|
52
|
-
else {
|
53
|
-
int i, j, prune;
|
54
|
-
int *d_curr, *d_prev;
|
55
|
-
const char *text = fuzq->text;
|
56
|
-
const int max_distance = fuzq_get_max_distance(fuzq, m);
|
57
|
-
|
58
|
-
/*
|
59
|
-
printf("n%dm%dmd%ddiff%d<%s><%s>\n", n, m, max_distance, m-n,
|
60
|
-
fuzq->text, target);
|
61
|
-
*/
|
62
|
-
if (max_distance < ((m > n) ? (m-n) : (n-m))) { /* abs */
|
63
|
-
/* Just adding the characters of m to n or vice-versa results in
|
64
|
-
* too many edits for example "pre" length is 3 and "prefixes"
|
65
|
-
* length is 8. We can see that given this optimal circumstance,
|
66
|
-
* the edit distance cannot be less than 5 which is 8-3 or more
|
67
|
-
* precisesly Math.abs(3-8). If our maximum edit distance is 4,
|
68
|
-
* then we can discard this word without looking at it. */
|
133
|
+
/* we don't have anything to compare. That means if we just add
|
134
|
+
* the letters for m we get the new word */
|
135
|
+
if (m == 0 || n == 0) {
|
136
|
+
if (fuzq->pre_len == 0)
|
69
137
|
return 0.0f;
|
70
|
-
|
71
|
-
|
72
|
-
d_curr = fuzq->da;
|
73
|
-
d_prev = d_curr + n + 1;
|
74
|
-
|
75
|
-
/* init array */
|
76
|
-
for (j = 0; j <= n; j++) {
|
77
|
-
d_curr[j] = j;
|
78
|
-
}
|
79
|
-
|
80
|
-
/* start computing edit distance */
|
81
|
-
for (i = 0; i < m;) {
|
82
|
-
char s_i = target[i];
|
83
|
-
/* swap d_current into d_prev */
|
84
|
-
int *d_tmp = d_prev;
|
85
|
-
d_prev = d_curr;
|
86
|
-
d_curr = d_tmp;
|
87
|
-
prune = (d_curr[0] = ++i) > max_distance;
|
88
|
-
|
89
|
-
for (j = 0; j < n; j++) {
|
90
|
-
d_curr[j + 1] = (s_i == text[j])
|
91
|
-
? min3(d_prev[j + 1] + 1, d_curr[j] + 1, d_prev[j])
|
92
|
-
: min3(d_prev[j + 1], d_curr[j], d_prev[j]) + 1;
|
93
|
-
if (prune && d_curr[j + 1] <= max_distance) {
|
94
|
-
prune = false;
|
95
|
-
}
|
96
|
-
}
|
97
|
-
if (prune) {
|
98
|
-
return 0.0f;
|
99
|
-
}
|
100
|
-
}
|
101
|
-
|
102
|
-
/*
|
103
|
-
printf("<%f, d_curr[n] = %d min_len = %d>",
|
104
|
-
1.0f - ((float)d_curr[m] / (float) (fuzq->pre_len + min2(n, m))),
|
105
|
-
d_curr[m], fuzq->pre_len + min2(n, m));
|
106
|
-
*/
|
107
|
-
|
108
|
-
/* this will return less than 0.0 when the edit distance is greater
|
109
|
-
* than the number of characters in the shorter word. but this was
|
110
|
-
* the formula that was previously used in FuzzyTermEnum, so it has
|
111
|
-
* not been changed (even though min_sim must be greater than 0.0) */
|
112
|
-
return 1.0f - ((float)d_curr[n] / (float) (fuzq->pre_len + min2(n, m)));
|
138
|
+
return 1.0f - ((float) (m+n) / fuzq->pre_len);
|
113
139
|
}
|
140
|
+
|
141
|
+
return fuzq_score_mn(fuzq, target, m, n);
|
114
142
|
}
|
115
143
|
|
116
144
|
/****************************************************************************
|
@@ -121,22 +149,18 @@ float fuzq_score(FuzzyQuery *fuzq, const char *target)
|
|
121
149
|
|
122
150
|
#define FzQ(query) ((FuzzyQuery *)(query))
|
123
151
|
|
124
|
-
static char *fuzq_to_s(Query *self,
|
152
|
+
static char *fuzq_to_s(Query *self, Symbol curr_field)
|
125
153
|
{
|
126
154
|
char *buffer, *bptr;
|
127
155
|
char *term = FzQ(self)->term;
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
if (strcmp(curr_field, field) != 0) {
|
134
|
-
sprintf(bptr, "%s:", field);
|
135
|
-
bptr += flen + 1;
|
156
|
+
Symbol field = FzQ(self)->field;
|
157
|
+
bptr = buffer = ALLOC_N(char, strlen(term) + sym_len(field) + 70);
|
158
|
+
|
159
|
+
if (curr_field != field) {
|
160
|
+
bptr += sprintf(bptr, "%s:", S(field));
|
136
161
|
}
|
137
162
|
|
138
|
-
sprintf(bptr, "%s~", term);
|
139
|
-
bptr += tlen + 1;
|
163
|
+
bptr += sprintf(bptr, "%s~", term);
|
140
164
|
if (FzQ(self)->min_sim != 0.5) {
|
141
165
|
dbl_to_s(bptr, FzQ(self)->min_sim);
|
142
166
|
bptr += strlen(bptr);
|
@@ -155,77 +179,65 @@ static Query *fuzq_rewrite(Query *self, IndexReader *ir)
|
|
155
179
|
Query *q;
|
156
180
|
FuzzyQuery *fuzq = FzQ(self);
|
157
181
|
|
182
|
+
int pre_len = fuzq->pre_len;
|
183
|
+
char *prefix = NULL;
|
158
184
|
const char *term = fuzq->term;
|
159
|
-
const
|
160
|
-
|
185
|
+
const int field_num = fis_get_field_num(ir->fis, fuzq->field);
|
186
|
+
TermEnum *te;
|
161
187
|
|
162
188
|
if (field_num < 0) {
|
163
|
-
|
189
|
+
return bq_new(true);
|
164
190
|
}
|
165
|
-
|
166
|
-
|
191
|
+
if (fuzq->pre_len >= (int)strlen(term)) {
|
192
|
+
return tq_new(fuzq->field, term);
|
167
193
|
}
|
168
|
-
else {
|
169
|
-
TermEnum *te;
|
170
|
-
char *prefix = NULL;
|
171
|
-
int pre_len = fuzq->pre_len;
|
172
|
-
|
173
|
-
q = multi_tq_new_conf(fuzq->field, MTQMaxTerms(self), fuzq->min_sim);
|
174
|
-
|
175
|
-
if (pre_len > 0) {
|
176
|
-
prefix = ALLOC_N(char, pre_len + 1);
|
177
|
-
strncpy(prefix, term, pre_len);
|
178
|
-
prefix[pre_len] = '\0';
|
179
|
-
te = ir->terms_from(ir, field_num, prefix);
|
180
|
-
}
|
181
|
-
else {
|
182
|
-
te = ir->terms(ir, field_num);
|
183
|
-
}
|
184
194
|
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
+
q = multi_tq_new_conf(fuzq->field, MTQMaxTerms(self), fuzq->min_sim);
|
196
|
+
if (pre_len > 0) {
|
197
|
+
prefix = ALLOC_N(char, pre_len + 1);
|
198
|
+
strncpy(prefix, term, pre_len);
|
199
|
+
prefix[pre_len] = '\0';
|
200
|
+
te = ir->terms_from(ir, field_num, prefix);
|
201
|
+
}
|
202
|
+
else {
|
203
|
+
te = ir->terms(ir, field_num);
|
204
|
+
}
|
195
205
|
|
206
|
+
assert(NULL != te);
|
196
207
|
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
208
|
+
fuzq->scale_factor = (float)(1.0 / (1.0 - fuzq->min_sim));
|
209
|
+
fuzq->text = term + pre_len;
|
210
|
+
fuzq->text_len = (int)strlen(fuzq->text);
|
211
|
+
fuzq->da = REALLOC_N(fuzq->da, int, fuzq->text_len * 2 + 2);
|
212
|
+
fuzq_initialize_max_distances(fuzq);
|
201
213
|
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
multi_tq_add_term_boost(q, curr_term, score);
|
214
|
+
do {
|
215
|
+
const char *curr_term = te->curr_term;
|
216
|
+
const char *curr_suffix = curr_term + pre_len;
|
217
|
+
float score = 0.0;
|
207
218
|
|
208
|
-
|
219
|
+
if (prefix && strncmp(curr_term, prefix, pre_len) != 0)
|
220
|
+
break;
|
209
221
|
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
}
|
222
|
+
score = fuzq_score(fuzq, curr_suffix);
|
223
|
+
multi_tq_add_term_boost(q, curr_term, score);
|
224
|
+
} while (te->next(te) != NULL);
|
214
225
|
|
226
|
+
te->close(te);
|
227
|
+
if (prefix) free(prefix);
|
215
228
|
return q;
|
216
229
|
}
|
217
230
|
|
218
231
|
static void fuzq_destroy(Query *self)
|
219
232
|
{
|
220
233
|
free(FzQ(self)->term);
|
221
|
-
free(FzQ(self)->field);
|
222
234
|
free(FzQ(self)->da);
|
223
235
|
q_destroy_i(self);
|
224
236
|
}
|
225
237
|
|
226
238
|
static unsigned long fuzq_hash(Query *self)
|
227
239
|
{
|
228
|
-
return str_hash(FzQ(self)->term) ^
|
240
|
+
return str_hash(FzQ(self)->term) ^ sym_hash(FzQ(self)->field)
|
229
241
|
^ float2int(FzQ(self)->min_sim) ^ FzQ(self)->pre_len;
|
230
242
|
}
|
231
243
|
|
@@ -235,17 +247,17 @@ static int fuzq_eq(Query *self, Query *o)
|
|
235
247
|
FuzzyQuery *fq2 = FzQ(o);
|
236
248
|
|
237
249
|
return (strcmp(fq1->term, fq2->term) == 0)
|
238
|
-
&& (
|
250
|
+
&& (fq1->field == fq2->field)
|
239
251
|
&& (fq1->pre_len == fq2->pre_len)
|
240
252
|
&& (fq1->min_sim == fq2->min_sim);
|
241
253
|
}
|
242
254
|
|
243
|
-
Query *fuzq_new_conf(
|
255
|
+
Query *fuzq_new_conf(Symbol field, const char *term,
|
244
256
|
float min_sim, int pre_len, int max_terms)
|
245
257
|
{
|
246
258
|
Query *self = q_new(FuzzyQuery);
|
247
259
|
|
248
|
-
FzQ(self)->field =
|
260
|
+
FzQ(self)->field = field;
|
249
261
|
FzQ(self)->term = estrdup(term);
|
250
262
|
FzQ(self)->pre_len = pre_len ? pre_len : DEF_PRE_LEN;
|
251
263
|
FzQ(self)->min_sim = min_sim ? min_sim : DEF_MIN_SIM;
|
@@ -262,7 +274,7 @@ Query *fuzq_new_conf(const char *field, const char *term,
|
|
262
274
|
return self;
|
263
275
|
}
|
264
276
|
|
265
|
-
Query *fuzq_new(
|
277
|
+
Query *fuzq_new(Symbol field, const char *term)
|
266
278
|
{
|
267
279
|
return fuzq_new_conf(field, term, 0.0f, 0, 0);
|
268
280
|
}
|
data/ext/q_match_all.c
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
#include "search.h"
|
2
2
|
#include <string.h>
|
3
|
+
#include "internal.h"
|
3
4
|
|
4
5
|
/***************************************************************************
|
5
6
|
*
|
@@ -110,9 +111,9 @@ static Weight *maw_new(Query *query, Searcher *searcher)
|
|
110
111
|
*
|
111
112
|
***************************************************************************/
|
112
113
|
|
113
|
-
char *maq_to_s(Query *self,
|
114
|
+
static char *maq_to_s(Query *self, Symbol default_field)
|
114
115
|
{
|
115
|
-
(void)
|
116
|
+
(void)default_field;
|
116
117
|
if (self->boost == 1.0) {
|
117
118
|
return estrdup("*");
|
118
119
|
} else {
|
data/ext/q_multi_term.c
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
#include <string.h>
|
2
2
|
#include "search.h"
|
3
|
-
#include "priorityqueue.h"
|
4
3
|
#include "helper.h"
|
4
|
+
#include "symbol.h"
|
5
|
+
#include "internal.h"
|
5
6
|
|
6
7
|
#define MTQ(query) ((MultiTermQuery *)(query))
|
7
8
|
|
@@ -141,7 +142,7 @@ static TermDocEnumWrapper *tdew_new(const char *term, TermDocEnum *tde,
|
|
141
142
|
typedef struct MultiTermScorer
|
142
143
|
{
|
143
144
|
Scorer super;
|
144
|
-
|
145
|
+
Symbol field;
|
145
146
|
uchar *norms;
|
146
147
|
Weight *weight;
|
147
148
|
TermDocEnumWrapper **tdew_a;
|
@@ -176,7 +177,7 @@ static bool multi_tsc_next(Scorer *self)
|
|
176
177
|
}
|
177
178
|
mtsc->tdew_pq = tdew_pq;
|
178
179
|
}
|
179
|
-
|
180
|
+
|
180
181
|
tdew = (TermDocEnumWrapper *)pq_top(tdew_pq);
|
181
182
|
if (tdew == NULL) {
|
182
183
|
return false;
|
@@ -259,7 +260,7 @@ static Explanation *multi_tsc_explain(Scorer *self, int doc_num)
|
|
259
260
|
expl_add_detail(expl,
|
260
261
|
expl_new(sim_tf(self->similarity, (float)freq) * tdew->boost,
|
261
262
|
"tf(term_freq(%s:%s)=%d)^%f",
|
262
|
-
mtsc->field, tdew->term, freq, tdew->boost));
|
263
|
+
S(mtsc->field), tdew->term, freq, tdew->boost));
|
263
264
|
|
264
265
|
total_score += sim_tf(self->similarity, (float)freq) * tdew->boost;
|
265
266
|
|
@@ -294,7 +295,7 @@ static void multi_tsc_destroy(Scorer *self)
|
|
294
295
|
scorer_destroy_i(self);
|
295
296
|
}
|
296
297
|
|
297
|
-
static Scorer *multi_tsc_new(Weight *weight,
|
298
|
+
static Scorer *multi_tsc_new(Weight *weight, Symbol field,
|
298
299
|
TermDocEnumWrapper **tdew_a, int tdew_cnt,
|
299
300
|
uchar *norms)
|
300
301
|
{
|
@@ -367,7 +368,7 @@ static Scorer *multi_tw_scorer(Weight *self, IndexReader *ir)
|
|
367
368
|
return multi_tsc;
|
368
369
|
}
|
369
370
|
|
370
|
-
Explanation *multi_tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
371
|
+
static Explanation *multi_tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
371
372
|
{
|
372
373
|
Explanation *expl;
|
373
374
|
Explanation *idf_expl1;
|
@@ -383,19 +384,20 @@ Explanation *multi_tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
383
384
|
|
384
385
|
char *query_str;
|
385
386
|
MultiTermQuery *mtq = MTQ(self->query);
|
386
|
-
const char *field = mtq->field;
|
387
|
+
const char *field = S(mtq->field);
|
387
388
|
PriorityQueue *bt_pq = mtq->boosted_terms;
|
388
389
|
int i;
|
389
390
|
int total_doc_freqs = 0;
|
390
391
|
char *doc_freqs = NULL;
|
391
392
|
size_t len = 0, pos = 0;
|
392
|
-
const int field_num = fis_get_field_num(ir->fis, field);
|
393
|
+
const int field_num = fis_get_field_num(ir->fis, mtq->field);
|
393
394
|
|
394
395
|
if (field_num < 0) {
|
395
|
-
return expl_new(0.0, "field \"%s\" does not exist in the index",
|
396
|
+
return expl_new(0.0, "field \"%s\" does not exist in the index",
|
397
|
+
field);
|
396
398
|
}
|
397
|
-
|
398
|
-
query_str = self->query->to_s(self->query,
|
399
|
+
|
400
|
+
query_str = self->query->to_s(self->query, NULL);
|
399
401
|
|
400
402
|
expl = expl_new(0.0, "weight(%s in %d), product of:", query_str, doc_num);
|
401
403
|
|
@@ -407,8 +409,7 @@ Explanation *multi_tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
407
409
|
for (i = bt_pq->size; i > 0; i--) {
|
408
410
|
char *term = ((BoostedTerm *)bt_pq->heap[i])->term;
|
409
411
|
int doc_freq = ir->doc_freq(ir, field_num, term);
|
410
|
-
sprintf(doc_freqs + pos, "(%s=%d) + ", term, doc_freq);
|
411
|
-
pos += strlen(doc_freqs + pos);
|
412
|
+
pos += sprintf(doc_freqs + pos, "(%s=%d) + ", term, doc_freq);
|
412
413
|
total_doc_freqs += doc_freq;
|
413
414
|
}
|
414
415
|
pos -= 2; /* remove " + " from the end */
|
@@ -476,7 +477,6 @@ static Weight *multi_tw_new(Query *query, Searcher *searcher)
|
|
476
477
|
int i;
|
477
478
|
int doc_freq = 0;
|
478
479
|
Weight *self = w_new(Weight, query);
|
479
|
-
const char *field = MTQ(query)->field;
|
480
480
|
PriorityQueue *bt_pq = MTQ(query)->boosted_terms;
|
481
481
|
|
482
482
|
self->scorer = &multi_tw_scorer;
|
@@ -488,7 +488,7 @@ static Weight *multi_tw_new(Query *query, Searcher *searcher)
|
|
488
488
|
self->idf = 0.0;
|
489
489
|
|
490
490
|
for (i = bt_pq->size; i > 0; i--) {
|
491
|
-
doc_freq += searcher->doc_freq(searcher, field,
|
491
|
+
doc_freq += searcher->doc_freq(searcher, MTQ(query)->field,
|
492
492
|
((BoostedTerm *)bt_pq->heap[i])->term);
|
493
493
|
}
|
494
494
|
self->idf += sim_idf(self->similarity, doc_freq,
|
@@ -502,13 +502,13 @@ static Weight *multi_tw_new(Query *query, Searcher *searcher)
|
|
502
502
|
* MultiTermQuery
|
503
503
|
***************************************************************************/
|
504
504
|
|
505
|
-
static char *multi_tq_to_s(Query *self,
|
505
|
+
static char *multi_tq_to_s(Query *self, Symbol default_field)
|
506
506
|
{
|
507
507
|
int i;
|
508
508
|
PriorityQueue *boosted_terms = MTQ(self)->boosted_terms, *bt_pq_clone;
|
509
509
|
BoostedTerm *bt;
|
510
510
|
char *buffer, *bptr;
|
511
|
-
char *field = MTQ(self)->field;
|
511
|
+
const char *field = S(MTQ(self)->field);
|
512
512
|
int flen = (int)strlen(field);
|
513
513
|
int tlen = 0;
|
514
514
|
|
@@ -519,16 +519,14 @@ static char *multi_tq_to_s(Query *self, const char *curr_field)
|
|
519
519
|
|
520
520
|
bptr = buffer = ALLOC_N(char, tlen + flen + 35);
|
521
521
|
|
522
|
-
if (
|
523
|
-
sprintf(bptr, "%s:", field);
|
524
|
-
bptr += flen + 1;
|
522
|
+
if (default_field != MTQ(self)->field) {
|
523
|
+
bptr += sprintf(bptr, "%s:", field);
|
525
524
|
}
|
526
525
|
|
527
526
|
*(bptr++) = '"';
|
528
527
|
bt_pq_clone = pq_clone(boosted_terms);
|
529
528
|
while ((bt = (BoostedTerm *)pq_pop(bt_pq_clone)) != NULL) {
|
530
|
-
sprintf(bptr, "%s", bt->term);
|
531
|
-
bptr += (int)strlen(bptr);
|
529
|
+
bptr += sprintf(bptr, "%s", bt->term);
|
532
530
|
|
533
531
|
if (bt->boost != 1.0) {
|
534
532
|
*bptr = '^';
|
@@ -545,7 +543,7 @@ static char *multi_tq_to_s(Query *self, const char *curr_field)
|
|
545
543
|
}
|
546
544
|
bptr[-1] = '"'; /* delete last '|' char */
|
547
545
|
bptr[ 0] = '\0';
|
548
|
-
|
546
|
+
|
549
547
|
if (self->boost != 1.0) {
|
550
548
|
*bptr = '^';
|
551
549
|
dbl_to_s(++bptr, self->boost);
|
@@ -556,7 +554,6 @@ static char *multi_tq_to_s(Query *self, const char *curr_field)
|
|
556
554
|
|
557
555
|
static void multi_tq_destroy_i(Query *self)
|
558
556
|
{
|
559
|
-
free(MTQ(self)->field);
|
560
557
|
pq_destroy(MTQ(self)->boosted_terms);
|
561
558
|
q_destroy_i(self);
|
562
559
|
}
|
@@ -564,18 +561,17 @@ static void multi_tq_destroy_i(Query *self)
|
|
564
561
|
static void multi_tq_extract_terms(Query *self, HashSet *terms)
|
565
562
|
{
|
566
563
|
int i;
|
567
|
-
char *field = MTQ(self)->field;
|
568
564
|
PriorityQueue *boosted_terms = MTQ(self)->boosted_terms;
|
569
565
|
for (i = boosted_terms->size; i > 0; i--) {
|
570
566
|
BoostedTerm *bt = (BoostedTerm *)boosted_terms->heap[i];
|
571
|
-
hs_add(terms, term_new(field, bt->term));
|
567
|
+
hs_add(terms, term_new(MTQ(self)->field, bt->term));
|
572
568
|
}
|
573
569
|
}
|
574
570
|
|
575
571
|
static unsigned long multi_tq_hash(Query *self)
|
576
572
|
{
|
577
573
|
int i;
|
578
|
-
unsigned long hash =
|
574
|
+
unsigned long hash = sym_hash(MTQ(self)->field);
|
579
575
|
PriorityQueue *boosted_terms = MTQ(self)->boosted_terms;
|
580
576
|
for (i = boosted_terms->size; i > 0; i--) {
|
581
577
|
BoostedTerm *bt = (BoostedTerm *)boosted_terms->heap[i];
|
@@ -590,7 +586,7 @@ static int multi_tq_eq(Query *self, Query *o)
|
|
590
586
|
PriorityQueue *boosted_terms1 = MTQ(self)->boosted_terms;
|
591
587
|
PriorityQueue *boosted_terms2 = MTQ(o)->boosted_terms;
|
592
588
|
|
593
|
-
if (
|
589
|
+
if ((MTQ(self)->field != MTQ(o)->field)
|
594
590
|
|| boosted_terms1->size != boosted_terms2->size) {
|
595
591
|
return false;
|
596
592
|
}
|
@@ -607,7 +603,7 @@ static int multi_tq_eq(Query *self, Query *o)
|
|
607
603
|
static MatchVector *multi_tq_get_matchv_i(Query *self, MatchVector *mv,
|
608
604
|
TermVector *tv)
|
609
605
|
{
|
610
|
-
if (
|
606
|
+
if (tv->field == MTQ(self)->field) {
|
611
607
|
int i;
|
612
608
|
PriorityQueue *boosted_terms = MTQ(self)->boosted_terms;
|
613
609
|
for (i = boosted_terms->size; i > 0; i--) {
|
@@ -625,7 +621,7 @@ static MatchVector *multi_tq_get_matchv_i(Query *self, MatchVector *mv,
|
|
625
621
|
return mv;
|
626
622
|
}
|
627
623
|
|
628
|
-
Query *multi_tq_new_conf(
|
624
|
+
Query *multi_tq_new_conf(Symbol field, int max_terms, float min_boost)
|
629
625
|
{
|
630
626
|
Query *self;
|
631
627
|
|
@@ -636,7 +632,7 @@ Query *multi_tq_new_conf(const char *field, int max_terms, float min_boost)
|
|
636
632
|
|
637
633
|
self = q_new(MultiTermQuery);
|
638
634
|
|
639
|
-
MTQ(self)->field =
|
635
|
+
MTQ(self)->field = field;
|
640
636
|
MTQ(self)->boosted_terms = pq_new(max_terms,
|
641
637
|
(lt_ft)&boosted_term_less_than,
|
642
638
|
(free_ft)&boosted_term_destroy);
|
@@ -654,7 +650,7 @@ Query *multi_tq_new_conf(const char *field, int max_terms, float min_boost)
|
|
654
650
|
return self;
|
655
651
|
}
|
656
652
|
|
657
|
-
Query *multi_tq_new(
|
653
|
+
Query *multi_tq_new(Symbol field)
|
658
654
|
{
|
659
655
|
return multi_tq_new_conf(field, MULTI_TERM_QUERY_MAX_TERMS, 0.0);
|
660
656
|
}
|