ferret 0.11.6 → 0.11.8.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (185) hide show
  1. data/README +10 -22
  2. data/RELEASE_CHANGES +137 -0
  3. data/RELEASE_NOTES +60 -0
  4. data/Rakefile +379 -274
  5. data/TODO +100 -8
  6. data/bin/ferret-browser +0 -0
  7. data/ext/BZLIB_blocksort.c +1094 -0
  8. data/ext/BZLIB_bzlib.c +1578 -0
  9. data/ext/BZLIB_compress.c +672 -0
  10. data/ext/BZLIB_crctable.c +104 -0
  11. data/ext/BZLIB_decompress.c +626 -0
  12. data/ext/BZLIB_huffman.c +205 -0
  13. data/ext/BZLIB_randtable.c +84 -0
  14. data/ext/{api.c → STEMMER_api.c} +7 -10
  15. data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
  16. data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
  17. data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
  18. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  19. data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
  20. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  21. data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
  22. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  25. data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
  26. data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
  27. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  29. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  30. data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
  31. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  32. data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
  33. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  34. data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
  35. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  36. data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
  37. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  38. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  39. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  40. data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
  41. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  42. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  43. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  44. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  45. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  46. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  47. data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
  48. data/ext/analysis.c +276 -121
  49. data/ext/analysis.h +190 -143
  50. data/ext/api.h +3 -4
  51. data/ext/array.c +5 -3
  52. data/ext/array.h +52 -43
  53. data/ext/bitvector.c +38 -482
  54. data/ext/bitvector.h +446 -124
  55. data/ext/bzlib.h +282 -0
  56. data/ext/bzlib_private.h +503 -0
  57. data/ext/compound_io.c +23 -22
  58. data/ext/config.h +21 -11
  59. data/ext/document.c +43 -40
  60. data/ext/document.h +31 -21
  61. data/ext/except.c +20 -38
  62. data/ext/except.h +89 -76
  63. data/ext/extconf.rb +3 -2
  64. data/ext/ferret.c +49 -35
  65. data/ext/ferret.h +14 -11
  66. data/ext/field_index.c +262 -0
  67. data/ext/field_index.h +52 -0
  68. data/ext/filter.c +11 -10
  69. data/ext/fs_store.c +65 -47
  70. data/ext/global.c +245 -165
  71. data/ext/global.h +252 -54
  72. data/ext/hash.c +200 -243
  73. data/ext/hash.h +205 -163
  74. data/ext/hashset.c +118 -96
  75. data/ext/hashset.h +110 -82
  76. data/ext/header.h +19 -19
  77. data/ext/helper.c +11 -10
  78. data/ext/helper.h +14 -6
  79. data/ext/index.c +745 -366
  80. data/ext/index.h +503 -529
  81. data/ext/internal.h +1020 -0
  82. data/ext/lang.c +10 -0
  83. data/ext/lang.h +35 -15
  84. data/ext/mempool.c +5 -4
  85. data/ext/mempool.h +30 -22
  86. data/ext/modules.h +35 -7
  87. data/ext/multimapper.c +43 -2
  88. data/ext/multimapper.h +32 -23
  89. data/ext/posh.c +0 -0
  90. data/ext/posh.h +4 -38
  91. data/ext/priorityqueue.c +10 -12
  92. data/ext/priorityqueue.h +33 -21
  93. data/ext/q_boolean.c +22 -9
  94. data/ext/q_const_score.c +3 -2
  95. data/ext/q_filtered_query.c +15 -12
  96. data/ext/q_fuzzy.c +147 -135
  97. data/ext/q_match_all.c +3 -2
  98. data/ext/q_multi_term.c +28 -32
  99. data/ext/q_parser.c +451 -173
  100. data/ext/q_phrase.c +158 -79
  101. data/ext/q_prefix.c +16 -18
  102. data/ext/q_range.c +363 -31
  103. data/ext/q_span.c +130 -141
  104. data/ext/q_term.c +21 -21
  105. data/ext/q_wildcard.c +19 -23
  106. data/ext/r_analysis.c +369 -242
  107. data/ext/r_index.c +421 -434
  108. data/ext/r_qparser.c +142 -92
  109. data/ext/r_search.c +790 -407
  110. data/ext/r_store.c +44 -44
  111. data/ext/r_utils.c +264 -96
  112. data/ext/ram_store.c +29 -23
  113. data/ext/scanner.c +895 -0
  114. data/ext/scanner.h +36 -0
  115. data/ext/scanner_mb.c +6701 -0
  116. data/ext/scanner_utf8.c +4415 -0
  117. data/ext/search.c +210 -87
  118. data/ext/search.h +556 -488
  119. data/ext/similarity.c +17 -16
  120. data/ext/similarity.h +51 -44
  121. data/ext/sort.c +157 -354
  122. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  123. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  124. data/ext/stem_UTF_8_hungarian.h +16 -0
  125. data/ext/stem_UTF_8_romanian.h +16 -0
  126. data/ext/stem_UTF_8_turkish.h +16 -0
  127. data/ext/stopwords.c +287 -278
  128. data/ext/store.c +57 -51
  129. data/ext/store.h +308 -286
  130. data/ext/symbol.c +10 -0
  131. data/ext/symbol.h +23 -0
  132. data/ext/term_vectors.c +14 -293
  133. data/ext/threading.h +22 -22
  134. data/ext/win32.h +12 -4
  135. data/lib/ferret.rb +2 -1
  136. data/lib/ferret/browser.rb +1 -1
  137. data/lib/ferret/field_symbol.rb +94 -0
  138. data/lib/ferret/index.rb +221 -34
  139. data/lib/ferret/number_tools.rb +6 -6
  140. data/lib/ferret/version.rb +3 -0
  141. data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
  142. data/test/test_helper.rb +7 -2
  143. data/test/test_installed.rb +1 -0
  144. data/test/threading/thread_safety_index_test.rb +10 -1
  145. data/test/threading/thread_safety_read_write_test.rb +4 -7
  146. data/test/threading/thread_safety_test.rb +0 -0
  147. data/test/unit/analysis/tc_analyzer.rb +29 -27
  148. data/test/unit/analysis/tc_token_stream.rb +23 -16
  149. data/test/unit/index/tc_index.rb +116 -11
  150. data/test/unit/index/tc_index_reader.rb +27 -27
  151. data/test/unit/index/tc_index_writer.rb +10 -0
  152. data/test/unit/index/th_doc.rb +38 -21
  153. data/test/unit/search/tc_filter.rb +31 -10
  154. data/test/unit/search/tc_index_searcher.rb +6 -0
  155. data/test/unit/search/tm_searcher.rb +53 -1
  156. data/test/unit/store/tc_fs_store.rb +40 -2
  157. data/test/unit/store/tc_ram_store.rb +0 -0
  158. data/test/unit/store/tm_store.rb +0 -0
  159. data/test/unit/store/tm_store_lock.rb +7 -6
  160. data/test/unit/tc_field_symbol.rb +26 -0
  161. data/test/unit/ts_analysis.rb +0 -0
  162. data/test/unit/ts_index.rb +0 -0
  163. data/test/unit/ts_store.rb +0 -0
  164. data/test/unit/ts_utils.rb +0 -0
  165. data/test/unit/utils/tc_number_tools.rb +0 -0
  166. data/test/utils/content_generator.rb +226 -0
  167. metadata +262 -221
  168. data/ext/inc/lang.h +0 -48
  169. data/ext/inc/threading.h +0 -31
  170. data/ext/stem_ISO_8859_1_english.c +0 -1156
  171. data/ext/stem_ISO_8859_1_french.c +0 -1276
  172. data/ext/stem_ISO_8859_1_italian.c +0 -1091
  173. data/ext/stem_ISO_8859_1_norwegian.c +0 -296
  174. data/ext/stem_ISO_8859_1_spanish.c +0 -1119
  175. data/ext/stem_ISO_8859_1_swedish.c +0 -307
  176. data/ext/stem_UTF_8_danish.c +0 -344
  177. data/ext/stem_UTF_8_english.c +0 -1176
  178. data/ext/stem_UTF_8_french.c +0 -1296
  179. data/ext/stem_UTF_8_italian.c +0 -1113
  180. data/ext/stem_UTF_8_norwegian.c +0 -302
  181. data/ext/stem_UTF_8_portuguese.c +0 -1055
  182. data/ext/stem_UTF_8_russian.c +0 -709
  183. data/ext/stem_UTF_8_spanish.c +0 -1137
  184. data/ext/stem_UTF_8_swedish.c +0 -313
  185. data/lib/ferret_version.rb +0 -3
@@ -1,21 +1,41 @@
1
1
  #include <string.h>
2
2
  #include "search.h"
3
3
  #include "helper.h"
4
+ #include "internal.h"
4
5
 
5
6
  /****************************************************************************
6
7
  *
7
8
  * FuzzyStuff
8
9
  *
9
- * The main method here is the fuzq_score method which scores a term against
10
- * another term. The other methods all act in support.
10
+ * The main method here is the fuzq_score_mn method which scores a term
11
+ * against another term. The other methods all act in support.
12
+ *
13
+ * To learn more about the fuzzy scoring algorithm see:
14
+ *
15
+ * http://en.wikipedia.org/wiki/Levenshtein_distance
11
16
  *
12
17
  ****************************************************************************/
13
18
 
14
- static INLINE int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m)
19
+ /**
20
+ * Calculate the maximum number of allowed edits (or maximum edit distance)
21
+ * for a word to be a match.
22
+ *
23
+ * Note that fuzq->text_len and m are both the lengths of the text *after* the prefix
24
+ * so `MIN(fuzq->text_len, m) + fuzq->pre_len` actually gets the byte length
25
+ * of the shorter string out of the query string and the index term being
26
+ * compared.
27
+ */
28
+ static INLINE int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m)
15
29
  {
16
30
  return (int)((1.0 - fuzq->min_sim) * (MIN(fuzq->text_len, m) + fuzq->pre_len));
17
31
  }
18
32
 
33
+ /**
34
+ * The max-distance formula gets used a lot - it needs to be calculated for
35
+ * every possible match in the index - so we cache the results for all
36
+ * lengths up to the TYPICAL_LONGEST_WORD limit. For words longer than this we
37
+ * calculate the value live.
38
+ */
19
39
  static void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
20
40
  {
21
41
  int i;
@@ -24,10 +44,79 @@ static void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
24
44
  }
25
45
  }
26
46
 
47
+ /**
48
+ * Return the cached max-distance value if the word is within the
49
+ * TYPICAL_LONGEST_WORD limit.
50
+ */
27
51
  static INLINE int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
28
52
  {
29
- return (m < TYPICAL_LONGEST_WORD) ? fuzq->max_distances[m]
30
- : fuzq_calculate_max_distance(fuzq, m);
53
+ if (m < TYPICAL_LONGEST_WORD)
54
+ return fuzq->max_distances[m];
55
+ return fuzq_calculate_max_distance(fuzq, m);
56
+ }
57
+
58
+ /**
59
+ * Calculate the similarity score for the +target+ against the query.
60
+ *
61
+ * @params fuzq The Fuzzy Query
62
+ * @params target the term to compare against minus the prefix
63
+ * @params m the string length of +target+
64
+ * @params n the string length of the query string minus length of the prefix
65
+ */
66
+ static INLINE float fuzq_score_mn(FuzzyQuery *fuzq,
67
+ const char *target,
68
+ const int m, const int n)
69
+ {
70
+ int i, j, prune;
71
+ int *d_curr, *d_prev;
72
+ const char *text = fuzq->text;
73
+ const int max_distance = fuzq_get_max_distance(fuzq, m);
74
+
75
+ /* Just adding the characters of m to n or vice-versa results in
76
+ * too many edits for example "pre" length is 3 and "prefixes"
77
+ * length is 8. We can see that given this optimal circumstance,
78
+ * the edit distance cannot be less than 5 which is 8-3 or more
79
+ * precisely Math.abs(3-8). If our maximum edit distance is 4,
80
+ * then we can discard this word without looking at it. */
81
+ if (max_distance < ABS(m-n)) {
82
+ return 0.0f;
83
+ }
84
+
85
+ d_curr = fuzq->da;
86
+ d_prev = d_curr + n + 1;
87
+
88
+ /* init array */
89
+ for (j = 0; j <= n; j++) {
90
+ d_curr[j] = j;
91
+ }
92
+
93
+ /* start computing edit distance */
94
+ for (i = 0; i < m;) {
95
+ char s_i = target[i];
96
+ /* swap d_current into d_prev */
97
+ int *d_tmp = d_prev;
98
+ d_prev = d_curr;
99
+ d_curr = d_tmp;
100
+ prune = (d_curr[0] = ++i) > max_distance;
101
+
102
+ for (j = 0; j < n; j++) {
103
+ d_curr[j + 1] = (s_i == text[j])
104
+ ? min3(d_prev[j + 1] + 1, d_curr[j] + 1, d_prev[j])
105
+ : min3(d_prev[j + 1], d_curr[j], d_prev[j]) + 1;
106
+ if (prune && d_curr[j + 1] <= max_distance) {
107
+ prune = false;
108
+ }
109
+ }
110
+ if (prune) {
111
+ return 0.0f;
112
+ }
113
+ }
114
+
115
+ /* this will return less than 0.0 when the edit distance is greater
116
+ * than the number of characters in the shorter word. but this was
117
+ * the formula that was previously used in FuzzyTermEnum, so it has
118
+ * not been changed (even though min_sim must be greater than 0.0) */
119
+ return 1.0f - ((float)d_curr[n] / (float) (fuzq->pre_len + min2(n, m)));
31
120
  }
32
121
 
33
122
  /**
@@ -41,76 +130,15 @@ float fuzq_score(FuzzyQuery *fuzq, const char *target)
41
130
  const int m = (int)strlen(target);
42
131
  const int n = fuzq->text_len;
43
132
 
44
- if (n == 0) {
45
- /* we don't have anything to compare. That means if we just add
46
- * the letters for m we get the new word */
47
- return fuzq->pre_len == 0 ? 0.0f : 1.0f - ((float) m / fuzq->pre_len);
48
- }
49
- else if (m == 0) {
50
- return fuzq->pre_len == 0 ? 0.0f : 1.0f - ((float) n / fuzq->pre_len);
51
- }
52
- else {
53
- int i, j, prune;
54
- int *d_curr, *d_prev;
55
- const char *text = fuzq->text;
56
- const int max_distance = fuzq_get_max_distance(fuzq, m);
57
-
58
- /*
59
- printf("n%dm%dmd%ddiff%d<%s><%s>\n", n, m, max_distance, m-n,
60
- fuzq->text, target);
61
- */
62
- if (max_distance < ((m > n) ? (m-n) : (n-m))) { /* abs */
63
- /* Just adding the characters of m to n or vice-versa results in
64
- * too many edits for example "pre" length is 3 and "prefixes"
65
- * length is 8. We can see that given this optimal circumstance,
66
- * the edit distance cannot be less than 5 which is 8-3 or more
67
- * precisely Math.abs(3-8). If our maximum edit distance is 4,
68
- * then we can discard this word without looking at it. */
133
+ /* we don't have anything to compare. That means if we just add
134
+ * the letters for m we get the new word */
135
+ if (m == 0 || n == 0) {
136
+ if (fuzq->pre_len == 0)
69
137
  return 0.0f;
70
- }
71
-
72
- d_curr = fuzq->da;
73
- d_prev = d_curr + n + 1;
74
-
75
- /* init array */
76
- for (j = 0; j <= n; j++) {
77
- d_curr[j] = j;
78
- }
79
-
80
- /* start computing edit distance */
81
- for (i = 0; i < m;) {
82
- char s_i = target[i];
83
- /* swap d_current into d_prev */
84
- int *d_tmp = d_prev;
85
- d_prev = d_curr;
86
- d_curr = d_tmp;
87
- prune = (d_curr[0] = ++i) > max_distance;
88
-
89
- for (j = 0; j < n; j++) {
90
- d_curr[j + 1] = (s_i == text[j])
91
- ? min3(d_prev[j + 1] + 1, d_curr[j] + 1, d_prev[j])
92
- : min3(d_prev[j + 1], d_curr[j], d_prev[j]) + 1;
93
- if (prune && d_curr[j + 1] <= max_distance) {
94
- prune = false;
95
- }
96
- }
97
- if (prune) {
98
- return 0.0f;
99
- }
100
- }
101
-
102
- /*
103
- printf("<%f, d_curr[n] = %d min_len = %d>",
104
- 1.0f - ((float)d_curr[m] / (float) (fuzq->pre_len + min2(n, m))),
105
- d_curr[m], fuzq->pre_len + min2(n, m));
106
- */
107
-
108
- /* this will return less than 0.0 when the edit distance is greater
109
- * than the number of characters in the shorter word. but this was
110
- * the formula that was previously used in FuzzyTermEnum, so it has
111
- * not been changed (even though min_sim must be greater than 0.0) */
112
- return 1.0f - ((float)d_curr[n] / (float) (fuzq->pre_len + min2(n, m)));
138
+ return 1.0f - ((float) (m+n) / fuzq->pre_len);
113
139
  }
140
+
141
+ return fuzq_score_mn(fuzq, target, m, n);
114
142
  }
115
143
 
116
144
  /****************************************************************************
@@ -121,22 +149,18 @@ float fuzq_score(FuzzyQuery *fuzq, const char *target)
121
149
 
122
150
  #define FzQ(query) ((FuzzyQuery *)(query))
123
151
 
124
- static char *fuzq_to_s(Query *self, const char *curr_field)
152
+ static char *fuzq_to_s(Query *self, Symbol curr_field)
125
153
  {
126
154
  char *buffer, *bptr;
127
155
  char *term = FzQ(self)->term;
128
- char *field = FzQ(self)->field;
129
- int tlen = (int)strlen(term);
130
- int flen = (int)strlen(field);
131
- bptr = buffer = ALLOC_N(char, tlen + flen + 70);
132
-
133
- if (strcmp(curr_field, field) != 0) {
134
- sprintf(bptr, "%s:", field);
135
- bptr += flen + 1;
156
+ Symbol field = FzQ(self)->field;
157
+ bptr = buffer = ALLOC_N(char, strlen(term) + sym_len(field) + 70);
158
+
159
+ if (curr_field != field) {
160
+ bptr += sprintf(bptr, "%s:", S(field));
136
161
  }
137
162
 
138
- sprintf(bptr, "%s~", term);
139
- bptr += tlen + 1;
163
+ bptr += sprintf(bptr, "%s~", term);
140
164
  if (FzQ(self)->min_sim != 0.5) {
141
165
  dbl_to_s(bptr, FzQ(self)->min_sim);
142
166
  bptr += strlen(bptr);
@@ -155,77 +179,65 @@ static Query *fuzq_rewrite(Query *self, IndexReader *ir)
155
179
  Query *q;
156
180
  FuzzyQuery *fuzq = FzQ(self);
157
181
 
182
+ int pre_len = fuzq->pre_len;
183
+ char *prefix = NULL;
158
184
  const char *term = fuzq->term;
159
- const char *field = fuzq->field;
160
- const int field_num = fis_get_field_num(ir->fis, field);
185
+ const int field_num = fis_get_field_num(ir->fis, fuzq->field);
186
+ TermEnum *te;
161
187
 
162
188
  if (field_num < 0) {
163
- q = bq_new(true);
189
+ return bq_new(true);
164
190
  }
165
- else if (fuzq->pre_len >= (int)strlen(term)) {
166
- q = tq_new(field, term);
191
+ if (fuzq->pre_len >= (int)strlen(term)) {
192
+ return tq_new(fuzq->field, term);
167
193
  }
168
- else {
169
- TermEnum *te;
170
- char *prefix = NULL;
171
- int pre_len = fuzq->pre_len;
172
-
173
- q = multi_tq_new_conf(fuzq->field, MTQMaxTerms(self), fuzq->min_sim);
174
-
175
- if (pre_len > 0) {
176
- prefix = ALLOC_N(char, pre_len + 1);
177
- strncpy(prefix, term, pre_len);
178
- prefix[pre_len] = '\0';
179
- te = ir->terms_from(ir, field_num, prefix);
180
- }
181
- else {
182
- te = ir->terms(ir, field_num);
183
- }
184
194
 
185
- fuzq->scale_factor = (float)(1.0 / (1.0 - fuzq->min_sim));
186
- fuzq->text = term + pre_len;
187
- fuzq->text_len = (int)strlen(fuzq->text);
188
- fuzq->da = REALLOC_N(fuzq->da, int, fuzq->text_len * 2 + 2);
189
- fuzq_initialize_max_distances(fuzq);
190
-
191
- if (te) {
192
- const char *curr_term = te->curr_term;
193
- const char *curr_suffix = curr_term + pre_len;
194
- float score = 0.0;
195
+ q = multi_tq_new_conf(fuzq->field, MTQMaxTerms(self), fuzq->min_sim);
196
+ if (pre_len > 0) {
197
+ prefix = ALLOC_N(char, pre_len + 1);
198
+ strncpy(prefix, term, pre_len);
199
+ prefix[pre_len] = '\0';
200
+ te = ir->terms_from(ir, field_num, prefix);
201
+ }
202
+ else {
203
+ te = ir->terms(ir, field_num);
204
+ }
195
205
 
206
+ assert(NULL != te);
196
207
 
197
- do {
198
- if ((prefix && strncmp(curr_term, prefix, pre_len) != 0)) {
199
- break;
200
- }
208
+ fuzq->scale_factor = (float)(1.0 / (1.0 - fuzq->min_sim));
209
+ fuzq->text = term + pre_len;
210
+ fuzq->text_len = (int)strlen(fuzq->text);
211
+ fuzq->da = REALLOC_N(fuzq->da, int, fuzq->text_len * 2 + 2);
212
+ fuzq_initialize_max_distances(fuzq);
201
213
 
202
- score = fuzq_score(fuzq, curr_suffix);
203
- /*
204
- printf("%s:%s:%f < %f\n", curr_term, term, score, min_score);
205
- */
206
- multi_tq_add_term_boost(q, curr_term, score);
214
+ do {
215
+ const char *curr_term = te->curr_term;
216
+ const char *curr_suffix = curr_term + pre_len;
217
+ float score = 0.0;
207
218
 
208
- } while (te->next(te) != NULL);
219
+ if (prefix && strncmp(curr_term, prefix, pre_len) != 0)
220
+ break;
209
221
 
210
- te->close(te);
211
- }
212
- free(prefix);
213
- }
222
+ score = fuzq_score(fuzq, curr_suffix);
223
+ multi_tq_add_term_boost(q, curr_term, score);
224
+ } while (te->next(te) != NULL);
214
225
 
226
+ te->close(te);
227
+ if (prefix) free(prefix);
215
228
  return q;
216
229
  }
217
230
 
218
231
  static void fuzq_destroy(Query *self)
219
232
  {
220
233
  free(FzQ(self)->term);
221
- free(FzQ(self)->field);
222
234
  free(FzQ(self)->da);
223
235
  q_destroy_i(self);
224
236
  }
225
237
 
226
238
  static unsigned long fuzq_hash(Query *self)
227
239
  {
228
- return str_hash(FzQ(self)->term) ^ str_hash(FzQ(self)->field)
240
+ return str_hash(FzQ(self)->term) ^ sym_hash(FzQ(self)->field)
229
241
  ^ float2int(FzQ(self)->min_sim) ^ FzQ(self)->pre_len;
230
242
  }
231
243
 
@@ -235,17 +247,17 @@ static int fuzq_eq(Query *self, Query *o)
235
247
  FuzzyQuery *fq2 = FzQ(o);
236
248
 
237
249
  return (strcmp(fq1->term, fq2->term) == 0)
238
- && (strcmp(fq1->field, fq2->field) == 0)
250
+ && (fq1->field == fq2->field)
239
251
  && (fq1->pre_len == fq2->pre_len)
240
252
  && (fq1->min_sim == fq2->min_sim);
241
253
  }
242
254
 
243
- Query *fuzq_new_conf(const char *field, const char *term,
255
+ Query *fuzq_new_conf(Symbol field, const char *term,
244
256
  float min_sim, int pre_len, int max_terms)
245
257
  {
246
258
  Query *self = q_new(FuzzyQuery);
247
259
 
248
- FzQ(self)->field = estrdup(field);
260
+ FzQ(self)->field = field;
249
261
  FzQ(self)->term = estrdup(term);
250
262
  FzQ(self)->pre_len = pre_len ? pre_len : DEF_PRE_LEN;
251
263
  FzQ(self)->min_sim = min_sim ? min_sim : DEF_MIN_SIM;
@@ -262,7 +274,7 @@ Query *fuzq_new_conf(const char *field, const char *term,
262
274
  return self;
263
275
  }
264
276
 
265
- Query *fuzq_new(const char *field, const char *term)
277
+ Query *fuzq_new(Symbol field, const char *term)
266
278
  {
267
279
  return fuzq_new_conf(field, term, 0.0f, 0, 0);
268
280
  }
@@ -1,5 +1,6 @@
1
1
  #include "search.h"
2
2
  #include <string.h>
3
+ #include "internal.h"
3
4
 
4
5
  /***************************************************************************
5
6
  *
@@ -110,9 +111,9 @@ static Weight *maw_new(Query *query, Searcher *searcher)
110
111
  *
111
112
  ***************************************************************************/
112
113
 
113
- char *maq_to_s(Query *self, const char *field)
114
+ static char *maq_to_s(Query *self, Symbol default_field)
114
115
  {
115
- (void)field;
116
+ (void)default_field;
116
117
  if (self->boost == 1.0) {
117
118
  return estrdup("*");
118
119
  } else {
@@ -1,7 +1,8 @@
1
1
  #include <string.h>
2
2
  #include "search.h"
3
- #include "priorityqueue.h"
4
3
  #include "helper.h"
4
+ #include "symbol.h"
5
+ #include "internal.h"
5
6
 
6
7
  #define MTQ(query) ((MultiTermQuery *)(query))
7
8
 
@@ -141,7 +142,7 @@ static TermDocEnumWrapper *tdew_new(const char *term, TermDocEnum *tde,
141
142
  typedef struct MultiTermScorer
142
143
  {
143
144
  Scorer super;
144
- const char *field;
145
+ Symbol field;
145
146
  uchar *norms;
146
147
  Weight *weight;
147
148
  TermDocEnumWrapper **tdew_a;
@@ -176,7 +177,7 @@ static bool multi_tsc_next(Scorer *self)
176
177
  }
177
178
  mtsc->tdew_pq = tdew_pq;
178
179
  }
179
-
180
+
180
181
  tdew = (TermDocEnumWrapper *)pq_top(tdew_pq);
181
182
  if (tdew == NULL) {
182
183
  return false;
@@ -259,7 +260,7 @@ static Explanation *multi_tsc_explain(Scorer *self, int doc_num)
259
260
  expl_add_detail(expl,
260
261
  expl_new(sim_tf(self->similarity, (float)freq) * tdew->boost,
261
262
  "tf(term_freq(%s:%s)=%d)^%f",
262
- mtsc->field, tdew->term, freq, tdew->boost));
263
+ S(mtsc->field), tdew->term, freq, tdew->boost));
263
264
 
264
265
  total_score += sim_tf(self->similarity, (float)freq) * tdew->boost;
265
266
 
@@ -294,7 +295,7 @@ static void multi_tsc_destroy(Scorer *self)
294
295
  scorer_destroy_i(self);
295
296
  }
296
297
 
297
- static Scorer *multi_tsc_new(Weight *weight, const char *field,
298
+ static Scorer *multi_tsc_new(Weight *weight, Symbol field,
298
299
  TermDocEnumWrapper **tdew_a, int tdew_cnt,
299
300
  uchar *norms)
300
301
  {
@@ -367,7 +368,7 @@ static Scorer *multi_tw_scorer(Weight *self, IndexReader *ir)
367
368
  return multi_tsc;
368
369
  }
369
370
 
370
- Explanation *multi_tw_explain(Weight *self, IndexReader *ir, int doc_num)
371
+ static Explanation *multi_tw_explain(Weight *self, IndexReader *ir, int doc_num)
371
372
  {
372
373
  Explanation *expl;
373
374
  Explanation *idf_expl1;
@@ -383,19 +384,20 @@ Explanation *multi_tw_explain(Weight *self, IndexReader *ir, int doc_num)
383
384
 
384
385
  char *query_str;
385
386
  MultiTermQuery *mtq = MTQ(self->query);
386
- const char *field = mtq->field;
387
+ const char *field = S(mtq->field);
387
388
  PriorityQueue *bt_pq = mtq->boosted_terms;
388
389
  int i;
389
390
  int total_doc_freqs = 0;
390
391
  char *doc_freqs = NULL;
391
392
  size_t len = 0, pos = 0;
392
- const int field_num = fis_get_field_num(ir->fis, field);
393
+ const int field_num = fis_get_field_num(ir->fis, mtq->field);
393
394
 
394
395
  if (field_num < 0) {
395
- return expl_new(0.0, "field \"%s\" does not exist in the index", field);
396
+ return expl_new(0.0, "field \"%s\" does not exist in the index",
397
+ field);
396
398
  }
397
-
398
- query_str = self->query->to_s(self->query, "");
399
+
400
+ query_str = self->query->to_s(self->query, NULL);
399
401
 
400
402
  expl = expl_new(0.0, "weight(%s in %d), product of:", query_str, doc_num);
401
403
 
@@ -407,8 +409,7 @@ Explanation *multi_tw_explain(Weight *self, IndexReader *ir, int doc_num)
407
409
  for (i = bt_pq->size; i > 0; i--) {
408
410
  char *term = ((BoostedTerm *)bt_pq->heap[i])->term;
409
411
  int doc_freq = ir->doc_freq(ir, field_num, term);
410
- sprintf(doc_freqs + pos, "(%s=%d) + ", term, doc_freq);
411
- pos += strlen(doc_freqs + pos);
412
+ pos += sprintf(doc_freqs + pos, "(%s=%d) + ", term, doc_freq);
412
413
  total_doc_freqs += doc_freq;
413
414
  }
414
415
  pos -= 2; /* remove " + " from the end */
@@ -476,7 +477,6 @@ static Weight *multi_tw_new(Query *query, Searcher *searcher)
476
477
  int i;
477
478
  int doc_freq = 0;
478
479
  Weight *self = w_new(Weight, query);
479
- const char *field = MTQ(query)->field;
480
480
  PriorityQueue *bt_pq = MTQ(query)->boosted_terms;
481
481
 
482
482
  self->scorer = &multi_tw_scorer;
@@ -488,7 +488,7 @@ static Weight *multi_tw_new(Query *query, Searcher *searcher)
488
488
  self->idf = 0.0;
489
489
 
490
490
  for (i = bt_pq->size; i > 0; i--) {
491
- doc_freq += searcher->doc_freq(searcher, field,
491
+ doc_freq += searcher->doc_freq(searcher, MTQ(query)->field,
492
492
  ((BoostedTerm *)bt_pq->heap[i])->term);
493
493
  }
494
494
  self->idf += sim_idf(self->similarity, doc_freq,
@@ -502,13 +502,13 @@ static Weight *multi_tw_new(Query *query, Searcher *searcher)
502
502
  * MultiTermQuery
503
503
  ***************************************************************************/
504
504
 
505
- static char *multi_tq_to_s(Query *self, const char *curr_field)
505
+ static char *multi_tq_to_s(Query *self, Symbol default_field)
506
506
  {
507
507
  int i;
508
508
  PriorityQueue *boosted_terms = MTQ(self)->boosted_terms, *bt_pq_clone;
509
509
  BoostedTerm *bt;
510
510
  char *buffer, *bptr;
511
- char *field = MTQ(self)->field;
511
+ const char *field = S(MTQ(self)->field);
512
512
  int flen = (int)strlen(field);
513
513
  int tlen = 0;
514
514
 
@@ -519,16 +519,14 @@ static char *multi_tq_to_s(Query *self, const char *curr_field)
519
519
 
520
520
  bptr = buffer = ALLOC_N(char, tlen + flen + 35);
521
521
 
522
- if (strcmp(curr_field, field) != 0) {
523
- sprintf(bptr, "%s:", field);
524
- bptr += flen + 1;
522
+ if (default_field != MTQ(self)->field) {
523
+ bptr += sprintf(bptr, "%s:", field);
525
524
  }
526
525
 
527
526
  *(bptr++) = '"';
528
527
  bt_pq_clone = pq_clone(boosted_terms);
529
528
  while ((bt = (BoostedTerm *)pq_pop(bt_pq_clone)) != NULL) {
530
- sprintf(bptr, "%s", bt->term);
531
- bptr += (int)strlen(bptr);
529
+ bptr += sprintf(bptr, "%s", bt->term);
532
530
 
533
531
  if (bt->boost != 1.0) {
534
532
  *bptr = '^';
@@ -545,7 +543,7 @@ static char *multi_tq_to_s(Query *self, const char *curr_field)
545
543
  }
546
544
  bptr[-1] = '"'; /* delete last '|' char */
547
545
  bptr[ 0] = '\0';
548
-
546
+
549
547
  if (self->boost != 1.0) {
550
548
  *bptr = '^';
551
549
  dbl_to_s(++bptr, self->boost);
@@ -556,7 +554,6 @@ static char *multi_tq_to_s(Query *self, const char *curr_field)
556
554
 
557
555
  static void multi_tq_destroy_i(Query *self)
558
556
  {
559
- free(MTQ(self)->field);
560
557
  pq_destroy(MTQ(self)->boosted_terms);
561
558
  q_destroy_i(self);
562
559
  }
@@ -564,18 +561,17 @@ static void multi_tq_destroy_i(Query *self)
564
561
  static void multi_tq_extract_terms(Query *self, HashSet *terms)
565
562
  {
566
563
  int i;
567
- char *field = MTQ(self)->field;
568
564
  PriorityQueue *boosted_terms = MTQ(self)->boosted_terms;
569
565
  for (i = boosted_terms->size; i > 0; i--) {
570
566
  BoostedTerm *bt = (BoostedTerm *)boosted_terms->heap[i];
571
- hs_add(terms, term_new(field, bt->term));
567
+ hs_add(terms, term_new(MTQ(self)->field, bt->term));
572
568
  }
573
569
  }
574
570
 
575
571
  static unsigned long multi_tq_hash(Query *self)
576
572
  {
577
573
  int i;
578
- unsigned long hash = str_hash(MTQ(self)->field);
574
+ unsigned long hash = sym_hash(MTQ(self)->field);
579
575
  PriorityQueue *boosted_terms = MTQ(self)->boosted_terms;
580
576
  for (i = boosted_terms->size; i > 0; i--) {
581
577
  BoostedTerm *bt = (BoostedTerm *)boosted_terms->heap[i];
@@ -590,7 +586,7 @@ static int multi_tq_eq(Query *self, Query *o)
590
586
  PriorityQueue *boosted_terms1 = MTQ(self)->boosted_terms;
591
587
  PriorityQueue *boosted_terms2 = MTQ(o)->boosted_terms;
592
588
 
593
- if (strcmp(MTQ(self)->field, MTQ(o)->field) != 0
589
+ if ((MTQ(self)->field != MTQ(o)->field)
594
590
  || boosted_terms1->size != boosted_terms2->size) {
595
591
  return false;
596
592
  }
@@ -607,7 +603,7 @@ static int multi_tq_eq(Query *self, Query *o)
607
603
  static MatchVector *multi_tq_get_matchv_i(Query *self, MatchVector *mv,
608
604
  TermVector *tv)
609
605
  {
610
- if (strcmp(tv->field, MTQ(self)->field) == 0) {
606
+ if (tv->field == MTQ(self)->field) {
611
607
  int i;
612
608
  PriorityQueue *boosted_terms = MTQ(self)->boosted_terms;
613
609
  for (i = boosted_terms->size; i > 0; i--) {
@@ -625,7 +621,7 @@ static MatchVector *multi_tq_get_matchv_i(Query *self, MatchVector *mv,
625
621
  return mv;
626
622
  }
627
623
 
628
- Query *multi_tq_new_conf(const char *field, int max_terms, float min_boost)
624
+ Query *multi_tq_new_conf(Symbol field, int max_terms, float min_boost)
629
625
  {
630
626
  Query *self;
631
627
 
@@ -636,7 +632,7 @@ Query *multi_tq_new_conf(const char *field, int max_terms, float min_boost)
636
632
 
637
633
  self = q_new(MultiTermQuery);
638
634
 
639
- MTQ(self)->field = estrdup(field);
635
+ MTQ(self)->field = field;
640
636
  MTQ(self)->boosted_terms = pq_new(max_terms,
641
637
  (lt_ft)&boosted_term_less_than,
642
638
  (free_ft)&boosted_term_destroy);
@@ -654,7 +650,7 @@ Query *multi_tq_new_conf(const char *field, int max_terms, float min_boost)
654
650
  return self;
655
651
  }
656
652
 
657
- Query *multi_tq_new(const char *field)
653
+ Query *multi_tq_new(Symbol field)
658
654
  {
659
655
  return multi_tq_new_conf(field, MULTI_TERM_QUERY_MAX_TERMS, 0.0);
660
656
  }