jk-ferret 0.11.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (228) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +90 -0
  4. data/RELEASE_CHANGES +137 -0
  5. data/RELEASE_NOTES +60 -0
  6. data/Rakefile +443 -0
  7. data/TODO +109 -0
  8. data/TUTORIAL +231 -0
  9. data/bin/ferret-browser +79 -0
  10. data/ext/BZLIB_blocksort.c +1094 -0
  11. data/ext/BZLIB_bzlib.c +1578 -0
  12. data/ext/BZLIB_compress.c +672 -0
  13. data/ext/BZLIB_crctable.c +104 -0
  14. data/ext/BZLIB_decompress.c +626 -0
  15. data/ext/BZLIB_huffman.c +205 -0
  16. data/ext/BZLIB_randtable.c +84 -0
  17. data/ext/STEMMER_api.c +66 -0
  18. data/ext/STEMMER_libstemmer.c +93 -0
  19. data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
  20. data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
  21. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  22. data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
  25. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  26. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  27. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
  29. data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
  30. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  31. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  32. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  33. data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
  34. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  35. data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
  36. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  37. data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
  38. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  39. data/ext/STEMMER_stem_UTF_8_german.c +509 -0
  40. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  41. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  42. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  43. data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
  44. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  45. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  46. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  47. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  48. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  49. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  50. data/ext/STEMMER_utilities.c +478 -0
  51. data/ext/analysis.c +1710 -0
  52. data/ext/analysis.h +266 -0
  53. data/ext/api.h +26 -0
  54. data/ext/array.c +125 -0
  55. data/ext/array.h +62 -0
  56. data/ext/bitvector.c +96 -0
  57. data/ext/bitvector.h +594 -0
  58. data/ext/bzlib.h +282 -0
  59. data/ext/bzlib_private.h +503 -0
  60. data/ext/compound_io.c +384 -0
  61. data/ext/config.h +52 -0
  62. data/ext/document.c +159 -0
  63. data/ext/document.h +63 -0
  64. data/ext/except.c +102 -0
  65. data/ext/except.h +176 -0
  66. data/ext/extconf.rb +15 -0
  67. data/ext/ferret.c +416 -0
  68. data/ext/ferret.h +94 -0
  69. data/ext/field_index.c +262 -0
  70. data/ext/field_index.h +52 -0
  71. data/ext/filter.c +157 -0
  72. data/ext/fs_store.c +493 -0
  73. data/ext/global.c +458 -0
  74. data/ext/global.h +302 -0
  75. data/ext/hash.c +524 -0
  76. data/ext/hash.h +515 -0
  77. data/ext/hashset.c +192 -0
  78. data/ext/hashset.h +215 -0
  79. data/ext/header.h +58 -0
  80. data/ext/helper.c +63 -0
  81. data/ext/helper.h +21 -0
  82. data/ext/index.c +6804 -0
  83. data/ext/index.h +935 -0
  84. data/ext/internal.h +1019 -0
  85. data/ext/lang.c +10 -0
  86. data/ext/lang.h +68 -0
  87. data/ext/libstemmer.h +79 -0
  88. data/ext/mempool.c +88 -0
  89. data/ext/mempool.h +43 -0
  90. data/ext/modules.h +190 -0
  91. data/ext/multimapper.c +351 -0
  92. data/ext/multimapper.h +60 -0
  93. data/ext/posh.c +1006 -0
  94. data/ext/posh.h +973 -0
  95. data/ext/priorityqueue.c +149 -0
  96. data/ext/priorityqueue.h +155 -0
  97. data/ext/q_boolean.c +1621 -0
  98. data/ext/q_const_score.c +162 -0
  99. data/ext/q_filtered_query.c +212 -0
  100. data/ext/q_fuzzy.c +280 -0
  101. data/ext/q_match_all.c +149 -0
  102. data/ext/q_multi_term.c +673 -0
  103. data/ext/q_parser.c +3103 -0
  104. data/ext/q_phrase.c +1206 -0
  105. data/ext/q_prefix.c +98 -0
  106. data/ext/q_range.c +682 -0
  107. data/ext/q_span.c +2390 -0
  108. data/ext/q_term.c +337 -0
  109. data/ext/q_wildcard.c +167 -0
  110. data/ext/r_analysis.c +2626 -0
  111. data/ext/r_index.c +3468 -0
  112. data/ext/r_qparser.c +635 -0
  113. data/ext/r_search.c +4490 -0
  114. data/ext/r_store.c +513 -0
  115. data/ext/r_utils.c +1131 -0
  116. data/ext/ram_store.c +476 -0
  117. data/ext/scanner.c +895 -0
  118. data/ext/scanner.h +36 -0
  119. data/ext/scanner_mb.c +6701 -0
  120. data/ext/scanner_utf8.c +4415 -0
  121. data/ext/search.c +1864 -0
  122. data/ext/search.h +953 -0
  123. data/ext/similarity.c +151 -0
  124. data/ext/similarity.h +89 -0
  125. data/ext/sort.c +786 -0
  126. data/ext/stem_ISO_8859_1_danish.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  128. data/ext/stem_ISO_8859_1_english.h +16 -0
  129. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  130. data/ext/stem_ISO_8859_1_french.h +16 -0
  131. data/ext/stem_ISO_8859_1_german.h +16 -0
  132. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  133. data/ext/stem_ISO_8859_1_italian.h +16 -0
  134. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  135. data/ext/stem_ISO_8859_1_porter.h +16 -0
  136. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  137. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  138. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  139. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  140. data/ext/stem_KOI8_R_russian.h +16 -0
  141. data/ext/stem_UTF_8_danish.h +16 -0
  142. data/ext/stem_UTF_8_dutch.h +16 -0
  143. data/ext/stem_UTF_8_english.h +16 -0
  144. data/ext/stem_UTF_8_finnish.h +16 -0
  145. data/ext/stem_UTF_8_french.h +16 -0
  146. data/ext/stem_UTF_8_german.h +16 -0
  147. data/ext/stem_UTF_8_hungarian.h +16 -0
  148. data/ext/stem_UTF_8_italian.h +16 -0
  149. data/ext/stem_UTF_8_norwegian.h +16 -0
  150. data/ext/stem_UTF_8_porter.h +16 -0
  151. data/ext/stem_UTF_8_portuguese.h +16 -0
  152. data/ext/stem_UTF_8_romanian.h +16 -0
  153. data/ext/stem_UTF_8_russian.h +16 -0
  154. data/ext/stem_UTF_8_spanish.h +16 -0
  155. data/ext/stem_UTF_8_swedish.h +16 -0
  156. data/ext/stem_UTF_8_turkish.h +16 -0
  157. data/ext/stopwords.c +410 -0
  158. data/ext/store.c +698 -0
  159. data/ext/store.h +799 -0
  160. data/ext/symbol.c +10 -0
  161. data/ext/symbol.h +23 -0
  162. data/ext/term_vectors.c +73 -0
  163. data/ext/threading.h +31 -0
  164. data/ext/win32.h +62 -0
  165. data/lib/ferret.rb +30 -0
  166. data/lib/ferret/browser.rb +246 -0
  167. data/lib/ferret/browser/s/global.js +192 -0
  168. data/lib/ferret/browser/s/style.css +148 -0
  169. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  170. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  171. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  172. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  173. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  174. data/lib/ferret/browser/views/layout.rhtml +22 -0
  175. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  176. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  177. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  178. data/lib/ferret/browser/webrick.rb +14 -0
  179. data/lib/ferret/document.rb +130 -0
  180. data/lib/ferret/field_infos.rb +44 -0
  181. data/lib/ferret/field_symbol.rb +87 -0
  182. data/lib/ferret/index.rb +973 -0
  183. data/lib/ferret/number_tools.rb +157 -0
  184. data/lib/ferret/version.rb +3 -0
  185. data/setup.rb +1555 -0
  186. data/test/long_running/largefile/tc_largefile.rb +46 -0
  187. data/test/test_all.rb +5 -0
  188. data/test/test_helper.rb +29 -0
  189. data/test/test_installed.rb +1 -0
  190. data/test/threading/number_to_spoken.rb +132 -0
  191. data/test/threading/thread_safety_index_test.rb +88 -0
  192. data/test/threading/thread_safety_read_write_test.rb +73 -0
  193. data/test/threading/thread_safety_test.rb +133 -0
  194. data/test/unit/analysis/tc_analyzer.rb +550 -0
  195. data/test/unit/analysis/tc_token_stream.rb +653 -0
  196. data/test/unit/index/tc_index.rb +867 -0
  197. data/test/unit/index/tc_index_reader.rb +699 -0
  198. data/test/unit/index/tc_index_writer.rb +447 -0
  199. data/test/unit/index/th_doc.rb +332 -0
  200. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  201. data/test/unit/search/tc_filter.rb +156 -0
  202. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  203. data/test/unit/search/tc_index_searcher.rb +67 -0
  204. data/test/unit/search/tc_multi_searcher.rb +128 -0
  205. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  206. data/test/unit/search/tc_search_and_sort.rb +179 -0
  207. data/test/unit/search/tc_sort.rb +49 -0
  208. data/test/unit/search/tc_sort_field.rb +27 -0
  209. data/test/unit/search/tc_spans.rb +190 -0
  210. data/test/unit/search/tm_searcher.rb +436 -0
  211. data/test/unit/store/tc_fs_store.rb +115 -0
  212. data/test/unit/store/tc_ram_store.rb +35 -0
  213. data/test/unit/store/tm_store.rb +34 -0
  214. data/test/unit/store/tm_store_lock.rb +68 -0
  215. data/test/unit/tc_document.rb +81 -0
  216. data/test/unit/tc_field_symbol.rb +26 -0
  217. data/test/unit/ts_analysis.rb +2 -0
  218. data/test/unit/ts_index.rb +2 -0
  219. data/test/unit/ts_largefile.rb +4 -0
  220. data/test/unit/ts_query_parser.rb +2 -0
  221. data/test/unit/ts_search.rb +2 -0
  222. data/test/unit/ts_store.rb +2 -0
  223. data/test/unit/ts_utils.rb +2 -0
  224. data/test/unit/utils/tc_bit_vector.rb +295 -0
  225. data/test/unit/utils/tc_number_tools.rb +117 -0
  226. data/test/unit/utils/tc_priority_queue.rb +106 -0
  227. data/test/utils/content_generator.rb +226 -0
  228. metadata +319 -0
data/ext/q_phrase.c ADDED
@@ -0,0 +1,1206 @@
1
+ #include <string.h>
2
+ #include <limits.h>
3
+ #include "search.h"
4
+ #include "array.h"
5
+ #include "symbol.h"
6
+ #include "internal.h"
7
+
8
+ #define PhQ(query) ((PhraseQuery *)(query))
9
+
10
+ /**
11
+ * Use to sort the phrase positions into positional order. For phrase
12
+ * positions matching at the same position (a very unusual case) we order by
13
+ * first terms. The only real reason for the sorting by first terms is to get
14
+ * consistant order of positions when testing. Functionally it makes no
15
+ * difference.
16
+ */
17
+ static int phrase_pos_cmp(const void *p1, const void *p2)
18
+ {
19
+ int pos1 = ((PhrasePosition *)p1)->pos;
20
+ int pos2 = ((PhrasePosition *)p2)->pos;
21
+ if (pos1 > pos2) {
22
+ return 1;
23
+ }
24
+ if (pos1 < pos2) {
25
+ return -1;
26
+ }
27
+ return strcmp(((PhrasePosition *)p1)->terms[0],
28
+ ((PhrasePosition *)p2)->terms[0]);
29
+ }
30
+
31
+
32
+ /***************************************************************************
33
+ *
34
+ * PhraseScorer
35
+ *
36
+ ***************************************************************************/
37
+
38
+ /***************************************************************************
39
+ * PhPos
40
+ ***************************************************************************/
41
+
42
+ #define PP(p) ((PhPos *)(p))
43
+ typedef struct PhPos
44
+ {
45
+ TermDocEnum *tpe;
46
+ int offset;
47
+ int count;
48
+ int doc;
49
+ int position;
50
+ } PhPos;
51
+
52
+ static bool pp_next(PhPos *self)
53
+ {
54
+ TermDocEnum *tpe = self->tpe;
55
+ if (!tpe->next(tpe)) {
56
+ tpe->close(tpe); /* close stream */
57
+ self->tpe = NULL;
58
+ self->doc = INT_MAX; /* sentinel value */
59
+ return false;
60
+ }
61
+ self->doc = tpe->doc_num(tpe);
62
+ self->position = 0;
63
+ return true;
64
+ }
65
+
66
+ static bool pp_skip_to(PhPos *self, int doc_num)
67
+ {
68
+ TermDocEnum *tpe = self->tpe;
69
+ if (!tpe) {
70
+ return false;
71
+ }
72
+
73
+ if (!tpe->skip_to(tpe, doc_num)) {
74
+ tpe->close(tpe); /* close stream */
75
+ self->tpe = NULL;
76
+ self->doc = INT_MAX; /* sentinel value */
77
+ return false;
78
+ }
79
+ self->doc = tpe->doc_num(tpe);
80
+ self->position = 0;
81
+ return true;
82
+ }
83
+
84
+ static bool pp_next_position(PhPos *self)
85
+ {
86
+ TermDocEnum *tpe = self->tpe;
87
+ self->count--;
88
+ if (self->count >= 0) { /* read subsequent pos's */
89
+ self->position = tpe->next_position(tpe) - self->offset;
90
+ return true;
91
+ }
92
+ else {
93
+ return false;
94
+ }
95
+ }
96
+
97
+ static bool pp_first_position(PhPos *self)
98
+ {
99
+ TermDocEnum *tpe = self->tpe;
100
+ self->count = tpe->freq(tpe); /* read first pos */
101
+ return pp_next_position(self);
102
+ }
103
+
104
+ /*
105
+ static char *pp_to_s(PhPos *self)
106
+ {
107
+ return strfmt("pp->(doc => %d, position => %d)", self->doc, self->position);
108
+ }
109
+ */
110
+
111
+ #define PP_pp(p) (*(PhPos **)p)
112
+ static int pp_cmp(const void *const p1, const void *const p2)
113
+ {
114
+ int cmp = PP_pp(p1)->doc - PP_pp(p2)->doc;
115
+ if (cmp == 0) {
116
+ return PP_pp(p1)->position - PP_pp(p2)->position;
117
+ }
118
+ else {
119
+ return cmp;
120
+ }
121
+ }
122
+
123
+ static int pp_pos_cmp(const void *const p1, const void *const p2)
124
+ {
125
+ return PP_pp(p1)->position - PP_pp(p2)->position;
126
+ }
127
+
128
+ static bool pp_less_than(const PhPos *pp1, const PhPos *pp2)
129
+ {
130
+ if (pp1->position == pp2->position) {
131
+ return pp1->offset < pp2->offset;
132
+ }
133
+ else {
134
+ return pp1->position < pp2->position;
135
+ }
136
+ }
137
+
138
+ static void pp_destroy(PhPos *pp)
139
+ {
140
+ if (pp->tpe) {
141
+ pp->tpe->close(pp->tpe);
142
+ }
143
+ free(pp);
144
+ }
145
+
146
+ static PhPos *pp_new(TermDocEnum *tpe, int offset)
147
+ {
148
+ PhPos *self = ALLOC(PhPos);
149
+
150
+ self->tpe = tpe;
151
+ self->count = self->doc = self->position = -1;
152
+ self->offset = offset;
153
+
154
+ return self;
155
+ }
156
+
157
+ /***************************************************************************
158
+ * PhraseScorer
159
+ ***************************************************************************/
160
+
161
+ #define PhSc(scorer) ((PhraseScorer *)(scorer))
162
+
163
+ typedef struct PhraseScorer
164
+ {
165
+ Scorer super;
166
+ float (*phrase_freq)(Scorer *self);
167
+ float freq;
168
+ uchar *norms;
169
+ float value;
170
+ Weight *weight;
171
+ PhPos **phrase_pos;
172
+ int pp_first_idx;
173
+ int pp_cnt;
174
+ int slop;
175
+ bool first_time : 1;
176
+ bool more : 1;
177
+ bool check_repeats : 1;
178
+ } PhraseScorer;
179
+
180
+ static void phsc_init(PhraseScorer *phsc)
181
+ {
182
+ int i;
183
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
184
+ if (!(phsc->more = pp_next(phsc->phrase_pos[i]))) break;
185
+ }
186
+
187
+ if (phsc->more) {
188
+ qsort(phsc->phrase_pos, phsc->pp_cnt,
189
+ sizeof(PhPos *), &pp_cmp);
190
+ phsc->pp_first_idx = 0;
191
+ }
192
+ }
193
+
194
+ static bool phsc_do_next(Scorer *self)
195
+ {
196
+ PhraseScorer *phsc = PhSc(self);
197
+ const int pp_cnt = phsc->pp_cnt;
198
+ int pp_first_idx = phsc->pp_first_idx;
199
+ PhPos **phrase_positions = phsc->phrase_pos;
200
+
201
+ PhPos *first = phrase_positions[pp_first_idx];
202
+ PhPos *last = phrase_positions[PREV_NUM(pp_first_idx, pp_cnt)];
203
+
204
+ while (phsc->more) {
205
+ /* find doc with all the terms */
206
+ while (phsc->more && first->doc < last->doc) {
207
+ /* skip first upto last */
208
+ phsc->more = pp_skip_to(first, last->doc);
209
+ last = first;
210
+ pp_first_idx = NEXT_NUM(pp_first_idx, pp_cnt);
211
+ first = phrase_positions[pp_first_idx];
212
+ }
213
+
214
+ if (phsc->more) {
215
+ /* pp_first_idx will be used by phrase_freq */
216
+ phsc->pp_first_idx = pp_first_idx;
217
+
218
+ /* found a doc with all of the terms */
219
+ phsc->freq = phsc->phrase_freq(self);
220
+
221
+ if (phsc->freq == 0.0) { /* no match */
222
+ /* continuing search so re-set first and last */
223
+ pp_first_idx = phsc->pp_first_idx;
224
+ first = phrase_positions[pp_first_idx];
225
+ last = phrase_positions[PREV_NUM(pp_first_idx, pp_cnt)];
226
+ phsc->more = pp_next(last); /* trigger further scanning */
227
+ }
228
+ else {
229
+ self->doc = first->doc;
230
+ return true; /* found a match */
231
+ }
232
+
233
+ }
234
+ }
235
+ return false;
236
+ }
237
+
238
+ static float phsc_score(Scorer *self)
239
+ {
240
+ PhraseScorer *phsc = PhSc(self);
241
+ float raw_score = sim_tf(self->similarity, phsc->freq) * phsc->value;
242
+ /* normalize */
243
+ return raw_score * sim_decode_norm(
244
+ self->similarity,
245
+ phsc->norms[self->doc]);
246
+ }
247
+
248
+ static bool phsc_next(Scorer *self)
249
+ {
250
+ PhraseScorer *phsc = PhSc(self);
251
+ if (phsc->first_time) {
252
+ phsc_init(phsc);
253
+ phsc->first_time = false;
254
+ }
255
+ else if (phsc->more) {
256
+ /* trigger further scanning */
257
+ phsc->more = pp_next(
258
+ phsc->phrase_pos[PREV_NUM(phsc->pp_first_idx, phsc->pp_cnt)]);
259
+ }
260
+
261
+ return phsc_do_next(self);
262
+ }
263
+
264
+ static bool phsc_skip_to(Scorer *self, int doc_num)
265
+ {
266
+ PhraseScorer *phsc = PhSc(self);
267
+ int i;
268
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
269
+ if (!(phsc->more = pp_skip_to(phsc->phrase_pos[i], doc_num))) {
270
+ break;
271
+ }
272
+ }
273
+
274
+ if (phsc->more) {
275
+ qsort(phsc->phrase_pos, phsc->pp_cnt,
276
+ sizeof(PhPos *), &pp_cmp);
277
+ phsc->pp_first_idx = 0;
278
+ }
279
+ return phsc_do_next(self);
280
+ }
281
+
282
+ static Explanation *phsc_explain(Scorer *self, int doc_num)
283
+ {
284
+ PhraseScorer *phsc = PhSc(self);
285
+ float phrase_freq;
286
+
287
+ phsc_skip_to(self, doc_num);
288
+
289
+ phrase_freq = (self->doc == doc_num) ? phsc->freq : 0.0f;
290
+ return expl_new(sim_tf(self->similarity, phrase_freq),
291
+ "tf(phrase_freq=%f)", phrase_freq);
292
+ }
293
+
294
+ static void phsc_destroy(Scorer *self)
295
+ {
296
+ PhraseScorer *phsc = PhSc(self);
297
+ int i;
298
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
299
+ pp_destroy(phsc->phrase_pos[i]);
300
+ }
301
+ free(phsc->phrase_pos);
302
+ scorer_destroy_i(self);
303
+ }
304
+
305
+ static Scorer *phsc_new(Weight *weight,
306
+ TermDocEnum **term_pos_enum,
307
+ PhrasePosition *positions, int pos_cnt,
308
+ Similarity *similarity,
309
+ uchar *norms,
310
+ int slop)
311
+ {
312
+ int i;
313
+ Scorer *self = scorer_new(PhraseScorer, similarity);
314
+ HashSet *term_set = NULL;
315
+
316
+
317
+ PhSc(self)->weight = weight;
318
+ PhSc(self)->norms = norms;
319
+ PhSc(self)->value = weight->value;
320
+ PhSc(self)->phrase_pos = ALLOC_N(PhPos *, pos_cnt);
321
+ PhSc(self)->pp_first_idx = 0;
322
+ PhSc(self)->pp_cnt = pos_cnt;
323
+ PhSc(self)->slop = slop;
324
+ PhSc(self)->first_time = true;
325
+ PhSc(self)->more = true;
326
+ PhSc(self)->check_repeats = false;
327
+
328
+ if (slop) {
329
+ term_set = hs_new_str((free_ft)NULL);
330
+ }
331
+ for (i = 0; i < pos_cnt; i++) {
332
+ /* check for repeats */
333
+ if (slop && !PhSc(self)->check_repeats) {
334
+ char **terms = positions[i].terms;
335
+ const int t_cnt = ary_size(terms);
336
+ int j;
337
+ for (j = 0; j < t_cnt; j++) {
338
+ if (hs_add(term_set, terms[j])) {
339
+ PhSc(self)->check_repeats = true;
340
+ goto repeat_check_done;
341
+ }
342
+ }
343
+ }
344
+ repeat_check_done:
345
+ PhSc(self)->phrase_pos[i] = pp_new(term_pos_enum[i], positions[i].pos);
346
+ }
347
+
348
+ if (slop) {
349
+ hs_destroy(term_set);
350
+ }
351
+
352
+ self->score = &phsc_score;
353
+ self->next = &phsc_next;
354
+ self->skip_to = &phsc_skip_to;
355
+ self->explain = &phsc_explain;
356
+ self->destroy = &phsc_destroy;
357
+
358
+ return self;
359
+ }
360
+
361
+ /***************************************************************************
362
+ * ExactPhraseScorer
363
+ ***************************************************************************/
364
+
365
+ static float ephsc_phrase_freq(Scorer *self)
366
+ {
367
+ PhraseScorer *phsc = PhSc(self);
368
+ int i;
369
+ int pp_first_idx = 0;
370
+ const int pp_cnt = phsc->pp_cnt;
371
+ float freq = 0.0;
372
+ PhPos **phrase_positions = phsc->phrase_pos;
373
+ PhPos *first;
374
+ PhPos *last;
375
+
376
+ for (i = 0; i < pp_cnt; i++) {
377
+ pp_first_position(phrase_positions[i]);
378
+ }
379
+ qsort(phrase_positions, pp_cnt, sizeof(PhPos *), &pp_pos_cmp);
380
+
381
+ first = phrase_positions[0];
382
+ last = phrase_positions[pp_cnt - 1];
383
+
384
+ /* scan to position with all terms */
385
+ do {
386
+ /* scan forward in first */
387
+ while (first->position < last->position) {
388
+ do {
389
+ if (! pp_next_position(first)) {
390
+ /* maintain first position */
391
+ phsc->pp_first_idx = pp_first_idx;
392
+ return freq;
393
+ }
394
+ } while (first->position < last->position);
395
+ last = first;
396
+ pp_first_idx = NEXT_NUM(pp_first_idx, pp_cnt);
397
+ first = phrase_positions[pp_first_idx];
398
+ }
399
+ freq += 1.0; /* all equal: a match */
400
+ } while (pp_next_position(last));
401
+
402
+ /* maintain first position */
403
+ phsc->pp_first_idx = pp_first_idx;
404
+ return freq;
405
+ }
406
+
407
+ static Scorer *exact_phrase_scorer_new(Weight *weight,
408
+ TermDocEnum **term_pos_enum,
409
+ PhrasePosition *positions, int pp_cnt,
410
+ Similarity *similarity, uchar *norms)
411
+ {
412
+ Scorer *self = phsc_new(weight,
413
+ term_pos_enum,
414
+ positions,
415
+ pp_cnt,
416
+ similarity,
417
+ norms,
418
+ 0);
419
+
420
+ PhSc(self)->phrase_freq = &ephsc_phrase_freq;
421
+ return self;
422
+ }
423
+
424
+ /***************************************************************************
425
+ * SloppyPhraseScorer
426
+ ***************************************************************************/
427
+
428
+ static bool sphsc_check_repeats(PhPos *pp,
429
+ PhPos **positions,
430
+ const int p_cnt)
431
+ {
432
+ int j;
433
+ for (j = 0; j < p_cnt; j++) {
434
+ PhPos *ppj = positions[j];
435
+ /* If offsets are equal, either we are at the current PhPos +pp+ or
436
+ * +pp+ and +ppj+ are supposed to match in the same position in which
437
+ * case we don't need to check. */
438
+ if (ppj->offset == pp->offset) {
439
+ continue;
440
+ }
441
+ /* the two phrase positions are matching on the same term
442
+ * which we want to avoid */
443
+ if ((ppj->position + ppj->offset) == (pp->position + pp->offset)) {
444
+ if (!pp_next_position(pp)) {
445
+ /* We have no matches for this document */
446
+ return false;
447
+ }
448
+ /* we changed the position so we need to start check again */
449
+ j = -1;
450
+ }
451
+ }
452
+ return true;
453
+ }
454
+
455
+ static float sphsc_phrase_freq(Scorer *self)
456
+ {
457
+ PhraseScorer *phsc = PhSc(self);
458
+ PhPos *pp;
459
+ PriorityQueue *pq = pq_new(phsc->pp_cnt, (lt_ft)&pp_less_than, NULL);
460
+ const int pp_cnt = phsc->pp_cnt;
461
+
462
+ int last_pos = 0, pos, next_pos, start, match_length, i;
463
+ bool done = false;
464
+ bool check_repeats = phsc->check_repeats;
465
+ float freq = 0.0;
466
+
467
+ for (i = 0; i < pp_cnt; i++) {
468
+ bool res;
469
+ pp = phsc->phrase_pos[i];
470
+ /* we should always have at least one position or this functions
471
+ * shouldn't have been called. */
472
+ res = pp_first_position(pp);
473
+ assert(res);(void)res;
474
+ if (check_repeats && i > 0) {
475
+ if (!sphsc_check_repeats(pp, phsc->phrase_pos, i - 1)) {
476
+ goto return_freq;
477
+ }
478
+ }
479
+ if (pp->position > last_pos) {
480
+ last_pos = pp->position;
481
+ }
482
+ pq_push(pq, pp);
483
+ }
484
+
485
+ do {
486
+ pp = (PhPos *)pq_pop(pq);
487
+ pos = start = pp->position;
488
+ next_pos = PP(pq_top(pq))->position;
489
+ while (pos <= next_pos) {
490
+ start = pos; /* advance pp to min window */
491
+ if (!pp_next_position(pp)
492
+ || (check_repeats
493
+ && !sphsc_check_repeats(pp, phsc->phrase_pos, pp_cnt))) {
494
+ done = true;
495
+ break;
496
+ }
497
+ pos = pp->position;
498
+ }
499
+
500
+ match_length = last_pos - start;
501
+ if (match_length <= phsc->slop) {
502
+ /* score match */
503
+ freq += sim_sloppy_freq(self->similarity, match_length);
504
+ }
505
+
506
+ if (pp->position > last_pos) {
507
+ last_pos = pp->position;
508
+ }
509
+ pq_push(pq, pp); /* restore pq */
510
+ } while (!done);
511
+
512
+ return_freq:
513
+
514
+ pq_destroy(pq);
515
+ return freq;
516
+ }
517
+
518
+ static Scorer *sloppy_phrase_scorer_new(Weight *weight,
519
+ TermDocEnum **term_pos_enum,
520
+ PhrasePosition *positions,
521
+ int pp_cnt, Similarity *similarity,
522
+ int slop, uchar *norms)
523
+ {
524
+ Scorer *self = phsc_new(weight,
525
+ term_pos_enum,
526
+ positions,
527
+ pp_cnt,
528
+ similarity,
529
+ norms,
530
+ slop);
531
+
532
+ PhSc(self)->phrase_freq = &sphsc_phrase_freq;
533
+ return self;
534
+ }
535
+
536
+ /***************************************************************************
537
+ *
538
+ * PhraseWeight
539
+ *
540
+ ***************************************************************************/
541
+
542
+ static char *phw_to_s(Weight *self)
543
+ {
544
+ return strfmt("PhraseWeight(%f)", self->value);
545
+ }
546
+
547
+ static Scorer *phw_scorer(Weight *self, IndexReader *ir)
548
+ {
549
+ int i;
550
+ Scorer *phsc = NULL;
551
+ PhraseQuery *phq = PhQ(self->query);
552
+ TermDocEnum **tps, *tpe;
553
+ PhrasePosition *positions = phq->positions;
554
+ const int pos_cnt = phq->pos_cnt;
555
+ const int field_num = fis_get_field_num(ir->fis, phq->field);
556
+
557
+ if (pos_cnt == 0 || field_num < 0) {
558
+ return NULL;
559
+ }
560
+
561
+ tps = ALLOC_N(TermDocEnum *, pos_cnt);
562
+
563
+ for (i = 0; i < pos_cnt; i++) {
564
+ char **terms = positions[i].terms;
565
+ const int t_cnt = ary_size(terms);
566
+ if (t_cnt == 1) {
567
+ tpe = tps[i] = ir->term_positions(ir);
568
+ tpe->seek(tpe, field_num, terms[0]);
569
+ }
570
+ else {
571
+ tps[i] = mtdpe_new(ir, field_num, terms, t_cnt);
572
+ }
573
+ /* neither mtdpe_new nor ir->term_positions should return NULL */
574
+ assert(NULL != tps[i]);
575
+ }
576
+
577
+ if (phq->slop == 0) { /* optimize exact (common) case */
578
+ phsc = exact_phrase_scorer_new(self, tps, positions, pos_cnt,
579
+ self->similarity,
580
+ ir_get_norms_i(ir, field_num));
581
+ }
582
+ else {
583
+ phsc = sloppy_phrase_scorer_new(self, tps, positions, pos_cnt,
584
+ self->similarity, phq->slop,
585
+ ir_get_norms_i(ir, field_num));
586
+ }
587
+ free(tps);
588
+ return phsc;
589
+ }
590
+
591
+ static Explanation *phw_explain(Weight *self, IndexReader *ir, int doc_num)
592
+ {
593
+ Explanation *expl;
594
+ Explanation *idf_expl1;
595
+ Explanation *idf_expl2;
596
+ Explanation *query_expl;
597
+ Explanation *qnorm_expl;
598
+ Explanation *field_expl;
599
+ Explanation *tf_expl;
600
+ Scorer *scorer;
601
+ uchar *field_norms;
602
+ float field_norm;
603
+ Explanation *field_norm_expl;
604
+ char *query_str;
605
+ PhraseQuery *phq = PhQ(self->query);
606
+ const int pos_cnt = phq->pos_cnt;
607
+ PhrasePosition *positions = phq->positions;
608
+ int i, j;
609
+ char *doc_freqs = NULL;
610
+ size_t len = 0, pos = 0;
611
+ const int field_num = fis_get_field_num(ir->fis, phq->field);
612
+ const char *field = S(phq->field);
613
+
614
+ if (field_num < 0) {
615
+ return expl_new(0.0, "field \"%s\" does not exist in the index", field);
616
+ }
617
+
618
+ query_str = self->query->to_s(self->query, NULL);
619
+
620
+ expl = expl_new(0.0, "weight(%s in %d), product of:", query_str, doc_num);
621
+
622
+ /* ensure the phrase positions are in order for explanation */
623
+ qsort(positions, pos_cnt, sizeof(PhrasePosition), &phrase_pos_cmp);
624
+
625
+ for (i = 0; i < phq->pos_cnt; i++) {
626
+ char **terms = phq->positions[i].terms;
627
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
628
+ len += strlen(terms[j]) + 30;
629
+ }
630
+ }
631
+ doc_freqs = ALLOC_N(char, len);
632
+ for (i = 0; i < phq->pos_cnt; i++) {
633
+ char **terms = phq->positions[i].terms;
634
+ const int t_cnt = ary_size(terms);
635
+ for (j = 0; j < t_cnt; j++) {
636
+ char *term = terms[j];
637
+ pos += sprintf(doc_freqs + pos, "%s=%d, ",
638
+ term, ir->doc_freq(ir, field_num, term));
639
+ }
640
+ }
641
+ pos -= 2; /* remove ", " from the end */
642
+ doc_freqs[pos] = 0;
643
+
644
+ idf_expl1 = expl_new(self->idf, "idf(%s:<%s>)", field, doc_freqs);
645
+ idf_expl2 = expl_new(self->idf, "idf(%s:<%s>)", field, doc_freqs);
646
+ free(doc_freqs);
647
+
648
+ /* explain query weight */
649
+ query_expl = expl_new(0.0, "query_weight(%s), product of:", query_str);
650
+
651
+ if (self->query->boost != 1.0) {
652
+ expl_add_detail(query_expl, expl_new(self->query->boost, "boost"));
653
+ }
654
+ expl_add_detail(query_expl, idf_expl1);
655
+
656
+ qnorm_expl = expl_new(self->qnorm, "query_norm");
657
+ expl_add_detail(query_expl, qnorm_expl);
658
+
659
+ query_expl->value = self->query->boost * self->idf * self->qnorm;
660
+
661
+ expl_add_detail(expl, query_expl);
662
+
663
+ /* explain field weight */
664
+ field_expl = expl_new(0.0, "field_weight(%s in %d), product of:",
665
+ query_str, doc_num);
666
+ free(query_str);
667
+
668
+ scorer = self->scorer(self, ir);
669
+ tf_expl = scorer->explain(scorer, doc_num);
670
+ scorer->destroy(scorer);
671
+ expl_add_detail(field_expl, tf_expl);
672
+ expl_add_detail(field_expl, idf_expl2);
673
+
674
+ field_norms = ir->get_norms(ir, field_num);
675
+ field_norm = (field_norms != NULL)
676
+ ? sim_decode_norm(self->similarity, field_norms[doc_num])
677
+ : (float)0.0;
678
+ field_norm_expl = expl_new(field_norm, "field_norm(field=%s, doc=%d)",
679
+ field, doc_num);
680
+
681
+ expl_add_detail(field_expl, field_norm_expl);
682
+
683
+ field_expl->value = tf_expl->value * self->idf * field_norm;
684
+
685
+ /* combine them */
686
+ if (query_expl->value == 1.0) {
687
+ expl_destroy(expl);
688
+ return field_expl;
689
+ }
690
+ else {
691
+ expl->value = (query_expl->value * field_expl->value);
692
+ expl_add_detail(expl, field_expl);
693
+ return expl;
694
+ }
695
+ }
696
+
697
+ static Weight *phw_new(Query *query, Searcher *searcher)
698
+ {
699
+ Weight *self = w_new(Weight, query);
700
+
701
+ self->scorer = &phw_scorer;
702
+ self->explain = &phw_explain;
703
+ self->to_s = &phw_to_s;
704
+
705
+ self->similarity = query->get_similarity(query, searcher);
706
+ self->value = query->boost;
707
+ self->idf = sim_idf_phrase(self->similarity, PhQ(query)->field,
708
+ PhQ(query)->positions,
709
+ PhQ(query)->pos_cnt, searcher);
710
+ return self;
711
+ }
712
+
713
+ /***************************************************************************
714
+ *
715
+ * PhraseQuery
716
+ *
717
+ ***************************************************************************/
718
+
719
/* ** TVPosEnum ** */

/*
 * Enumerator over a copied list of term-vector positions. It is allocated
 * with extra room after the struct so that +positions+ can hold +size+
 * entries (see the emalloc calls that create it).
 */
typedef struct TVPosEnum
{
    int index;          /* index of the current entry; -1 before the first */
    int size;           /* number of entries in +positions+ */
    int offset;         /* subtracted from a raw position to give +pos+ */
    int pos;            /* current offset-adjusted position; -1 when none */
    int positions[1];   /* raw positions; really +size+ entries long */
} TVPosEnum;

/*
 * Step to the next position. Returns true and updates +pos+ while entries
 * remain; otherwise sets +pos+ to -1 and returns false.
 */
static bool tvpe_next(TVPosEnum *self)
{
    if (++(self->index) < self->size) {
        self->pos = self->positions[self->index] - self->offset;
        return true;
    }
    self->pos = -1;
    return false;
}

/*
 * Advance to the first entry after the current one whose offset-adjusted
 * position is >= +position+.
 *
 * Returns true and updates +index+/+pos+ when such an entry exists. Otherwise
 * +pos+ is set to -1, +index+ is pinned to +size+ and false is returned. The
 * pinning matters: without it a call made after the enumerator was already
 * exhausted started beyond +size+ and wrongly reported success with pos -1.
 */
static int tvpe_skip_to(TVPosEnum *self, int position)
{
    const int search_pos = position + self->offset;
    int i;

    for (i = self->index + 1; i < self->size; i++) {
        if (self->positions[i] >= search_pos) {
            self->pos = self->positions[i] - self->offset;
            self->index = i;
            return true;
        }
    }

    self->index = self->size;
    self->pos = -1;
    return false;
}

/* Priority-queue ordering: the enumerator with the smaller +pos+ comes first. */
static bool tvpe_lt(TVPosEnum *tvpe1, TVPosEnum *tvpe2)
{
    return tvpe1->pos < tvpe2->pos;
}
763
+
764
+ static TVPosEnum *tvpe_new(int *positions, int size, int offset)
765
+ {
766
+ TVPosEnum *self = (TVPosEnum*)emalloc(sizeof(TVPosEnum) + size*sizeof(int));
767
+ memcpy(self->positions, positions, size * sizeof(int));
768
+ self->size = size;
769
+ self->offset = offset;
770
+ self->index = -1;
771
+ self->pos = -1;
772
+ return self;
773
+ }
774
+
775
+ static TVPosEnum *tvpe_new_merge(char **terms, int t_cnt, TermVector *tv,
776
+ int offset)
777
+ {
778
+ int i, total_positions = 0;
779
+ PriorityQueue *tvpe_pq = pq_new(t_cnt, (lt_ft)tvpe_lt, &free);
780
+ TVPosEnum *self = NULL;
781
+
782
+ for (i = 0; i < t_cnt; i++) {
783
+ TVTerm *tv_term = tv_get_tv_term(tv, terms[i]);
784
+ if (tv_term) {
785
+ TVPosEnum *tvpe = tvpe_new(tv_term->positions, tv_term->freq, 0);
786
+ /* got tv_term so tvpe_next should always return true once here */
787
+ bool res = tvpe_next(tvpe);
788
+ assert(res);(void)res;
789
+ pq_push(tvpe_pq, tvpe);
790
+ total_positions += tv_term->freq;
791
+ }
792
+ }
793
+ if (tvpe_pq->size == 0) {
794
+ pq_destroy(tvpe_pq);
795
+ }
796
+ else {
797
+ int index = 0;
798
+ self = (TVPosEnum *)emalloc(sizeof(TVPosEnum)
799
+ + total_positions * sizeof(int));
800
+ self->size = total_positions;
801
+ self->offset = offset;
802
+ self->index = -1;
803
+ self->pos = -1;
804
+ while (tvpe_pq->size > 0) {
805
+ TVPosEnum *top = (TVPosEnum *)pq_top(tvpe_pq);
806
+ self->positions[index++] = top->pos;
807
+ if (! tvpe_next(top)) {
808
+ pq_pop(tvpe_pq);
809
+ free(top);
810
+ }
811
+ else {
812
+ pq_down(tvpe_pq);
813
+ }
814
+ }
815
+ pq_destroy(tvpe_pq);
816
+ }
817
+ return self;
818
+ }
819
+
820
+ static TVPosEnum *get_tvpe(TermVector *tv, char **terms, int t_cnt, int offset)
821
+ {
822
+ TVPosEnum *tvpe = NULL;
823
+ if (t_cnt == 1) {
824
+ TVTerm *tv_term = tv_get_tv_term(tv, terms[0]);
825
+ if (tv_term) {
826
+ tvpe = tvpe_new(tv_term->positions, tv_term->freq, offset);
827
+ }
828
+ }
829
+ else {
830
+ tvpe = tvpe_new_merge(terms, t_cnt, tv, offset);
831
+ }
832
+ return tvpe;
833
+ }
834
+
835
+ static MatchVector *phq_get_matchv_i(Query *self, MatchVector *mv,
836
+ TermVector *tv)
837
+ {
838
+ if (tv->field == PhQ(self)->field) {
839
+ const int pos_cnt = PhQ(self)->pos_cnt;
840
+ int i;
841
+ int slop = PhQ(self)->slop;
842
+ bool done = false;
843
+
844
+ if (slop > 0) {
845
+ PriorityQueue *tvpe_pq = pq_new(pos_cnt, (lt_ft)tvpe_lt, &free);
846
+ int last_pos = 0;
847
+ for (i = 0; i < pos_cnt; i++) {
848
+ PhrasePosition *pp = &(PhQ(self)->positions[i]);
849
+ const int t_cnt = ary_size(pp->terms);
850
+ TVPosEnum *tvpe = get_tvpe(tv, pp->terms, t_cnt, pp->pos);
851
+ if (tvpe && tvpe_next(tvpe)) {
852
+ if (tvpe->pos > last_pos) {
853
+ last_pos = tvpe->pos;
854
+ }
855
+ pq_push(tvpe_pq, tvpe);
856
+ }
857
+ else {
858
+ done = true;
859
+ free(tvpe);
860
+ break;
861
+ }
862
+ }
863
+ while (! done) {
864
+ TVPosEnum *tvpe = (TVPosEnum *)pq_pop(tvpe_pq);
865
+ int pos;
866
+ int start = pos = tvpe->pos;
867
+ int next_pos = ((TVPosEnum *)pq_top(tvpe_pq))->pos;
868
+ while (pos <= next_pos) {
869
+ start = pos;
870
+ if (!tvpe_next(tvpe)) {
871
+ done = true;
872
+ break;
873
+ }
874
+ pos = tvpe->pos;
875
+ }
876
+
877
+ if (last_pos - start <= slop) {
878
+ int min, max = min = start + tvpe->offset;
879
+ for (i = tvpe_pq->size; i > 0; i--) {
880
+ TVPosEnum *t = (TVPosEnum *)tvpe_pq->heap[i];
881
+ int p = t->pos + t->offset;
882
+ max = p > max ? p : max;
883
+ min = p < min ? p : min;
884
+ }
885
+ matchv_add(mv, min, max);
886
+ }
887
+ if (tvpe->pos > last_pos) {
888
+ last_pos = tvpe->pos;
889
+ }
890
+ pq_push(tvpe_pq, tvpe);
891
+ }
892
+
893
+ pq_destroy(tvpe_pq);
894
+ }
895
+ else { /* exact match */
896
+ TVPosEnum **tvpe_a = ALLOC_AND_ZERO_N(TVPosEnum *, pos_cnt);
897
+ TVPosEnum *first, *last;
898
+ int first_index = 0;
899
+ done = false;
900
+ qsort(PhQ(self)->positions, pos_cnt, sizeof(PhrasePosition),
901
+ &phrase_pos_cmp);
902
+ for (i = 0; i < pos_cnt; i++) {
903
+ PhrasePosition *pp = &(PhQ(self)->positions[i]);
904
+ const int t_cnt = ary_size(pp->terms);
905
+ TVPosEnum *tvpe = get_tvpe(tv, pp->terms, t_cnt, pp->pos);
906
+ if (tvpe && ((i == 0 && tvpe_next(tvpe))
907
+ || tvpe_skip_to(tvpe, tvpe_a[i-1]->pos))) {
908
+ tvpe_a[i] = tvpe;
909
+ }
910
+ else {
911
+ done = true;
912
+ free(tvpe);
913
+ break;
914
+ }
915
+ }
916
+
917
+ first = tvpe_a[0];
918
+ last = tvpe_a[pos_cnt - 1];
919
+
920
+ while (!done) {
921
+ while (first->pos < last->pos) {
922
+ if (tvpe_skip_to(first, last->pos)) {
923
+ last = first;
924
+ first_index = NEXT_NUM(first_index, pos_cnt);
925
+ first = tvpe_a[first_index];
926
+ }
927
+ else {
928
+ done = true;
929
+ break;
930
+ }
931
+ }
932
+ if (!done) {
933
+ matchv_add(mv, tvpe_a[0]->pos + tvpe_a[0]->offset,
934
+ tvpe_a[pos_cnt-1]->pos + tvpe_a[pos_cnt-1]->offset);
935
+ }
936
+ if (!tvpe_next(last)) {
937
+ done = true;
938
+ }
939
+ }
940
+ for (i = 0; i < pos_cnt; i++) {
941
+ free(tvpe_a[i]);
942
+ }
943
+ free(tvpe_a);
944
+ }
945
+ }
946
+ return mv;
947
+ }
948
+
949
+
950
+ /* ** PhraseQuery besides highlighting stuff ** */
951
+
952
/* number of PhrasePosition slots allocated for a new PhraseQuery */
#define PhQ_INIT_CAPA 4
953
+
954
+ static void phq_extract_terms(Query *self, HashSet *term_set)
955
+ {
956
+ PhraseQuery *phq = PhQ(self);
957
+ int i, j;
958
+ for (i = 0; i < phq->pos_cnt; i++) {
959
+ char **terms = phq->positions[i].terms;
960
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
961
+ hs_add(term_set, term_new(phq->field, terms[j]));
962
+ }
963
+ }
964
+ }
965
+
966
+ static char *phq_to_s(Query *self, Symbol default_field)
967
+ {
968
+ PhraseQuery *phq = PhQ(self);
969
+ const int pos_cnt = phq->pos_cnt;
970
+ PhrasePosition *positions = phq->positions;
971
+ const char *field = S(phq->field);
972
+ int flen = strlen(field);
973
+
974
+ int i, j, buf_index = 0, pos, last_pos;
975
+ size_t len = 0;
976
+ char *buffer;
977
+
978
+ if (phq->pos_cnt == 0) {
979
+ if (default_field != phq->field) {
980
+ return strfmt("%s:\"\"", field);
981
+ }
982
+ else {
983
+ return estrdup("\"\"");
984
+ }
985
+ }
986
+
987
+ /* sort the phrase positions by position */
988
+ qsort(positions, pos_cnt, sizeof(PhrasePosition), &phrase_pos_cmp);
989
+
990
+ len = flen + 1;
991
+
992
+ for (i = 0; i < pos_cnt; i++) {
993
+ char **terms = phq->positions[i].terms;
994
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
995
+ len += strlen(terms[j]) + 5;
996
+ }
997
+ }
998
+
999
+ /* add space for extra <> characters and boost and slop */
1000
+ len += 100 + 3
1001
+ * (phq->positions[phq->pos_cnt - 1].pos - phq->positions[0].pos);
1002
+
1003
+ buffer = ALLOC_N(char, len);
1004
+
1005
+ if (default_field != phq->field) {
1006
+ memcpy(buffer, field, flen);
1007
+ buffer[flen] = ':';
1008
+ buf_index += flen + 1;
1009
+ }
1010
+
1011
+ buffer[buf_index++] = '"';
1012
+
1013
+ last_pos = positions[0].pos - 1;
1014
+ for (i = 0; i < pos_cnt; i++) {
1015
+ char **terms = positions[i].terms;
1016
+ const int t_cnt = ary_size(terms);
1017
+
1018
+ pos = positions[i].pos;
1019
+ if (pos == last_pos) {
1020
+ buffer[buf_index - 1] = '&';
1021
+ }
1022
+ else {
1023
+ for (j = last_pos; j < pos - 1; j++) {
1024
+ memcpy(buffer + buf_index, "<> ", 3);
1025
+ buf_index += 3;
1026
+ }
1027
+ }
1028
+
1029
+ last_pos = pos;
1030
+ for (j = 0; j < t_cnt; j++) {
1031
+ char *term = terms[j];
1032
+ len = strlen(term);
1033
+ memcpy(buffer + buf_index, term, len);
1034
+ buf_index += len;
1035
+ buffer[buf_index++] = '|';
1036
+ }
1037
+ buffer[buf_index-1] = ' '; /* change last '|' to ' ' */
1038
+ }
1039
+
1040
+ if (buffer[buf_index-1] == ' ') {
1041
+ buf_index--;
1042
+ }
1043
+
1044
+ buffer[buf_index++] = '"';
1045
+ buffer[buf_index] = 0;
1046
+
1047
+ if (phq->slop != 0) {
1048
+ buf_index += sprintf(buffer + buf_index, "~%d", phq->slop);
1049
+ }
1050
+
1051
+ if (self->boost != 1.0) {
1052
+ buffer[buf_index++] = '^';
1053
+ dbl_to_s(buffer + buf_index, self->boost);
1054
+ }
1055
+
1056
+ return buffer;
1057
+ }
1058
+
1059
+ static void phq_destroy(Query *self)
1060
+ {
1061
+ PhraseQuery *phq = PhQ(self);
1062
+ int i;
1063
+ for (i = 0; i < phq->pos_cnt; i++) {
1064
+ ary_destroy(phq->positions[i].terms, &free);
1065
+ }
1066
+ free(phq->positions);
1067
+ q_destroy_i(self);
1068
+ }
1069
+
1070
+ static Query *phq_rewrite(Query *self, IndexReader *ir)
1071
+ {
1072
+ PhraseQuery *phq = PhQ(self);
1073
+ (void)ir;
1074
+ if (phq->pos_cnt == 1) {
1075
+ /* optimize one-position case */
1076
+ char **terms = phq->positions[0].terms;
1077
+ const int t_cnt = ary_size(terms);
1078
+ if (t_cnt == 1) {
1079
+ Query *tq = tq_new(phq->field, terms[0]);
1080
+ tq->boost = self->boost;
1081
+ return tq;
1082
+ }
1083
+ else {
1084
+ Query *q = multi_tq_new(phq->field);
1085
+ int i;
1086
+ for (i = 0; i < t_cnt; i++) {
1087
+ multi_tq_add_term(q, terms[i]);
1088
+ }
1089
+ q->boost = self->boost;
1090
+ return q;
1091
+ }
1092
+ } else {
1093
+ self->ref_cnt++;
1094
+ return self;
1095
+ }
1096
+ }
1097
+
1098
+ static unsigned long phq_hash(Query *self)
1099
+ {
1100
+ int i, j;
1101
+ PhraseQuery *phq = PhQ(self);
1102
+ unsigned long hash = sym_hash(phq->field);
1103
+ for (i = 0; i < phq->pos_cnt; i++) {
1104
+ char **terms = phq->positions[i].terms;
1105
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
1106
+ hash = (hash << 1) ^ (str_hash(terms[j])
1107
+ ^ phq->positions[i].pos);
1108
+ }
1109
+ }
1110
+ return (hash ^ phq->slop);
1111
+ }
1112
+
1113
+ static int phq_eq(Query *self, Query *o)
1114
+ {
1115
+ int i, j;
1116
+ PhraseQuery *phq1 = PhQ(self);
1117
+ PhraseQuery *phq2 = PhQ(o);
1118
+ if (phq1->slop != phq2->slop
1119
+ || phq1->field != phq2->field
1120
+ || phq1->pos_cnt != phq2->pos_cnt) {
1121
+ return false;
1122
+ }
1123
+ for (i = 0; i < phq1->pos_cnt; i++) {
1124
+ char **terms1 = phq1->positions[i].terms;
1125
+ char **terms2 = phq2->positions[i].terms;
1126
+ const int t_cnt = ary_size(terms1);
1127
+ if (t_cnt != ary_size(terms2)
1128
+ || phq1->positions[i].pos != phq2->positions[i].pos) {
1129
+ return false;
1130
+ }
1131
+ for (j = 0; j < t_cnt; j++) {
1132
+ if (strcmp(terms1[j], terms2[j]) != 0) {
1133
+ return false;
1134
+ }
1135
+ }
1136
+ }
1137
+ return true;
1138
+ }
1139
+
1140
+ Query *phq_new(Symbol field)
1141
+ {
1142
+ Query *self = q_new(PhraseQuery);
1143
+
1144
+ PhQ(self)->field = field;
1145
+ PhQ(self)->pos_cnt = 0;
1146
+ PhQ(self)->pos_capa = PhQ_INIT_CAPA;
1147
+ PhQ(self)->positions = ALLOC_N(PhrasePosition, PhQ_INIT_CAPA);
1148
+
1149
+ self->type = PHRASE_QUERY;
1150
+ self->rewrite = &phq_rewrite;
1151
+ self->extract_terms = &phq_extract_terms;
1152
+ self->to_s = &phq_to_s;
1153
+ self->hash = &phq_hash;
1154
+ self->eq = &phq_eq;
1155
+ self->destroy_i = &phq_destroy;
1156
+ self->create_weight_i = &phw_new;
1157
+ self->get_matchv_i = &phq_get_matchv_i;
1158
+ return self;
1159
+ }
1160
+
1161
+ void phq_add_term_abs(Query *self, const char *term, int position)
1162
+ {
1163
+ PhraseQuery *phq = PhQ(self);
1164
+ int index = phq->pos_cnt;
1165
+ PhrasePosition *pp;
1166
+ if (index >= phq->pos_capa) {
1167
+ phq->pos_capa <<= 1;
1168
+ REALLOC_N(phq->positions, PhrasePosition, phq->pos_capa);
1169
+ }
1170
+ pp = &(phq->positions[index]);
1171
+ pp->terms = ary_new_type_capa(char *, 2);
1172
+ ary_push(pp->terms, estrdup(term));
1173
+ pp->pos = position;
1174
+ phq->pos_cnt++;
1175
+ }
1176
+
1177
+ void phq_add_term(Query *self, const char *term, int pos_inc)
1178
+ {
1179
+ PhraseQuery *phq = PhQ(self);
1180
+ int position;
1181
+ if (phq->pos_cnt == 0) {
1182
+ position = 0;
1183
+ }
1184
+ else {
1185
+ position = phq->positions[phq->pos_cnt - 1].pos + pos_inc;
1186
+ }
1187
+ phq_add_term_abs(self, term, position);
1188
+ }
1189
+
1190
+ void phq_append_multi_term(Query *self, const char *term)
1191
+ {
1192
+ PhraseQuery *phq = PhQ(self);
1193
+ int index = phq->pos_cnt - 1;
1194
+
1195
+ if (index < 0) {
1196
+ phq_add_term(self, term, 0);
1197
+ }
1198
+ else {
1199
+ ary_push(phq->positions[index].terms, estrdup(term));
1200
+ }
1201
+ }
1202
+
1203
+ void frt_phq_set_slop(FrtQuery *self, int slop)
1204
+ {
1205
+ PhQ(self)->slop = slop;
1206
+ }