jk-ferret 0.11.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +90 -0
  4. data/RELEASE_CHANGES +137 -0
  5. data/RELEASE_NOTES +60 -0
  6. data/Rakefile +443 -0
  7. data/TODO +109 -0
  8. data/TUTORIAL +231 -0
  9. data/bin/ferret-browser +79 -0
  10. data/ext/BZLIB_blocksort.c +1094 -0
  11. data/ext/BZLIB_bzlib.c +1578 -0
  12. data/ext/BZLIB_compress.c +672 -0
  13. data/ext/BZLIB_crctable.c +104 -0
  14. data/ext/BZLIB_decompress.c +626 -0
  15. data/ext/BZLIB_huffman.c +205 -0
  16. data/ext/BZLIB_randtable.c +84 -0
  17. data/ext/STEMMER_api.c +66 -0
  18. data/ext/STEMMER_libstemmer.c +93 -0
  19. data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
  20. data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
  21. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  22. data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
  25. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  26. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  27. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
  29. data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
  30. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  31. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  32. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  33. data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
  34. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  35. data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
  36. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  37. data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
  38. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  39. data/ext/STEMMER_stem_UTF_8_german.c +509 -0
  40. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  41. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  42. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  43. data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
  44. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  45. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  46. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  47. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  48. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  49. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  50. data/ext/STEMMER_utilities.c +478 -0
  51. data/ext/analysis.c +1710 -0
  52. data/ext/analysis.h +266 -0
  53. data/ext/api.h +26 -0
  54. data/ext/array.c +125 -0
  55. data/ext/array.h +62 -0
  56. data/ext/bitvector.c +96 -0
  57. data/ext/bitvector.h +594 -0
  58. data/ext/bzlib.h +282 -0
  59. data/ext/bzlib_private.h +503 -0
  60. data/ext/compound_io.c +384 -0
  61. data/ext/config.h +52 -0
  62. data/ext/document.c +159 -0
  63. data/ext/document.h +63 -0
  64. data/ext/except.c +102 -0
  65. data/ext/except.h +176 -0
  66. data/ext/extconf.rb +15 -0
  67. data/ext/ferret.c +416 -0
  68. data/ext/ferret.h +94 -0
  69. data/ext/field_index.c +262 -0
  70. data/ext/field_index.h +52 -0
  71. data/ext/filter.c +157 -0
  72. data/ext/fs_store.c +493 -0
  73. data/ext/global.c +458 -0
  74. data/ext/global.h +302 -0
  75. data/ext/hash.c +524 -0
  76. data/ext/hash.h +515 -0
  77. data/ext/hashset.c +192 -0
  78. data/ext/hashset.h +215 -0
  79. data/ext/header.h +58 -0
  80. data/ext/helper.c +63 -0
  81. data/ext/helper.h +21 -0
  82. data/ext/index.c +6804 -0
  83. data/ext/index.h +935 -0
  84. data/ext/internal.h +1019 -0
  85. data/ext/lang.c +10 -0
  86. data/ext/lang.h +68 -0
  87. data/ext/libstemmer.h +79 -0
  88. data/ext/mempool.c +88 -0
  89. data/ext/mempool.h +43 -0
  90. data/ext/modules.h +190 -0
  91. data/ext/multimapper.c +351 -0
  92. data/ext/multimapper.h +60 -0
  93. data/ext/posh.c +1006 -0
  94. data/ext/posh.h +973 -0
  95. data/ext/priorityqueue.c +149 -0
  96. data/ext/priorityqueue.h +155 -0
  97. data/ext/q_boolean.c +1621 -0
  98. data/ext/q_const_score.c +162 -0
  99. data/ext/q_filtered_query.c +212 -0
  100. data/ext/q_fuzzy.c +280 -0
  101. data/ext/q_match_all.c +149 -0
  102. data/ext/q_multi_term.c +673 -0
  103. data/ext/q_parser.c +3103 -0
  104. data/ext/q_phrase.c +1206 -0
  105. data/ext/q_prefix.c +98 -0
  106. data/ext/q_range.c +682 -0
  107. data/ext/q_span.c +2390 -0
  108. data/ext/q_term.c +337 -0
  109. data/ext/q_wildcard.c +167 -0
  110. data/ext/r_analysis.c +2626 -0
  111. data/ext/r_index.c +3468 -0
  112. data/ext/r_qparser.c +635 -0
  113. data/ext/r_search.c +4490 -0
  114. data/ext/r_store.c +513 -0
  115. data/ext/r_utils.c +1131 -0
  116. data/ext/ram_store.c +476 -0
  117. data/ext/scanner.c +895 -0
  118. data/ext/scanner.h +36 -0
  119. data/ext/scanner_mb.c +6701 -0
  120. data/ext/scanner_utf8.c +4415 -0
  121. data/ext/search.c +1864 -0
  122. data/ext/search.h +953 -0
  123. data/ext/similarity.c +151 -0
  124. data/ext/similarity.h +89 -0
  125. data/ext/sort.c +786 -0
  126. data/ext/stem_ISO_8859_1_danish.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  128. data/ext/stem_ISO_8859_1_english.h +16 -0
  129. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  130. data/ext/stem_ISO_8859_1_french.h +16 -0
  131. data/ext/stem_ISO_8859_1_german.h +16 -0
  132. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  133. data/ext/stem_ISO_8859_1_italian.h +16 -0
  134. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  135. data/ext/stem_ISO_8859_1_porter.h +16 -0
  136. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  137. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  138. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  139. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  140. data/ext/stem_KOI8_R_russian.h +16 -0
  141. data/ext/stem_UTF_8_danish.h +16 -0
  142. data/ext/stem_UTF_8_dutch.h +16 -0
  143. data/ext/stem_UTF_8_english.h +16 -0
  144. data/ext/stem_UTF_8_finnish.h +16 -0
  145. data/ext/stem_UTF_8_french.h +16 -0
  146. data/ext/stem_UTF_8_german.h +16 -0
  147. data/ext/stem_UTF_8_hungarian.h +16 -0
  148. data/ext/stem_UTF_8_italian.h +16 -0
  149. data/ext/stem_UTF_8_norwegian.h +16 -0
  150. data/ext/stem_UTF_8_porter.h +16 -0
  151. data/ext/stem_UTF_8_portuguese.h +16 -0
  152. data/ext/stem_UTF_8_romanian.h +16 -0
  153. data/ext/stem_UTF_8_russian.h +16 -0
  154. data/ext/stem_UTF_8_spanish.h +16 -0
  155. data/ext/stem_UTF_8_swedish.h +16 -0
  156. data/ext/stem_UTF_8_turkish.h +16 -0
  157. data/ext/stopwords.c +410 -0
  158. data/ext/store.c +698 -0
  159. data/ext/store.h +799 -0
  160. data/ext/symbol.c +10 -0
  161. data/ext/symbol.h +23 -0
  162. data/ext/term_vectors.c +73 -0
  163. data/ext/threading.h +31 -0
  164. data/ext/win32.h +62 -0
  165. data/lib/ferret.rb +30 -0
  166. data/lib/ferret/browser.rb +246 -0
  167. data/lib/ferret/browser/s/global.js +192 -0
  168. data/lib/ferret/browser/s/style.css +148 -0
  169. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  170. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  171. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  172. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  173. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  174. data/lib/ferret/browser/views/layout.rhtml +22 -0
  175. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  176. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  177. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  178. data/lib/ferret/browser/webrick.rb +14 -0
  179. data/lib/ferret/document.rb +130 -0
  180. data/lib/ferret/field_infos.rb +44 -0
  181. data/lib/ferret/field_symbol.rb +87 -0
  182. data/lib/ferret/index.rb +973 -0
  183. data/lib/ferret/number_tools.rb +157 -0
  184. data/lib/ferret/version.rb +3 -0
  185. data/setup.rb +1555 -0
  186. data/test/long_running/largefile/tc_largefile.rb +46 -0
  187. data/test/test_all.rb +5 -0
  188. data/test/test_helper.rb +29 -0
  189. data/test/test_installed.rb +1 -0
  190. data/test/threading/number_to_spoken.rb +132 -0
  191. data/test/threading/thread_safety_index_test.rb +88 -0
  192. data/test/threading/thread_safety_read_write_test.rb +73 -0
  193. data/test/threading/thread_safety_test.rb +133 -0
  194. data/test/unit/analysis/tc_analyzer.rb +550 -0
  195. data/test/unit/analysis/tc_token_stream.rb +653 -0
  196. data/test/unit/index/tc_index.rb +867 -0
  197. data/test/unit/index/tc_index_reader.rb +699 -0
  198. data/test/unit/index/tc_index_writer.rb +447 -0
  199. data/test/unit/index/th_doc.rb +332 -0
  200. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  201. data/test/unit/search/tc_filter.rb +156 -0
  202. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  203. data/test/unit/search/tc_index_searcher.rb +67 -0
  204. data/test/unit/search/tc_multi_searcher.rb +128 -0
  205. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  206. data/test/unit/search/tc_search_and_sort.rb +179 -0
  207. data/test/unit/search/tc_sort.rb +49 -0
  208. data/test/unit/search/tc_sort_field.rb +27 -0
  209. data/test/unit/search/tc_spans.rb +190 -0
  210. data/test/unit/search/tm_searcher.rb +436 -0
  211. data/test/unit/store/tc_fs_store.rb +115 -0
  212. data/test/unit/store/tc_ram_store.rb +35 -0
  213. data/test/unit/store/tm_store.rb +34 -0
  214. data/test/unit/store/tm_store_lock.rb +68 -0
  215. data/test/unit/tc_document.rb +81 -0
  216. data/test/unit/tc_field_symbol.rb +26 -0
  217. data/test/unit/ts_analysis.rb +2 -0
  218. data/test/unit/ts_index.rb +2 -0
  219. data/test/unit/ts_largefile.rb +4 -0
  220. data/test/unit/ts_query_parser.rb +2 -0
  221. data/test/unit/ts_search.rb +2 -0
  222. data/test/unit/ts_store.rb +2 -0
  223. data/test/unit/ts_utils.rb +2 -0
  224. data/test/unit/utils/tc_bit_vector.rb +295 -0
  225. data/test/unit/utils/tc_number_tools.rb +117 -0
  226. data/test/unit/utils/tc_priority_queue.rb +106 -0
  227. data/test/utils/content_generator.rb +226 -0
  228. metadata +319 -0
data/ext/q_phrase.c ADDED
@@ -0,0 +1,1206 @@
1
+ #include <string.h>
2
+ #include <limits.h>
3
+ #include "search.h"
4
+ #include "array.h"
5
+ #include "symbol.h"
6
+ #include "internal.h"
7
+
8
+ #define PhQ(query) ((PhraseQuery *)(query))
9
+
10
+ /**
11
+ * Use to sort the phrase positions into positional order. For phrase
12
+ * positions matching at the same position (a very unusual case) we order by
13
+ * first terms. The only real reason for the sorting by first terms is to get
14
+ * consistant order of positions when testing. Functionally it makes no
15
+ * difference.
16
+ */
17
+ static int phrase_pos_cmp(const void *p1, const void *p2)
18
+ {
19
+ int pos1 = ((PhrasePosition *)p1)->pos;
20
+ int pos2 = ((PhrasePosition *)p2)->pos;
21
+ if (pos1 > pos2) {
22
+ return 1;
23
+ }
24
+ if (pos1 < pos2) {
25
+ return -1;
26
+ }
27
+ return strcmp(((PhrasePosition *)p1)->terms[0],
28
+ ((PhrasePosition *)p2)->terms[0]);
29
+ }
30
+
31
+
32
+ /***************************************************************************
33
+ *
34
+ * PhraseScorer
35
+ *
36
+ ***************************************************************************/
37
+
38
+ /***************************************************************************
39
+ * PhPos
40
+ ***************************************************************************/
41
+
42
+ #define PP(p) ((PhPos *)(p))
43
+ typedef struct PhPos /* per-phrase-slot cursor over one term-position stream */
44
+ {
45
+ TermDocEnum *tpe; /* underlying enumerator; set to NULL once it has been closed */
46
+ int offset; /* subtracted from each raw position read from tpe */
47
+ int count; /* positions still unread in the current document */
48
+ int doc; /* current document; -1 before first use, INT_MAX when exhausted */
49
+ int position; /* current position with offset already subtracted */
50
+ } PhPos;
51
+
52
+ static bool pp_next(PhPos *self)
53
+ {
54
+ TermDocEnum *tpe = self->tpe;
55
+ if (!tpe->next(tpe)) {
56
+ tpe->close(tpe); /* close stream */
57
+ self->tpe = NULL;
58
+ self->doc = INT_MAX; /* sentinel value */
59
+ return false;
60
+ }
61
+ self->doc = tpe->doc_num(tpe);
62
+ self->position = 0;
63
+ return true;
64
+ }
65
+
66
+ static bool pp_skip_to(PhPos *self, int doc_num)
67
+ {
68
+ TermDocEnum *tpe = self->tpe;
69
+ if (!tpe) {
70
+ return false;
71
+ }
72
+
73
+ if (!tpe->skip_to(tpe, doc_num)) {
74
+ tpe->close(tpe); /* close stream */
75
+ self->tpe = NULL;
76
+ self->doc = INT_MAX; /* sentinel value */
77
+ return false;
78
+ }
79
+ self->doc = tpe->doc_num(tpe);
80
+ self->position = 0;
81
+ return true;
82
+ }
83
+
84
+ static bool pp_next_position(PhPos *self)
85
+ {
86
+ TermDocEnum *tpe = self->tpe;
87
+ self->count--;
88
+ if (self->count >= 0) { /* read subsequent pos's */
89
+ self->position = tpe->next_position(tpe) - self->offset;
90
+ return true;
91
+ }
92
+ else {
93
+ return false;
94
+ }
95
+ }
96
+
97
+ static bool pp_first_position(PhPos *self)
98
+ {
99
+ TermDocEnum *tpe = self->tpe;
100
+ self->count = tpe->freq(tpe); /* read first pos */
101
+ return pp_next_position(self);
102
+ }
103
+
104
+ /*
105
+ static char *pp_to_s(PhPos *self)
106
+ {
107
+ return strfmt("pp->(doc => %d, position => %d)", self->doc, self->position);
108
+ }
109
+ */
110
+
111
+ #define PP_pp(p) (*(PhPos **)p)
112
+ static int pp_cmp(const void *const p1, const void *const p2)
113
+ {
114
+ int cmp = PP_pp(p1)->doc - PP_pp(p2)->doc;
115
+ if (cmp == 0) {
116
+ return PP_pp(p1)->position - PP_pp(p2)->position;
117
+ }
118
+ else {
119
+ return cmp;
120
+ }
121
+ }
122
+
123
+ static int pp_pos_cmp(const void *const p1, const void *const p2)
124
+ {
125
+ return PP_pp(p1)->position - PP_pp(p2)->position;
126
+ }
127
+
128
+ static bool pp_less_than(const PhPos *pp1, const PhPos *pp2)
129
+ {
130
+ if (pp1->position == pp2->position) {
131
+ return pp1->offset < pp2->offset;
132
+ }
133
+ else {
134
+ return pp1->position < pp2->position;
135
+ }
136
+ }
137
+
138
+ static void pp_destroy(PhPos *pp)
139
+ {
140
+ if (pp->tpe) {
141
+ pp->tpe->close(pp->tpe);
142
+ }
143
+ free(pp);
144
+ }
145
+
146
+ static PhPos *pp_new(TermDocEnum *tpe, int offset)
147
+ {
148
+ PhPos *self = ALLOC(PhPos);
149
+
150
+ self->tpe = tpe;
151
+ self->count = self->doc = self->position = -1;
152
+ self->offset = offset;
153
+
154
+ return self;
155
+ }
156
+
157
+ /***************************************************************************
158
+ * PhraseScorer
159
+ ***************************************************************************/
160
+
161
+ #define PhSc(scorer) ((PhraseScorer *)(scorer))
162
+
163
+ typedef struct PhraseScorer /* state shared by the exact and sloppy phrase scorers */
164
+ {
165
+ Scorer super; /* base scorer; must stay first so PhSc()/Scorer casts work */
166
+ float (*phrase_freq)(Scorer *self); /* strategy: exact or sloppy frequency of the phrase in the current doc */
167
+ float freq; /* result of the last phrase_freq() call */
168
+ uchar *norms; /* norm bytes, indexed by document number in phsc_score */
169
+ float value; /* copied from weight->value at construction */
170
+ Weight *weight; /* weight this scorer was created for */
171
+ PhPos **phrase_pos; /* pp_cnt phrase positions, used as a circular list */
172
+ int pp_first_idx; /* index in phrase_pos treated as the head of the circular list */
173
+ int pp_cnt; /* number of entries in phrase_pos */
174
+ int slop; /* maximum match_length accepted by the sloppy scorer; 0 for exact */
175
+ bool first_time : 1; /* true until phsc_next has run phsc_init */
176
+ bool more : 1; /* false once any stream has been exhausted */
177
+ bool check_repeats : 1; /* set when a term occurs in more than one position of a sloppy phrase */
178
+ } PhraseScorer;
179
+
180
+ static void phsc_init(PhraseScorer *phsc)
181
+ {
182
+ int i;
183
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
184
+ if (!(phsc->more = pp_next(phsc->phrase_pos[i]))) break;
185
+ }
186
+
187
+ if (phsc->more) {
188
+ qsort(phsc->phrase_pos, phsc->pp_cnt,
189
+ sizeof(PhPos *), &pp_cmp);
190
+ phsc->pp_first_idx = 0;
191
+ }
192
+ }
193
+
194
+ static bool phsc_do_next(Scorer *self)
195
+ {
196
+ PhraseScorer *phsc = PhSc(self);
197
+ const int pp_cnt = phsc->pp_cnt;
198
+ int pp_first_idx = phsc->pp_first_idx;
199
+ PhPos **phrase_positions = phsc->phrase_pos;
200
+
201
+ PhPos *first = phrase_positions[pp_first_idx];
202
+ PhPos *last = phrase_positions[PREV_NUM(pp_first_idx, pp_cnt)];
203
+
204
+ while (phsc->more) {
205
+ /* find doc with all the terms */
206
+ while (phsc->more && first->doc < last->doc) {
207
+ /* skip first upto last */
208
+ phsc->more = pp_skip_to(first, last->doc);
209
+ last = first;
210
+ pp_first_idx = NEXT_NUM(pp_first_idx, pp_cnt);
211
+ first = phrase_positions[pp_first_idx];
212
+ }
213
+
214
+ if (phsc->more) {
215
+ /* pp_first_idx will be used by phrase_freq */
216
+ phsc->pp_first_idx = pp_first_idx;
217
+
218
+ /* found a doc with all of the terms */
219
+ phsc->freq = phsc->phrase_freq(self);
220
+
221
+ if (phsc->freq == 0.0) { /* no match */
222
+ /* continuing search so re-set first and last */
223
+ pp_first_idx = phsc->pp_first_idx;
224
+ first = phrase_positions[pp_first_idx];
225
+ last = phrase_positions[PREV_NUM(pp_first_idx, pp_cnt)];
226
+ phsc->more = pp_next(last); /* trigger further scanning */
227
+ }
228
+ else {
229
+ self->doc = first->doc;
230
+ return true; /* found a match */
231
+ }
232
+
233
+ }
234
+ }
235
+ return false;
236
+ }
237
+
238
+ static float phsc_score(Scorer *self)
239
+ {
240
+ PhraseScorer *phsc = PhSc(self);
241
+ float raw_score = sim_tf(self->similarity, phsc->freq) * phsc->value;
242
+ /* normalize */
243
+ return raw_score * sim_decode_norm(
244
+ self->similarity,
245
+ phsc->norms[self->doc]);
246
+ }
247
+
248
+ static bool phsc_next(Scorer *self)
249
+ {
250
+ PhraseScorer *phsc = PhSc(self);
251
+ if (phsc->first_time) {
252
+ phsc_init(phsc);
253
+ phsc->first_time = false;
254
+ }
255
+ else if (phsc->more) {
256
+ /* trigger further scanning */
257
+ phsc->more = pp_next(
258
+ phsc->phrase_pos[PREV_NUM(phsc->pp_first_idx, phsc->pp_cnt)]);
259
+ }
260
+
261
+ return phsc_do_next(self);
262
+ }
263
+
264
+ static bool phsc_skip_to(Scorer *self, int doc_num)
265
+ {
266
+ PhraseScorer *phsc = PhSc(self);
267
+ int i;
268
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
269
+ if (!(phsc->more = pp_skip_to(phsc->phrase_pos[i], doc_num))) {
270
+ break;
271
+ }
272
+ }
273
+
274
+ if (phsc->more) {
275
+ qsort(phsc->phrase_pos, phsc->pp_cnt,
276
+ sizeof(PhPos *), &pp_cmp);
277
+ phsc->pp_first_idx = 0;
278
+ }
279
+ return phsc_do_next(self);
280
+ }
281
+
282
+ static Explanation *phsc_explain(Scorer *self, int doc_num)
283
+ {
284
+ PhraseScorer *phsc = PhSc(self);
285
+ float phrase_freq;
286
+
287
+ phsc_skip_to(self, doc_num);
288
+
289
+ phrase_freq = (self->doc == doc_num) ? phsc->freq : 0.0f;
290
+ return expl_new(sim_tf(self->similarity, phrase_freq),
291
+ "tf(phrase_freq=%f)", phrase_freq);
292
+ }
293
+
294
+ static void phsc_destroy(Scorer *self)
295
+ {
296
+ PhraseScorer *phsc = PhSc(self);
297
+ int i;
298
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
299
+ pp_destroy(phsc->phrase_pos[i]);
300
+ }
301
+ free(phsc->phrase_pos);
302
+ scorer_destroy_i(self);
303
+ }
304
+
305
+ static Scorer *phsc_new(Weight *weight,
306
+ TermDocEnum **term_pos_enum,
307
+ PhrasePosition *positions, int pos_cnt,
308
+ Similarity *similarity,
309
+ uchar *norms,
310
+ int slop)
311
+ {
312
+ int i;
313
+ Scorer *self = scorer_new(PhraseScorer, similarity);
314
+ HashSet *term_set = NULL;
315
+
316
+
317
+ PhSc(self)->weight = weight;
318
+ PhSc(self)->norms = norms;
319
+ PhSc(self)->value = weight->value;
320
+ PhSc(self)->phrase_pos = ALLOC_N(PhPos *, pos_cnt);
321
+ PhSc(self)->pp_first_idx = 0;
322
+ PhSc(self)->pp_cnt = pos_cnt;
323
+ PhSc(self)->slop = slop;
324
+ PhSc(self)->first_time = true;
325
+ PhSc(self)->more = true;
326
+ PhSc(self)->check_repeats = false;
327
+
328
+ if (slop) {
329
+ term_set = hs_new_str((free_ft)NULL);
330
+ }
331
+ for (i = 0; i < pos_cnt; i++) {
332
+ /* check for repeats */
333
+ if (slop && !PhSc(self)->check_repeats) {
334
+ char **terms = positions[i].terms;
335
+ const int t_cnt = ary_size(terms);
336
+ int j;
337
+ for (j = 0; j < t_cnt; j++) {
338
+ if (hs_add(term_set, terms[j])) {
339
+ PhSc(self)->check_repeats = true;
340
+ goto repeat_check_done;
341
+ }
342
+ }
343
+ }
344
+ repeat_check_done:
345
+ PhSc(self)->phrase_pos[i] = pp_new(term_pos_enum[i], positions[i].pos);
346
+ }
347
+
348
+ if (slop) {
349
+ hs_destroy(term_set);
350
+ }
351
+
352
+ self->score = &phsc_score;
353
+ self->next = &phsc_next;
354
+ self->skip_to = &phsc_skip_to;
355
+ self->explain = &phsc_explain;
356
+ self->destroy = &phsc_destroy;
357
+
358
+ return self;
359
+ }
360
+
361
+ /***************************************************************************
362
+ * ExactPhraseScorer
363
+ ***************************************************************************/
364
+
365
+ static float ephsc_phrase_freq(Scorer *self)
366
+ {
367
+ PhraseScorer *phsc = PhSc(self);
368
+ int i;
369
+ int pp_first_idx = 0;
370
+ const int pp_cnt = phsc->pp_cnt;
371
+ float freq = 0.0;
372
+ PhPos **phrase_positions = phsc->phrase_pos;
373
+ PhPos *first;
374
+ PhPos *last;
375
+
376
+ for (i = 0; i < pp_cnt; i++) {
377
+ pp_first_position(phrase_positions[i]);
378
+ }
379
+ qsort(phrase_positions, pp_cnt, sizeof(PhPos *), &pp_pos_cmp);
380
+
381
+ first = phrase_positions[0];
382
+ last = phrase_positions[pp_cnt - 1];
383
+
384
+ /* scan to position with all terms */
385
+ do {
386
+ /* scan forward in first */
387
+ while (first->position < last->position) {
388
+ do {
389
+ if (! pp_next_position(first)) {
390
+ /* maintain first position */
391
+ phsc->pp_first_idx = pp_first_idx;
392
+ return freq;
393
+ }
394
+ } while (first->position < last->position);
395
+ last = first;
396
+ pp_first_idx = NEXT_NUM(pp_first_idx, pp_cnt);
397
+ first = phrase_positions[pp_first_idx];
398
+ }
399
+ freq += 1.0; /* all equal: a match */
400
+ } while (pp_next_position(last));
401
+
402
+ /* maintain first position */
403
+ phsc->pp_first_idx = pp_first_idx;
404
+ return freq;
405
+ }
406
+
407
+ static Scorer *exact_phrase_scorer_new(Weight *weight,
408
+ TermDocEnum **term_pos_enum,
409
+ PhrasePosition *positions, int pp_cnt,
410
+ Similarity *similarity, uchar *norms)
411
+ {
412
+ Scorer *self = phsc_new(weight,
413
+ term_pos_enum,
414
+ positions,
415
+ pp_cnt,
416
+ similarity,
417
+ norms,
418
+ 0);
419
+
420
+ PhSc(self)->phrase_freq = &ephsc_phrase_freq;
421
+ return self;
422
+ }
423
+
424
+ /***************************************************************************
425
+ * SloppyPhraseScorer
426
+ ***************************************************************************/
427
+
428
+ static bool sphsc_check_repeats(PhPos *pp,
429
+ PhPos **positions,
430
+ const int p_cnt)
431
+ {
432
+ int j;
433
+ for (j = 0; j < p_cnt; j++) {
434
+ PhPos *ppj = positions[j];
435
+ /* If offsets are equal, either we are at the current PhPos +pp+ or
436
+ * +pp+ and +ppj+ are supposed to match in the same position in which
437
+ * case we don't need to check. */
438
+ if (ppj->offset == pp->offset) {
439
+ continue;
440
+ }
441
+ /* the two phrase positions are matching on the same term
442
+ * which we want to avoid */
443
+ if ((ppj->position + ppj->offset) == (pp->position + pp->offset)) {
444
+ if (!pp_next_position(pp)) {
445
+ /* We have no matches for this document */
446
+ return false;
447
+ }
448
+ /* we changed the position so we need to start check again */
449
+ j = -1;
450
+ }
451
+ }
452
+ return true;
453
+ }
454
+
455
+ static float sphsc_phrase_freq(Scorer *self)
456
+ {
457
+ PhraseScorer *phsc = PhSc(self);
458
+ PhPos *pp;
459
+ PriorityQueue *pq = pq_new(phsc->pp_cnt, (lt_ft)&pp_less_than, NULL);
460
+ const int pp_cnt = phsc->pp_cnt;
461
+
462
+ int last_pos = 0, pos, next_pos, start, match_length, i;
463
+ bool done = false;
464
+ bool check_repeats = phsc->check_repeats;
465
+ float freq = 0.0;
466
+
467
+ for (i = 0; i < pp_cnt; i++) {
468
+ bool res;
469
+ pp = phsc->phrase_pos[i];
470
+ /* we should always have at least one position or this functions
471
+ * shouldn't have been called. */
472
+ res = pp_first_position(pp);
473
+ assert(res);(void)res;
474
+ if (check_repeats && i > 0) {
475
+ if (!sphsc_check_repeats(pp, phsc->phrase_pos, i - 1)) {
476
+ goto return_freq;
477
+ }
478
+ }
479
+ if (pp->position > last_pos) {
480
+ last_pos = pp->position;
481
+ }
482
+ pq_push(pq, pp);
483
+ }
484
+
485
+ do {
486
+ pp = (PhPos *)pq_pop(pq);
487
+ pos = start = pp->position;
488
+ next_pos = PP(pq_top(pq))->position;
489
+ while (pos <= next_pos) {
490
+ start = pos; /* advance pp to min window */
491
+ if (!pp_next_position(pp)
492
+ || (check_repeats
493
+ && !sphsc_check_repeats(pp, phsc->phrase_pos, pp_cnt))) {
494
+ done = true;
495
+ break;
496
+ }
497
+ pos = pp->position;
498
+ }
499
+
500
+ match_length = last_pos - start;
501
+ if (match_length <= phsc->slop) {
502
+ /* score match */
503
+ freq += sim_sloppy_freq(self->similarity, match_length);
504
+ }
505
+
506
+ if (pp->position > last_pos) {
507
+ last_pos = pp->position;
508
+ }
509
+ pq_push(pq, pp); /* restore pq */
510
+ } while (!done);
511
+
512
+ return_freq:
513
+
514
+ pq_destroy(pq);
515
+ return freq;
516
+ }
517
+
518
+ static Scorer *sloppy_phrase_scorer_new(Weight *weight,
519
+ TermDocEnum **term_pos_enum,
520
+ PhrasePosition *positions,
521
+ int pp_cnt, Similarity *similarity,
522
+ int slop, uchar *norms)
523
+ {
524
+ Scorer *self = phsc_new(weight,
525
+ term_pos_enum,
526
+ positions,
527
+ pp_cnt,
528
+ similarity,
529
+ norms,
530
+ slop);
531
+
532
+ PhSc(self)->phrase_freq = &sphsc_phrase_freq;
533
+ return self;
534
+ }
535
+
536
+ /***************************************************************************
537
+ *
538
+ * PhraseWeight
539
+ *
540
+ ***************************************************************************/
541
+
542
+ static char *phw_to_s(Weight *self)
543
+ {
544
+ return strfmt("PhraseWeight(%f)", self->value);
545
+ }
546
+
547
+ static Scorer *phw_scorer(Weight *self, IndexReader *ir)
548
+ {
549
+ int i;
550
+ Scorer *phsc = NULL;
551
+ PhraseQuery *phq = PhQ(self->query);
552
+ TermDocEnum **tps, *tpe;
553
+ PhrasePosition *positions = phq->positions;
554
+ const int pos_cnt = phq->pos_cnt;
555
+ const int field_num = fis_get_field_num(ir->fis, phq->field);
556
+
557
+ if (pos_cnt == 0 || field_num < 0) {
558
+ return NULL;
559
+ }
560
+
561
+ tps = ALLOC_N(TermDocEnum *, pos_cnt);
562
+
563
+ for (i = 0; i < pos_cnt; i++) {
564
+ char **terms = positions[i].terms;
565
+ const int t_cnt = ary_size(terms);
566
+ if (t_cnt == 1) {
567
+ tpe = tps[i] = ir->term_positions(ir);
568
+ tpe->seek(tpe, field_num, terms[0]);
569
+ }
570
+ else {
571
+ tps[i] = mtdpe_new(ir, field_num, terms, t_cnt);
572
+ }
573
+ /* neither mtdpe_new nor ir->term_positions should return NULL */
574
+ assert(NULL != tps[i]);
575
+ }
576
+
577
+ if (phq->slop == 0) { /* optimize exact (common) case */
578
+ phsc = exact_phrase_scorer_new(self, tps, positions, pos_cnt,
579
+ self->similarity,
580
+ ir_get_norms_i(ir, field_num));
581
+ }
582
+ else {
583
+ phsc = sloppy_phrase_scorer_new(self, tps, positions, pos_cnt,
584
+ self->similarity, phq->slop,
585
+ ir_get_norms_i(ir, field_num));
586
+ }
587
+ free(tps);
588
+ return phsc;
589
+ }
590
+
591
+ static Explanation *phw_explain(Weight *self, IndexReader *ir, int doc_num)
592
+ {
593
+ Explanation *expl;
594
+ Explanation *idf_expl1;
595
+ Explanation *idf_expl2;
596
+ Explanation *query_expl;
597
+ Explanation *qnorm_expl;
598
+ Explanation *field_expl;
599
+ Explanation *tf_expl;
600
+ Scorer *scorer;
601
+ uchar *field_norms;
602
+ float field_norm;
603
+ Explanation *field_norm_expl;
604
+ char *query_str;
605
+ PhraseQuery *phq = PhQ(self->query);
606
+ const int pos_cnt = phq->pos_cnt;
607
+ PhrasePosition *positions = phq->positions;
608
+ int i, j;
609
+ char *doc_freqs = NULL;
610
+ size_t len = 0, pos = 0;
611
+ const int field_num = fis_get_field_num(ir->fis, phq->field);
612
+ const char *field = S(phq->field);
613
+
614
+ if (field_num < 0) {
615
+ return expl_new(0.0, "field \"%s\" does not exist in the index", field);
616
+ }
617
+
618
+ query_str = self->query->to_s(self->query, NULL);
619
+
620
+ expl = expl_new(0.0, "weight(%s in %d), product of:", query_str, doc_num);
621
+
622
+ /* ensure the phrase positions are in order for explanation */
623
+ qsort(positions, pos_cnt, sizeof(PhrasePosition), &phrase_pos_cmp);
624
+
625
+ for (i = 0; i < phq->pos_cnt; i++) {
626
+ char **terms = phq->positions[i].terms;
627
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
628
+ len += strlen(terms[j]) + 30;
629
+ }
630
+ }
631
+ doc_freqs = ALLOC_N(char, len);
632
+ for (i = 0; i < phq->pos_cnt; i++) {
633
+ char **terms = phq->positions[i].terms;
634
+ const int t_cnt = ary_size(terms);
635
+ for (j = 0; j < t_cnt; j++) {
636
+ char *term = terms[j];
637
+ pos += sprintf(doc_freqs + pos, "%s=%d, ",
638
+ term, ir->doc_freq(ir, field_num, term));
639
+ }
640
+ }
641
+ pos -= 2; /* remove ", " from the end */
642
+ doc_freqs[pos] = 0;
643
+
644
+ idf_expl1 = expl_new(self->idf, "idf(%s:<%s>)", field, doc_freqs);
645
+ idf_expl2 = expl_new(self->idf, "idf(%s:<%s>)", field, doc_freqs);
646
+ free(doc_freqs);
647
+
648
+ /* explain query weight */
649
+ query_expl = expl_new(0.0, "query_weight(%s), product of:", query_str);
650
+
651
+ if (self->query->boost != 1.0) {
652
+ expl_add_detail(query_expl, expl_new(self->query->boost, "boost"));
653
+ }
654
+ expl_add_detail(query_expl, idf_expl1);
655
+
656
+ qnorm_expl = expl_new(self->qnorm, "query_norm");
657
+ expl_add_detail(query_expl, qnorm_expl);
658
+
659
+ query_expl->value = self->query->boost * self->idf * self->qnorm;
660
+
661
+ expl_add_detail(expl, query_expl);
662
+
663
+ /* explain field weight */
664
+ field_expl = expl_new(0.0, "field_weight(%s in %d), product of:",
665
+ query_str, doc_num);
666
+ free(query_str);
667
+
668
+ scorer = self->scorer(self, ir);
669
+ tf_expl = scorer->explain(scorer, doc_num);
670
+ scorer->destroy(scorer);
671
+ expl_add_detail(field_expl, tf_expl);
672
+ expl_add_detail(field_expl, idf_expl2);
673
+
674
+ field_norms = ir->get_norms(ir, field_num);
675
+ field_norm = (field_norms != NULL)
676
+ ? sim_decode_norm(self->similarity, field_norms[doc_num])
677
+ : (float)0.0;
678
+ field_norm_expl = expl_new(field_norm, "field_norm(field=%s, doc=%d)",
679
+ field, doc_num);
680
+
681
+ expl_add_detail(field_expl, field_norm_expl);
682
+
683
+ field_expl->value = tf_expl->value * self->idf * field_norm;
684
+
685
+ /* combine them */
686
+ if (query_expl->value == 1.0) {
687
+ expl_destroy(expl);
688
+ return field_expl;
689
+ }
690
+ else {
691
+ expl->value = (query_expl->value * field_expl->value);
692
+ expl_add_detail(expl, field_expl);
693
+ return expl;
694
+ }
695
+ }
696
+
697
+ static Weight *phw_new(Query *query, Searcher *searcher)
698
+ {
699
+ Weight *self = w_new(Weight, query);
700
+
701
+ self->scorer = &phw_scorer;
702
+ self->explain = &phw_explain;
703
+ self->to_s = &phw_to_s;
704
+
705
+ self->similarity = query->get_similarity(query, searcher);
706
+ self->value = query->boost;
707
+ self->idf = sim_idf_phrase(self->similarity, PhQ(query)->field,
708
+ PhQ(query)->positions,
709
+ PhQ(query)->pos_cnt, searcher);
710
+ return self;
711
+ }
712
+
713
+ /***************************************************************************
714
+ *
715
+ * PhraseQuery
716
+ *
717
+ ***************************************************************************/
718
+
719
+ /* ** TVPosEnum ** */
720
+ typedef struct TVPosEnum /* cursor over a list of term-vector positions */
721
+ {
722
+ int index; /* index of the current entry in positions; -1 before the first tvpe_next */
723
+ int size; /* number of entries stored in positions */
724
+ int offset; /* subtracted from a stored position to produce pos */
725
+ int pos; /* current offset-adjusted position; -1 when not started or exhausted */
726
+ int positions[1]; /* trailing array; the allocation is extended to hold size entries */
727
+ } TVPosEnum;
728
+
729
+ static bool tvpe_next(TVPosEnum *self)
730
+ {
731
+ if (++(self->index) < self->size) {
732
+ self->pos = self->positions[self->index] - self->offset;
733
+ return true;
734
+ }
735
+ else {
736
+ self->pos = -1;
737
+ return false;
738
+ }
739
+ }
740
+
741
+ static int tvpe_skip_to(TVPosEnum *self, int position)
742
+ {
743
+ int i;
744
+ int search_pos = position + self->offset;
745
+ for (i = self->index + 1; i < self->size; i++) {
746
+ if (self->positions[i] >= search_pos) {
747
+ self->pos = self->positions[i] - self->offset;
748
+ break;
749
+ }
750
+ }
751
+ self->index = i;
752
+ if (i == self->size) {
753
+ self->pos = -1;
754
+ return false;
755
+ }
756
+ return true;
757
+ }
758
+
759
+ static bool tvpe_lt(TVPosEnum *tvpe1, TVPosEnum *tvpe2)
760
+ {
761
+ return tvpe1->pos < tvpe2->pos;
762
+ }
763
+
764
+ static TVPosEnum *tvpe_new(int *positions, int size, int offset)
765
+ {
766
+ TVPosEnum *self = (TVPosEnum*)emalloc(sizeof(TVPosEnum) + size*sizeof(int));
767
+ memcpy(self->positions, positions, size * sizeof(int));
768
+ self->size = size;
769
+ self->offset = offset;
770
+ self->index = -1;
771
+ self->pos = -1;
772
+ return self;
773
+ }
774
+
775
+ static TVPosEnum *tvpe_new_merge(char **terms, int t_cnt, TermVector *tv,
776
+ int offset)
777
+ {
778
+ int i, total_positions = 0;
779
+ PriorityQueue *tvpe_pq = pq_new(t_cnt, (lt_ft)tvpe_lt, &free);
780
+ TVPosEnum *self = NULL;
781
+
782
+ for (i = 0; i < t_cnt; i++) {
783
+ TVTerm *tv_term = tv_get_tv_term(tv, terms[i]);
784
+ if (tv_term) {
785
+ TVPosEnum *tvpe = tvpe_new(tv_term->positions, tv_term->freq, 0);
786
+ /* got tv_term so tvpe_next should always return true once here */
787
+ bool res = tvpe_next(tvpe);
788
+ assert(res);(void)res;
789
+ pq_push(tvpe_pq, tvpe);
790
+ total_positions += tv_term->freq;
791
+ }
792
+ }
793
+ if (tvpe_pq->size == 0) {
794
+ pq_destroy(tvpe_pq);
795
+ }
796
+ else {
797
+ int index = 0;
798
+ self = (TVPosEnum *)emalloc(sizeof(TVPosEnum)
799
+ + total_positions * sizeof(int));
800
+ self->size = total_positions;
801
+ self->offset = offset;
802
+ self->index = -1;
803
+ self->pos = -1;
804
+ while (tvpe_pq->size > 0) {
805
+ TVPosEnum *top = (TVPosEnum *)pq_top(tvpe_pq);
806
+ self->positions[index++] = top->pos;
807
+ if (! tvpe_next(top)) {
808
+ pq_pop(tvpe_pq);
809
+ free(top);
810
+ }
811
+ else {
812
+ pq_down(tvpe_pq);
813
+ }
814
+ }
815
+ pq_destroy(tvpe_pq);
816
+ }
817
+ return self;
818
+ }
819
+
820
+ static TVPosEnum *get_tvpe(TermVector *tv, char **terms, int t_cnt, int offset)
821
+ {
822
+ TVPosEnum *tvpe = NULL;
823
+ if (t_cnt == 1) {
824
+ TVTerm *tv_term = tv_get_tv_term(tv, terms[0]);
825
+ if (tv_term) {
826
+ tvpe = tvpe_new(tv_term->positions, tv_term->freq, offset);
827
+ }
828
+ }
829
+ else {
830
+ tvpe = tvpe_new_merge(terms, t_cnt, tv, offset);
831
+ }
832
+ return tvpe;
833
+ }
834
+
835
+ static MatchVector *phq_get_matchv_i(Query *self, MatchVector *mv,
836
+ TermVector *tv)
837
+ {
838
+ if (tv->field == PhQ(self)->field) {
839
+ const int pos_cnt = PhQ(self)->pos_cnt;
840
+ int i;
841
+ int slop = PhQ(self)->slop;
842
+ bool done = false;
843
+
844
+ if (slop > 0) {
845
+ PriorityQueue *tvpe_pq = pq_new(pos_cnt, (lt_ft)tvpe_lt, &free);
846
+ int last_pos = 0;
847
+ for (i = 0; i < pos_cnt; i++) {
848
+ PhrasePosition *pp = &(PhQ(self)->positions[i]);
849
+ const int t_cnt = ary_size(pp->terms);
850
+ TVPosEnum *tvpe = get_tvpe(tv, pp->terms, t_cnt, pp->pos);
851
+ if (tvpe && tvpe_next(tvpe)) {
852
+ if (tvpe->pos > last_pos) {
853
+ last_pos = tvpe->pos;
854
+ }
855
+ pq_push(tvpe_pq, tvpe);
856
+ }
857
+ else {
858
+ done = true;
859
+ free(tvpe);
860
+ break;
861
+ }
862
+ }
863
+ while (! done) {
864
+ TVPosEnum *tvpe = (TVPosEnum *)pq_pop(tvpe_pq);
865
+ int pos;
866
+ int start = pos = tvpe->pos;
867
+ int next_pos = ((TVPosEnum *)pq_top(tvpe_pq))->pos;
868
+ while (pos <= next_pos) {
869
+ start = pos;
870
+ if (!tvpe_next(tvpe)) {
871
+ done = true;
872
+ break;
873
+ }
874
+ pos = tvpe->pos;
875
+ }
876
+
877
+ if (last_pos - start <= slop) {
878
+ int min, max = min = start + tvpe->offset;
879
+ for (i = tvpe_pq->size; i > 0; i--) {
880
+ TVPosEnum *t = (TVPosEnum *)tvpe_pq->heap[i];
881
+ int p = t->pos + t->offset;
882
+ max = p > max ? p : max;
883
+ min = p < min ? p : min;
884
+ }
885
+ matchv_add(mv, min, max);
886
+ }
887
+ if (tvpe->pos > last_pos) {
888
+ last_pos = tvpe->pos;
889
+ }
890
+ pq_push(tvpe_pq, tvpe);
891
+ }
892
+
893
+ pq_destroy(tvpe_pq);
894
+ }
895
+ else { /* exact match */
896
+ TVPosEnum **tvpe_a = ALLOC_AND_ZERO_N(TVPosEnum *, pos_cnt);
897
+ TVPosEnum *first, *last;
898
+ int first_index = 0;
899
+ done = false;
900
+ qsort(PhQ(self)->positions, pos_cnt, sizeof(PhrasePosition),
901
+ &phrase_pos_cmp);
902
+ for (i = 0; i < pos_cnt; i++) {
903
+ PhrasePosition *pp = &(PhQ(self)->positions[i]);
904
+ const int t_cnt = ary_size(pp->terms);
905
+ TVPosEnum *tvpe = get_tvpe(tv, pp->terms, t_cnt, pp->pos);
906
+ if (tvpe && ((i == 0 && tvpe_next(tvpe))
907
+ || tvpe_skip_to(tvpe, tvpe_a[i-1]->pos))) {
908
+ tvpe_a[i] = tvpe;
909
+ }
910
+ else {
911
+ done = true;
912
+ free(tvpe);
913
+ break;
914
+ }
915
+ }
916
+
917
+ first = tvpe_a[0];
918
+ last = tvpe_a[pos_cnt - 1];
919
+
920
+ while (!done) {
921
+ while (first->pos < last->pos) {
922
+ if (tvpe_skip_to(first, last->pos)) {
923
+ last = first;
924
+ first_index = NEXT_NUM(first_index, pos_cnt);
925
+ first = tvpe_a[first_index];
926
+ }
927
+ else {
928
+ done = true;
929
+ break;
930
+ }
931
+ }
932
+ if (!done) {
933
+ matchv_add(mv, tvpe_a[0]->pos + tvpe_a[0]->offset,
934
+ tvpe_a[pos_cnt-1]->pos + tvpe_a[pos_cnt-1]->offset);
935
+ }
936
+ if (!tvpe_next(last)) {
937
+ done = true;
938
+ }
939
+ }
940
+ for (i = 0; i < pos_cnt; i++) {
941
+ free(tvpe_a[i]);
942
+ }
943
+ free(tvpe_a);
944
+ }
945
+ }
946
+ return mv;
947
+ }
948
+
949
+
950
+ /* ** PhraseQuery besides highlighting stuff ** */
951
+
952
/* number of PhrasePosition slots a new PhraseQuery starts out with */
#define PhQ_INIT_CAPA 4
953
+
954
+ static void phq_extract_terms(Query *self, HashSet *term_set)
955
+ {
956
+ PhraseQuery *phq = PhQ(self);
957
+ int i, j;
958
+ for (i = 0; i < phq->pos_cnt; i++) {
959
+ char **terms = phq->positions[i].terms;
960
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
961
+ hs_add(term_set, term_new(phq->field, terms[j]));
962
+ }
963
+ }
964
+ }
965
+
966
+ static char *phq_to_s(Query *self, Symbol default_field)
967
+ {
968
+ PhraseQuery *phq = PhQ(self);
969
+ const int pos_cnt = phq->pos_cnt;
970
+ PhrasePosition *positions = phq->positions;
971
+ const char *field = S(phq->field);
972
+ int flen = strlen(field);
973
+
974
+ int i, j, buf_index = 0, pos, last_pos;
975
+ size_t len = 0;
976
+ char *buffer;
977
+
978
+ if (phq->pos_cnt == 0) {
979
+ if (default_field != phq->field) {
980
+ return strfmt("%s:\"\"", field);
981
+ }
982
+ else {
983
+ return estrdup("\"\"");
984
+ }
985
+ }
986
+
987
+ /* sort the phrase positions by position */
988
+ qsort(positions, pos_cnt, sizeof(PhrasePosition), &phrase_pos_cmp);
989
+
990
+ len = flen + 1;
991
+
992
+ for (i = 0; i < pos_cnt; i++) {
993
+ char **terms = phq->positions[i].terms;
994
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
995
+ len += strlen(terms[j]) + 5;
996
+ }
997
+ }
998
+
999
+ /* add space for extra <> characters and boost and slop */
1000
+ len += 100 + 3
1001
+ * (phq->positions[phq->pos_cnt - 1].pos - phq->positions[0].pos);
1002
+
1003
+ buffer = ALLOC_N(char, len);
1004
+
1005
+ if (default_field != phq->field) {
1006
+ memcpy(buffer, field, flen);
1007
+ buffer[flen] = ':';
1008
+ buf_index += flen + 1;
1009
+ }
1010
+
1011
+ buffer[buf_index++] = '"';
1012
+
1013
+ last_pos = positions[0].pos - 1;
1014
+ for (i = 0; i < pos_cnt; i++) {
1015
+ char **terms = positions[i].terms;
1016
+ const int t_cnt = ary_size(terms);
1017
+
1018
+ pos = positions[i].pos;
1019
+ if (pos == last_pos) {
1020
+ buffer[buf_index - 1] = '&';
1021
+ }
1022
+ else {
1023
+ for (j = last_pos; j < pos - 1; j++) {
1024
+ memcpy(buffer + buf_index, "<> ", 3);
1025
+ buf_index += 3;
1026
+ }
1027
+ }
1028
+
1029
+ last_pos = pos;
1030
+ for (j = 0; j < t_cnt; j++) {
1031
+ char *term = terms[j];
1032
+ len = strlen(term);
1033
+ memcpy(buffer + buf_index, term, len);
1034
+ buf_index += len;
1035
+ buffer[buf_index++] = '|';
1036
+ }
1037
+ buffer[buf_index-1] = ' '; /* change last '|' to ' ' */
1038
+ }
1039
+
1040
+ if (buffer[buf_index-1] == ' ') {
1041
+ buf_index--;
1042
+ }
1043
+
1044
+ buffer[buf_index++] = '"';
1045
+ buffer[buf_index] = 0;
1046
+
1047
+ if (phq->slop != 0) {
1048
+ buf_index += sprintf(buffer + buf_index, "~%d", phq->slop);
1049
+ }
1050
+
1051
+ if (self->boost != 1.0) {
1052
+ buffer[buf_index++] = '^';
1053
+ dbl_to_s(buffer + buf_index, self->boost);
1054
+ }
1055
+
1056
+ return buffer;
1057
+ }
1058
+
1059
+ static void phq_destroy(Query *self)
1060
+ {
1061
+ PhraseQuery *phq = PhQ(self);
1062
+ int i;
1063
+ for (i = 0; i < phq->pos_cnt; i++) {
1064
+ ary_destroy(phq->positions[i].terms, &free);
1065
+ }
1066
+ free(phq->positions);
1067
+ q_destroy_i(self);
1068
+ }
1069
+
1070
+ static Query *phq_rewrite(Query *self, IndexReader *ir)
1071
+ {
1072
+ PhraseQuery *phq = PhQ(self);
1073
+ (void)ir;
1074
+ if (phq->pos_cnt == 1) {
1075
+ /* optimize one-position case */
1076
+ char **terms = phq->positions[0].terms;
1077
+ const int t_cnt = ary_size(terms);
1078
+ if (t_cnt == 1) {
1079
+ Query *tq = tq_new(phq->field, terms[0]);
1080
+ tq->boost = self->boost;
1081
+ return tq;
1082
+ }
1083
+ else {
1084
+ Query *q = multi_tq_new(phq->field);
1085
+ int i;
1086
+ for (i = 0; i < t_cnt; i++) {
1087
+ multi_tq_add_term(q, terms[i]);
1088
+ }
1089
+ q->boost = self->boost;
1090
+ return q;
1091
+ }
1092
+ } else {
1093
+ self->ref_cnt++;
1094
+ return self;
1095
+ }
1096
+ }
1097
+
1098
+ static unsigned long phq_hash(Query *self)
1099
+ {
1100
+ int i, j;
1101
+ PhraseQuery *phq = PhQ(self);
1102
+ unsigned long hash = sym_hash(phq->field);
1103
+ for (i = 0; i < phq->pos_cnt; i++) {
1104
+ char **terms = phq->positions[i].terms;
1105
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
1106
+ hash = (hash << 1) ^ (str_hash(terms[j])
1107
+ ^ phq->positions[i].pos);
1108
+ }
1109
+ }
1110
+ return (hash ^ phq->slop);
1111
+ }
1112
+
1113
+ static int phq_eq(Query *self, Query *o)
1114
+ {
1115
+ int i, j;
1116
+ PhraseQuery *phq1 = PhQ(self);
1117
+ PhraseQuery *phq2 = PhQ(o);
1118
+ if (phq1->slop != phq2->slop
1119
+ || phq1->field != phq2->field
1120
+ || phq1->pos_cnt != phq2->pos_cnt) {
1121
+ return false;
1122
+ }
1123
+ for (i = 0; i < phq1->pos_cnt; i++) {
1124
+ char **terms1 = phq1->positions[i].terms;
1125
+ char **terms2 = phq2->positions[i].terms;
1126
+ const int t_cnt = ary_size(terms1);
1127
+ if (t_cnt != ary_size(terms2)
1128
+ || phq1->positions[i].pos != phq2->positions[i].pos) {
1129
+ return false;
1130
+ }
1131
+ for (j = 0; j < t_cnt; j++) {
1132
+ if (strcmp(terms1[j], terms2[j]) != 0) {
1133
+ return false;
1134
+ }
1135
+ }
1136
+ }
1137
+ return true;
1138
+ }
1139
+
1140
+ Query *phq_new(Symbol field)
1141
+ {
1142
+ Query *self = q_new(PhraseQuery);
1143
+
1144
+ PhQ(self)->field = field;
1145
+ PhQ(self)->pos_cnt = 0;
1146
+ PhQ(self)->pos_capa = PhQ_INIT_CAPA;
1147
+ PhQ(self)->positions = ALLOC_N(PhrasePosition, PhQ_INIT_CAPA);
1148
+
1149
+ self->type = PHRASE_QUERY;
1150
+ self->rewrite = &phq_rewrite;
1151
+ self->extract_terms = &phq_extract_terms;
1152
+ self->to_s = &phq_to_s;
1153
+ self->hash = &phq_hash;
1154
+ self->eq = &phq_eq;
1155
+ self->destroy_i = &phq_destroy;
1156
+ self->create_weight_i = &phw_new;
1157
+ self->get_matchv_i = &phq_get_matchv_i;
1158
+ return self;
1159
+ }
1160
+
1161
+ void phq_add_term_abs(Query *self, const char *term, int position)
1162
+ {
1163
+ PhraseQuery *phq = PhQ(self);
1164
+ int index = phq->pos_cnt;
1165
+ PhrasePosition *pp;
1166
+ if (index >= phq->pos_capa) {
1167
+ phq->pos_capa <<= 1;
1168
+ REALLOC_N(phq->positions, PhrasePosition, phq->pos_capa);
1169
+ }
1170
+ pp = &(phq->positions[index]);
1171
+ pp->terms = ary_new_type_capa(char *, 2);
1172
+ ary_push(pp->terms, estrdup(term));
1173
+ pp->pos = position;
1174
+ phq->pos_cnt++;
1175
+ }
1176
+
1177
+ void phq_add_term(Query *self, const char *term, int pos_inc)
1178
+ {
1179
+ PhraseQuery *phq = PhQ(self);
1180
+ int position;
1181
+ if (phq->pos_cnt == 0) {
1182
+ position = 0;
1183
+ }
1184
+ else {
1185
+ position = phq->positions[phq->pos_cnt - 1].pos + pos_inc;
1186
+ }
1187
+ phq_add_term_abs(self, term, position);
1188
+ }
1189
+
1190
+ void phq_append_multi_term(Query *self, const char *term)
1191
+ {
1192
+ PhraseQuery *phq = PhQ(self);
1193
+ int index = phq->pos_cnt - 1;
1194
+
1195
+ if (index < 0) {
1196
+ phq_add_term(self, term, 0);
1197
+ }
1198
+ else {
1199
+ ary_push(phq->positions[index].terms, estrdup(term));
1200
+ }
1201
+ }
1202
+
1203
+ void frt_phq_set_slop(FrtQuery *self, int slop)
1204
+ {
1205
+ PhQ(self)->slop = slop;
1206
+ }