sdsykes-ferret 0.11.6.19

Sign up to get free protection for your applications and to get access to all the features.
Files changed (195) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +102 -0
  4. data/Rakefile +338 -0
  5. data/TODO +17 -0
  6. data/TUTORIAL +231 -0
  7. data/bin/ferret-browser +79 -0
  8. data/ext/analysis.c +1555 -0
  9. data/ext/analysis.h +219 -0
  10. data/ext/api.c +69 -0
  11. data/ext/api.h +27 -0
  12. data/ext/array.c +123 -0
  13. data/ext/array.h +53 -0
  14. data/ext/bitvector.c +540 -0
  15. data/ext/bitvector.h +272 -0
  16. data/ext/compound_io.c +383 -0
  17. data/ext/config.h +42 -0
  18. data/ext/document.c +156 -0
  19. data/ext/document.h +53 -0
  20. data/ext/except.c +120 -0
  21. data/ext/except.h +168 -0
  22. data/ext/extconf.rb +14 -0
  23. data/ext/ferret.c +402 -0
  24. data/ext/ferret.h +91 -0
  25. data/ext/filter.c +156 -0
  26. data/ext/fs_store.c +483 -0
  27. data/ext/global.c +418 -0
  28. data/ext/global.h +117 -0
  29. data/ext/hash.c +567 -0
  30. data/ext/hash.h +473 -0
  31. data/ext/hashset.c +170 -0
  32. data/ext/hashset.h +187 -0
  33. data/ext/header.h +58 -0
  34. data/ext/helper.c +62 -0
  35. data/ext/helper.h +13 -0
  36. data/ext/inc/lang.h +48 -0
  37. data/ext/inc/threading.h +31 -0
  38. data/ext/index.c +6425 -0
  39. data/ext/index.h +961 -0
  40. data/ext/lang.h +66 -0
  41. data/ext/libstemmer.c +92 -0
  42. data/ext/libstemmer.h +79 -0
  43. data/ext/mempool.c +87 -0
  44. data/ext/mempool.h +35 -0
  45. data/ext/modules.h +162 -0
  46. data/ext/multimapper.c +310 -0
  47. data/ext/multimapper.h +51 -0
  48. data/ext/posh.c +1006 -0
  49. data/ext/posh.h +1007 -0
  50. data/ext/priorityqueue.c +151 -0
  51. data/ext/priorityqueue.h +143 -0
  52. data/ext/q_boolean.c +1608 -0
  53. data/ext/q_const_score.c +161 -0
  54. data/ext/q_filtered_query.c +209 -0
  55. data/ext/q_fuzzy.c +268 -0
  56. data/ext/q_match_all.c +148 -0
  57. data/ext/q_multi_term.c +677 -0
  58. data/ext/q_parser.c +2825 -0
  59. data/ext/q_phrase.c +1126 -0
  60. data/ext/q_prefix.c +100 -0
  61. data/ext/q_range.c +350 -0
  62. data/ext/q_span.c +2402 -0
  63. data/ext/q_term.c +337 -0
  64. data/ext/q_wildcard.c +171 -0
  65. data/ext/r_analysis.c +2575 -0
  66. data/ext/r_index.c +3472 -0
  67. data/ext/r_qparser.c +585 -0
  68. data/ext/r_search.c +4105 -0
  69. data/ext/r_store.c +513 -0
  70. data/ext/r_utils.c +963 -0
  71. data/ext/ram_store.c +471 -0
  72. data/ext/search.c +1741 -0
  73. data/ext/search.h +885 -0
  74. data/ext/similarity.c +150 -0
  75. data/ext/similarity.h +82 -0
  76. data/ext/sort.c +983 -0
  77. data/ext/stem_ISO_8859_1_danish.c +338 -0
  78. data/ext/stem_ISO_8859_1_danish.h +16 -0
  79. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  80. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  81. data/ext/stem_ISO_8859_1_english.c +1156 -0
  82. data/ext/stem_ISO_8859_1_english.h +16 -0
  83. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  84. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  85. data/ext/stem_ISO_8859_1_french.c +1276 -0
  86. data/ext/stem_ISO_8859_1_french.h +16 -0
  87. data/ext/stem_ISO_8859_1_german.c +512 -0
  88. data/ext/stem_ISO_8859_1_german.h +16 -0
  89. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  90. data/ext/stem_ISO_8859_1_italian.h +16 -0
  91. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  92. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  93. data/ext/stem_ISO_8859_1_porter.c +776 -0
  94. data/ext/stem_ISO_8859_1_porter.h +16 -0
  95. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  96. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  97. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  98. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  99. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  100. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  101. data/ext/stem_KOI8_R_russian.c +701 -0
  102. data/ext/stem_KOI8_R_russian.h +16 -0
  103. data/ext/stem_UTF_8_danish.c +344 -0
  104. data/ext/stem_UTF_8_danish.h +16 -0
  105. data/ext/stem_UTF_8_dutch.c +653 -0
  106. data/ext/stem_UTF_8_dutch.h +16 -0
  107. data/ext/stem_UTF_8_english.c +1176 -0
  108. data/ext/stem_UTF_8_english.h +16 -0
  109. data/ext/stem_UTF_8_finnish.c +808 -0
  110. data/ext/stem_UTF_8_finnish.h +16 -0
  111. data/ext/stem_UTF_8_french.c +1296 -0
  112. data/ext/stem_UTF_8_french.h +16 -0
  113. data/ext/stem_UTF_8_german.c +526 -0
  114. data/ext/stem_UTF_8_german.h +16 -0
  115. data/ext/stem_UTF_8_italian.c +1113 -0
  116. data/ext/stem_UTF_8_italian.h +16 -0
  117. data/ext/stem_UTF_8_norwegian.c +302 -0
  118. data/ext/stem_UTF_8_norwegian.h +16 -0
  119. data/ext/stem_UTF_8_porter.c +794 -0
  120. data/ext/stem_UTF_8_porter.h +16 -0
  121. data/ext/stem_UTF_8_portuguese.c +1055 -0
  122. data/ext/stem_UTF_8_portuguese.h +16 -0
  123. data/ext/stem_UTF_8_russian.c +709 -0
  124. data/ext/stem_UTF_8_russian.h +16 -0
  125. data/ext/stem_UTF_8_spanish.c +1137 -0
  126. data/ext/stem_UTF_8_spanish.h +16 -0
  127. data/ext/stem_UTF_8_swedish.c +313 -0
  128. data/ext/stem_UTF_8_swedish.h +16 -0
  129. data/ext/stopwords.c +401 -0
  130. data/ext/store.c +692 -0
  131. data/ext/store.h +777 -0
  132. data/ext/term_vectors.c +352 -0
  133. data/ext/threading.h +31 -0
  134. data/ext/utilities.c +446 -0
  135. data/ext/win32.h +54 -0
  136. data/lib/ferret.rb +29 -0
  137. data/lib/ferret/browser.rb +246 -0
  138. data/lib/ferret/browser/s/global.js +192 -0
  139. data/lib/ferret/browser/s/style.css +148 -0
  140. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  141. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  142. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  143. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  144. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  145. data/lib/ferret/browser/views/layout.rhtml +22 -0
  146. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  147. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  148. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  149. data/lib/ferret/browser/webrick.rb +14 -0
  150. data/lib/ferret/document.rb +130 -0
  151. data/lib/ferret/field_infos.rb +44 -0
  152. data/lib/ferret/index.rb +786 -0
  153. data/lib/ferret/number_tools.rb +157 -0
  154. data/lib/ferret_version.rb +3 -0
  155. data/setup.rb +1555 -0
  156. data/test/test_all.rb +5 -0
  157. data/test/test_helper.rb +24 -0
  158. data/test/threading/number_to_spoken.rb +132 -0
  159. data/test/threading/thread_safety_index_test.rb +79 -0
  160. data/test/threading/thread_safety_read_write_test.rb +76 -0
  161. data/test/threading/thread_safety_test.rb +133 -0
  162. data/test/unit/analysis/tc_analyzer.rb +548 -0
  163. data/test/unit/analysis/tc_token_stream.rb +646 -0
  164. data/test/unit/index/tc_index.rb +762 -0
  165. data/test/unit/index/tc_index_reader.rb +699 -0
  166. data/test/unit/index/tc_index_writer.rb +437 -0
  167. data/test/unit/index/th_doc.rb +315 -0
  168. data/test/unit/largefile/tc_largefile.rb +46 -0
  169. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  170. data/test/unit/search/tc_filter.rb +135 -0
  171. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  172. data/test/unit/search/tc_index_searcher.rb +61 -0
  173. data/test/unit/search/tc_multi_searcher.rb +128 -0
  174. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  175. data/test/unit/search/tc_search_and_sort.rb +179 -0
  176. data/test/unit/search/tc_sort.rb +49 -0
  177. data/test/unit/search/tc_sort_field.rb +27 -0
  178. data/test/unit/search/tc_spans.rb +190 -0
  179. data/test/unit/search/tm_searcher.rb +384 -0
  180. data/test/unit/store/tc_fs_store.rb +77 -0
  181. data/test/unit/store/tc_ram_store.rb +35 -0
  182. data/test/unit/store/tm_store.rb +34 -0
  183. data/test/unit/store/tm_store_lock.rb +68 -0
  184. data/test/unit/tc_document.rb +81 -0
  185. data/test/unit/ts_analysis.rb +2 -0
  186. data/test/unit/ts_index.rb +2 -0
  187. data/test/unit/ts_largefile.rb +4 -0
  188. data/test/unit/ts_query_parser.rb +2 -0
  189. data/test/unit/ts_search.rb +2 -0
  190. data/test/unit/ts_store.rb +2 -0
  191. data/test/unit/ts_utils.rb +2 -0
  192. data/test/unit/utils/tc_bit_vector.rb +295 -0
  193. data/test/unit/utils/tc_number_tools.rb +117 -0
  194. data/test/unit/utils/tc_priority_queue.rb +106 -0
  195. metadata +285 -0
@@ -0,0 +1,1126 @@
1
+ #include <string.h>
2
+ #include <limits.h>
3
+ #include "search.h"
4
+ #include "array.h"
5
+
6
+ #define PhQ(query) ((PhraseQuery *)(query))
7
+
8
+ static int phrase_pos_cmp(const void *p1, const void *p2)
9
+ {
10
+ int pos1 = ((PhrasePosition *)p1)->pos;
11
+ int pos2 = ((PhrasePosition *)p2)->pos;
12
+ if (pos1 > pos2) {
13
+ return 1;
14
+ }
15
+ if (pos1 < pos2) {
16
+ return -1;
17
+ }
18
+ return strcmp(((PhrasePosition *)p1)->terms[0],
19
+ ((PhrasePosition *)p2)->terms[0]);
20
+ }
21
+
22
+
23
+ /***************************************************************************
24
+ *
25
+ * PhraseScorer
26
+ *
27
+ ***************************************************************************/
28
+
29
+ /***************************************************************************
30
+ * PhPos
31
+ ***************************************************************************/
32
+
33
+ #define PP(p) ((PhPos *)(p))
34
+ typedef struct PhPos
35
+ {
36
+ TermDocEnum *tpe;
37
+ int offset;
38
+ int count;
39
+ int doc;
40
+ int position;
41
+ } PhPos;
42
+
43
+ static bool pp_next(PhPos *self)
44
+ {
45
+ TermDocEnum *tpe = self->tpe;
46
+ if (!tpe->next(tpe)) {
47
+ tpe->close(tpe); /* close stream */
48
+ self->tpe = NULL;
49
+ self->doc = INT_MAX; /* sentinel value */
50
+ return false;
51
+ }
52
+ self->doc = tpe->doc_num(tpe);
53
+ self->position = 0;
54
+ return true;
55
+ }
56
+
57
+ static bool pp_skip_to(PhPos *self, int doc_num)
58
+ {
59
+ TermDocEnum *tpe = self->tpe;
60
+ if (!tpe->skip_to(tpe, doc_num)) {
61
+ tpe->close(tpe); /* close stream */
62
+ self->tpe = NULL;
63
+ self->doc = INT_MAX; /* sentinel value */
64
+ return false;
65
+ }
66
+ self->doc = tpe->doc_num(tpe);
67
+ self->position = 0;
68
+ return true;
69
+ }
70
+
71
+ static bool pp_next_position(PhPos *self)
72
+ {
73
+ TermDocEnum *tpe = self->tpe;
74
+ self->count--;
75
+ if (self->count >= 0) { /* read subsequent pos's */
76
+ self->position = tpe->next_position(tpe) - self->offset;
77
+ return true;
78
+ }
79
+ else {
80
+ return false;
81
+ }
82
+ }
83
+
84
+ static bool pp_first_position(PhPos *self)
85
+ {
86
+ TermDocEnum *tpe = self->tpe;
87
+ self->count = tpe->freq(tpe); /* read first pos */
88
+ return pp_next_position(self);
89
+ }
90
+
91
+ /*
92
+ static char *pp_to_s(PhPos *self)
93
+ {
94
+ return strfmt("pp->(doc => %d, position => %d)", self->doc, self->position);
95
+ }
96
+ */
97
+
98
+ #define PP_pp(p) (*(PhPos **)p)
99
+ static int pp_cmp(const void *const p1, const void *const p2)
100
+ {
101
+ int cmp = PP_pp(p1)->doc - PP_pp(p2)->doc;
102
+ if (cmp == 0) {
103
+ return PP_pp(p1)->position - PP_pp(p2)->position;
104
+ }
105
+ else {
106
+ return cmp;
107
+ }
108
+ }
109
+
110
+ static int pp_pos_cmp(const void *const p1, const void *const p2)
111
+ {
112
+ return PP_pp(p1)->position - PP_pp(p2)->position;
113
+ }
114
+
115
+ static bool pp_less_than(const PhPos *pp1, const PhPos *pp2)
116
+ {
117
+ /* docs will all be equal when this method is used */
118
+ return pp1->position < pp2->position;
119
+ /*
120
+ if (PP(p)->doc == PP(p)->doc) {
121
+ return PP(p)->position < PP(p)->position;
122
+ }
123
+ else {
124
+ return PP(p)->doc < PP(p)->doc;
125
+ }
126
+ */
127
+ }
128
+
129
+ void pp_destroy(PhPos *pp)
130
+ {
131
+ if (pp->tpe) {
132
+ pp->tpe->close(pp->tpe);
133
+ }
134
+ free(pp);
135
+ }
136
+
137
+ PhPos *pp_new(TermDocEnum *tpe, int offset)
138
+ {
139
+ PhPos *self = ALLOC(PhPos);
140
+
141
+ self->tpe = tpe;
142
+ self->count = self->doc = self->position = -1;
143
+ self->offset = offset;
144
+
145
+ return self;
146
+ }
147
+
148
+ /***************************************************************************
149
+ * PhraseScorer
150
+ ***************************************************************************/
151
+
152
+ #define PhSc(scorer) ((PhraseScorer *)(scorer))
153
+
154
+ typedef struct PhraseScorer
155
+ {
156
+ Scorer super;
157
+ float (*phrase_freq)(Scorer *self);
158
+ float freq;
159
+ uchar *norms;
160
+ float value;
161
+ Weight *weight;
162
+ PhPos **phrase_pos;
163
+ int pp_first_idx;
164
+ int pp_cnt;
165
+ int slop;
166
+ bool first_time : 1;
167
+ bool more : 1;
168
+ } PhraseScorer;
169
+
170
+ static void phsc_init(PhraseScorer *phsc)
171
+ {
172
+ int i;
173
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
174
+ if (!(phsc->more = pp_next(phsc->phrase_pos[i]))) break;
175
+ }
176
+
177
+ if (phsc->more) {
178
+ qsort(phsc->phrase_pos, phsc->pp_cnt,
179
+ sizeof(PhPos *), &pp_cmp);
180
+ phsc->pp_first_idx = 0;
181
+ }
182
+ }
183
+
184
+ static bool phsc_do_next(Scorer *self)
185
+ {
186
+ PhraseScorer *phsc = PhSc(self);
187
+ const int pp_cnt = phsc->pp_cnt;
188
+ int pp_first_idx = phsc->pp_first_idx;
189
+ PhPos **phrase_positions = phsc->phrase_pos;
190
+
191
+ PhPos *first = phrase_positions[pp_first_idx];
192
+ PhPos *last = phrase_positions[PREV_NUM(pp_first_idx, pp_cnt)];
193
+
194
+ while (phsc->more) {
195
+ /* find doc with all the terms */
196
+ while (phsc->more && first->doc < last->doc) {
197
+ /* skip first upto last */
198
+ phsc->more = pp_skip_to(first, last->doc);
199
+ last = first;
200
+ pp_first_idx = NEXT_NUM(pp_first_idx, pp_cnt);
201
+ first = phrase_positions[pp_first_idx];
202
+ }
203
+
204
+ if (phsc->more) {
205
+ /* pp_first_idx will be used by phrase_freq */
206
+ phsc->pp_first_idx = pp_first_idx;
207
+
208
+ /* found a doc with all of the terms */
209
+ phsc->freq = phsc->phrase_freq(self);
210
+
211
+ if (phsc->freq == 0.0) { /* no match */
212
+ /* continuing search so re-set first and last */
213
+ pp_first_idx = phsc->pp_first_idx;
214
+ first = phrase_positions[pp_first_idx];
215
+ last = phrase_positions[PREV_NUM(pp_first_idx, pp_cnt)];
216
+ phsc->more = pp_next(last); /* trigger further scanning */
217
+ }
218
+ else {
219
+ self->doc = first->doc;
220
+ return true; /* found a match */
221
+ }
222
+
223
+ }
224
+ }
225
+ return false;
226
+ }
227
+
228
+ static float phsc_score(Scorer *self)
229
+ {
230
+ PhraseScorer *phsc = PhSc(self);
231
+ float raw_score = sim_tf(self->similarity, phsc->freq) * phsc->value;
232
+ /* normalize */
233
+ return raw_score * sim_decode_norm(
234
+ self->similarity,
235
+ phsc->norms[phsc->phrase_pos[phsc->pp_first_idx]->doc]);
236
+ }
237
+
238
+ static bool phsc_next(Scorer *self)
239
+ {
240
+ PhraseScorer *phsc = PhSc(self);
241
+ if (phsc->first_time) {
242
+ phsc_init(phsc);
243
+ phsc->first_time = false;
244
+ }
245
+ else if (phsc->more) {
246
+ /* trigger further scanning */
247
+ phsc->more = pp_next(
248
+ phsc->phrase_pos[PREV_NUM(phsc->pp_first_idx, phsc->pp_cnt)]);
249
+ }
250
+
251
+ return phsc_do_next(self);
252
+ }
253
+
254
+ static bool phsc_skip_to(Scorer *self, int doc_num)
255
+ {
256
+ PhraseScorer *phsc = PhSc(self);
257
+ int i;
258
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
259
+ if (!(phsc->more = pp_skip_to(phsc->phrase_pos[i], doc_num))) {
260
+ break;
261
+ }
262
+ }
263
+
264
+ if (phsc->more) {
265
+ qsort(phsc->phrase_pos, phsc->pp_cnt,
266
+ sizeof(PhPos *), &pp_cmp);
267
+ phsc->pp_first_idx = 0;
268
+ }
269
+ return phsc_do_next(self);
270
+ }
271
+
272
+ static Explanation *phsc_explain(Scorer *self, int doc_num)
273
+ {
274
+ PhraseScorer *phsc = PhSc(self);
275
+ float phrase_freq;
276
+
277
+ phsc_skip_to(self, doc_num);
278
+
279
+ phrase_freq = (self->doc == doc_num) ? phsc->freq : (float)0.0;
280
+ return expl_new(sim_tf(self->similarity, phrase_freq),
281
+ "tf(phrase_freq=%f)", phrase_freq);
282
+ }
283
+
284
+ static void phsc_destroy(Scorer *self)
285
+ {
286
+ PhraseScorer *phsc = PhSc(self);
287
+ int i;
288
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
289
+ pp_destroy(phsc->phrase_pos[i]);
290
+ }
291
+ free(phsc->phrase_pos);
292
+ scorer_destroy_i(self);
293
+ }
294
+
295
+ static Scorer *phsc_new(Weight *weight, TermDocEnum **term_pos_enum,
296
+ PhrasePosition *positions, int pos_cnt,
297
+ Similarity *similarity, uchar *norms)
298
+ {
299
+ int i;
300
+ Scorer *self = scorer_new(PhraseScorer, similarity);
301
+
302
+ PhSc(self)->weight = weight;
303
+ PhSc(self)->norms = norms;
304
+ PhSc(self)->value = weight->value;
305
+ PhSc(self)->phrase_pos = ALLOC_N(PhPos *, pos_cnt);
306
+ PhSc(self)->pp_first_idx = 0;
307
+ PhSc(self)->pp_cnt = pos_cnt;
308
+ PhSc(self)->slop = 0;
309
+ PhSc(self)->first_time = true;
310
+ PhSc(self)->more = true;
311
+
312
+ for (i = 0; i < pos_cnt; i++) {
313
+ PhSc(self)->phrase_pos[i] = pp_new(term_pos_enum[i], positions[i].pos);
314
+ }
315
+
316
+ self->score = &phsc_score;
317
+ self->next = &phsc_next;
318
+ self->skip_to = &phsc_skip_to;
319
+ self->explain = &phsc_explain;
320
+ self->destroy = &phsc_destroy;
321
+
322
+ return self;
323
+ }
324
+
325
+ /***************************************************************************
326
+ * ExactPhraseScorer
327
+ ***************************************************************************/
328
+
329
+ static float ephsc_phrase_freq(Scorer *self)
330
+ {
331
+ PhraseScorer *phsc = PhSc(self);
332
+ int i;
333
+ int pp_first_idx = 0;
334
+ const int pp_cnt = phsc->pp_cnt;
335
+ float freq = 0.0;
336
+ PhPos **phrase_positions = phsc->phrase_pos;
337
+ PhPos *first;
338
+ PhPos *last;
339
+
340
+ for (i = 0; i < pp_cnt; i++) {
341
+ pp_first_position(phrase_positions[i]);
342
+ }
343
+ qsort(phrase_positions, pp_cnt, sizeof(PhPos *), &pp_pos_cmp);
344
+
345
+ first = phrase_positions[0];
346
+ last = phrase_positions[pp_cnt - 1];
347
+
348
+ /* scan to position with all terms */
349
+ do {
350
+ /* scan forward in first */
351
+ while (first->position < last->position) {
352
+ do {
353
+ if (! pp_next_position(first)) {
354
+ /* maintain first position */
355
+ phsc->pp_first_idx = pp_first_idx;
356
+ return freq;
357
+ }
358
+ } while (first->position < last->position);
359
+ last = first;
360
+ pp_first_idx = NEXT_NUM(pp_first_idx, pp_cnt);
361
+ first = phrase_positions[pp_first_idx];
362
+ }
363
+ freq += 1.0; /* all equal: a match */
364
+ } while (pp_next_position(last));
365
+
366
+ /* maintain first position */
367
+ phsc->pp_first_idx = pp_first_idx;
368
+ return freq;
369
+ }
370
+
371
+ static Scorer *exact_phrase_scorer_new(Weight *weight,
372
+ TermDocEnum **term_pos_enum,
373
+ PhrasePosition *positions, int pp_cnt,
374
+ Similarity *similarity, uchar *norms)
375
+ {
376
+ Scorer *self =
377
+ phsc_new(weight, term_pos_enum, positions, pp_cnt, similarity, norms);
378
+
379
+ PhSc(self)->phrase_freq = &ephsc_phrase_freq;
380
+ return self;
381
+ }
382
+
383
+ /***************************************************************************
384
+ * SloppyPhraseScorer
385
+ ***************************************************************************/
386
+
387
+ static float sphsc_phrase_freq(Scorer *self)
388
+ {
389
+ PhraseScorer *phsc = PhSc(self);
390
+ PhPos *pp;
391
+ PriorityQueue *pq = pq_new(phsc->pp_cnt, (lt_ft)&pp_less_than, NULL);
392
+ const int pp_cnt = phsc->pp_cnt;
393
+
394
+ int last_pos = 0, pos, next_pos, start, match_length, i;
395
+ bool done = false;
396
+ float freq = 0.0;
397
+
398
+ for (i = 0; i < pp_cnt; i++) {
399
+ pp = phsc->phrase_pos[i];
400
+ pp_first_position(pp);
401
+ if (pp->position > last_pos) {
402
+ last_pos = pp->position;
403
+ }
404
+ pq_push(pq, pp);
405
+ }
406
+
407
+ do {
408
+ pp = pq_pop(pq);
409
+ pos = start = pp->position;
410
+ next_pos = PP(pq_top(pq))->position;
411
+ while (pos <= next_pos) {
412
+ start = pos; /* advance pp to min window */
413
+ if (!pp_next_position(pp)) {
414
+ done = true; /* ran out of a positions for a term - done */
415
+ break;
416
+ }
417
+ pos = pp->position;
418
+ }
419
+
420
+ match_length = last_pos - start;
421
+ if (match_length <= phsc->slop) {
422
+ /* score match */
423
+ freq += sim_sloppy_freq(self->similarity, match_length);
424
+ }
425
+
426
+ if (pp->position > last_pos) {
427
+ last_pos = pp->position;
428
+ }
429
+ pq_push(pq, pp); /* restore pq */
430
+ } while (!done);
431
+
432
+ pq_destroy(pq);
433
+ return freq;
434
+ }
435
+
436
+ static Scorer *sloppy_phrase_scorer_new(Weight *weight,
437
+ TermDocEnum **term_pos_enum,
438
+ PhrasePosition *positions,
439
+ int pp_cnt, Similarity *similarity,
440
+ int slop, uchar *norms)
441
+ {
442
+ Scorer *self =
443
+ phsc_new(weight, term_pos_enum, positions, pp_cnt, similarity, norms);
444
+
445
+ PhSc(self)->slop = slop;
446
+ PhSc(self)->phrase_freq = &sphsc_phrase_freq;
447
+ return self;
448
+ }
449
+
450
+ /***************************************************************************
451
+ *
452
+ * PhraseWeight
453
+ *
454
+ ***************************************************************************/
455
+
456
+ static char *phw_to_s(Weight *self)
457
+ {
458
+ return strfmt("PhraseWeight(%f)", self->value);
459
+ }
460
+
461
+ static Scorer *phw_scorer(Weight *self, IndexReader *ir)
462
+ {
463
+ int i;
464
+ Scorer *phsc = NULL;
465
+ PhraseQuery *phq = PhQ(self->query);
466
+ TermDocEnum **tps, *tpe;
467
+ PhrasePosition *positions = phq->positions;
468
+ const int pos_cnt = phq->pos_cnt;
469
+ const int field_num = fis_get_field_num(ir->fis, phq->field);
470
+
471
+ if (pos_cnt == 0 || field_num < 0) {
472
+ return NULL;
473
+ }
474
+
475
+ tps = ALLOC_N(TermDocEnum *, pos_cnt);
476
+
477
+ for (i = 0; i < pos_cnt; i++) {
478
+ char **terms = positions[i].terms;
479
+ const int t_cnt = ary_size(terms);
480
+ if (t_cnt == 1) {
481
+ tpe = tps[i] = ir->term_positions(ir);
482
+ tpe->seek(tpe, field_num, terms[0]);
483
+ }
484
+ else {
485
+ tps[i] = mtdpe_new(ir, field_num, terms, t_cnt);
486
+ }
487
+ if (tps[i] == NULL) {
488
+ /* free everything we just created and return NULL */
489
+ int j;
490
+ for (j = 0; j < i; j++) {
491
+ tps[i]->close(tps[i]);
492
+ }
493
+ free(tps);
494
+ return NULL;
495
+ }
496
+ }
497
+
498
+ if (phq->slop == 0) { /* optimize exact (common) case */
499
+ phsc = exact_phrase_scorer_new(self, tps, positions, pos_cnt,
500
+ self->similarity,
501
+ ir_get_norms_i(ir, field_num));
502
+ }
503
+ else {
504
+ phsc = sloppy_phrase_scorer_new(self, tps, positions, pos_cnt,
505
+ self->similarity, phq->slop,
506
+ ir_get_norms_i(ir, field_num));
507
+ }
508
+ free(tps);
509
+ return phsc;
510
+ }
511
+
512
+ Explanation *phw_explain(Weight *self, IndexReader *ir, int doc_num)
513
+ {
514
+ Explanation *expl;
515
+ Explanation *idf_expl1;
516
+ Explanation *idf_expl2;
517
+ Explanation *query_expl;
518
+ Explanation *qnorm_expl;
519
+ Explanation *field_expl;
520
+ Explanation *tf_expl;
521
+ Scorer *scorer;
522
+ uchar *field_norms;
523
+ float field_norm;
524
+ Explanation *field_norm_expl;
525
+ char *query_str;
526
+ PhraseQuery *phq = PhQ(self->query);
527
+ const int pos_cnt = phq->pos_cnt;
528
+ PhrasePosition *positions = phq->positions;
529
+ int i, j;
530
+ char *doc_freqs = NULL;
531
+ size_t len = 0, pos = 0;
532
+ const int field_num = fis_get_field_num(ir->fis, phq->field);
533
+
534
+ if (field_num < 0) {
535
+ return expl_new(0.0, "field \"%s\" does not exist in the index", phq->field);
536
+ }
537
+
538
+ query_str = self->query->to_s(self->query, "");
539
+
540
+ expl = expl_new(0.0, "weight(%s in %d), product of:", query_str, doc_num);
541
+
542
+ /* ensure the phrase positions are in order for explanation */
543
+ qsort(positions, pos_cnt, sizeof(PhrasePosition), &phrase_pos_cmp);
544
+
545
+ for (i = 0; i < phq->pos_cnt; i++) {
546
+ char **terms = phq->positions[i].terms;
547
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
548
+ len += strlen(terms[j]) + 30;
549
+ }
550
+ }
551
+ doc_freqs = ALLOC_N(char, len);
552
+ for (i = 0; i < phq->pos_cnt; i++) {
553
+ char **terms = phq->positions[i].terms;
554
+ const int t_cnt = ary_size(terms);
555
+ for (j = 0; j < t_cnt; j++) {
556
+ char *term = terms[j];
557
+ sprintf(doc_freqs + pos, "%s=%d, ",
558
+ term, ir->doc_freq(ir, field_num, term));
559
+ pos += strlen(doc_freqs + pos);
560
+ }
561
+ }
562
+ pos -= 2; /* remove ", " from the end */
563
+ doc_freqs[pos] = 0;
564
+
565
+ idf_expl1 = expl_new(self->idf, "idf(%s:<%s>)", phq->field, doc_freqs);
566
+ idf_expl2 = expl_new(self->idf, "idf(%s:<%s>)", phq->field, doc_freqs);
567
+ free(doc_freqs);
568
+
569
+ /* explain query weight */
570
+ query_expl = expl_new(0.0, "query_weight(%s), product of:", query_str);
571
+
572
+ if (self->query->boost != 1.0) {
573
+ expl_add_detail(query_expl, expl_new(self->query->boost, "boost"));
574
+ }
575
+ expl_add_detail(query_expl, idf_expl1);
576
+
577
+ qnorm_expl = expl_new(self->qnorm, "query_norm");
578
+ expl_add_detail(query_expl, qnorm_expl);
579
+
580
+ query_expl->value = self->query->boost * self->idf * self->qnorm;
581
+
582
+ expl_add_detail(expl, query_expl);
583
+
584
+ /* explain field weight */
585
+ field_expl = expl_new(0.0, "field_weight(%s in %d), product of:",
586
+ query_str, doc_num);
587
+ free(query_str);
588
+
589
+ scorer = self->scorer(self, ir);
590
+ tf_expl = scorer->explain(scorer, doc_num);
591
+ scorer->destroy(scorer);
592
+ expl_add_detail(field_expl, tf_expl);
593
+ expl_add_detail(field_expl, idf_expl2);
594
+
595
+ field_norms = ir->get_norms(ir, field_num);
596
+ field_norm = (field_norms != NULL)
597
+ ? sim_decode_norm(self->similarity, field_norms[doc_num])
598
+ : (float)0.0;
599
+ field_norm_expl = expl_new(field_norm, "field_norm(field=%s, doc=%d)",
600
+ phq->field, doc_num);
601
+
602
+ expl_add_detail(field_expl, field_norm_expl);
603
+
604
+ field_expl->value = tf_expl->value * self->idf * field_norm;
605
+
606
+ /* combine them */
607
+ if (query_expl->value == 1.0) {
608
+ expl_destroy(expl);
609
+ return field_expl;
610
+ }
611
+ else {
612
+ expl->value = (query_expl->value * field_expl->value);
613
+ expl_add_detail(expl, field_expl);
614
+ return expl;
615
+ }
616
+ }
617
+
618
+ static Weight *phw_new(Query *query, Searcher *searcher)
619
+ {
620
+ Weight *self = w_new(Weight, query);
621
+
622
+ self->scorer = &phw_scorer;
623
+ self->explain = &phw_explain;
624
+ self->to_s = &phw_to_s;
625
+
626
+ self->similarity = query->get_similarity(query, searcher);
627
+ self->value = query->boost;
628
+ self->idf = sim_idf_phrase(self->similarity, PhQ(query)->field,
629
+ PhQ(query)->positions,
630
+ PhQ(query)->pos_cnt, searcher);
631
+ return self;
632
+ }
633
+
634
+ /***************************************************************************
635
+ *
636
+ * PhraseQuery
637
+ *
638
+ ***************************************************************************/
639
+
640
+ /* ** TVPosEnum ** */
641
/*
 * Forward-only cursor over an array of term-vector positions.
 *
 * `index` is the slot the cursor is on (-1 before the first call, `size`
 * once exhausted).  `pos` is positions[index] - offset, or -1 when the
 * cursor is not on a valid slot.
 */
typedef struct TVPosEnum
{
    int index;
    int size;
    int offset;
    int pos;
    int positions[];
} TVPosEnum;

/*
 * Step to the next position.  Returns true and updates `pos` on success.
 * Once exhausted it returns false, sets `pos` to -1 and pins `index` at
 * `size`, so repeated calls cannot keep incrementing the index.
 */
static bool tvpe_next(TVPosEnum *self)
{
    if (self->index + 1 < self->size) {
        self->index++;
        self->pos = self->positions[self->index] - self->offset;
        return true;
    }
    self->index = self->size;
    self->pos = -1;
    return false;
}

/*
 * Advance to the first slot after the current one whose offset-adjusted
 * position is >= `position`.  Returns non-zero and updates `pos` when such
 * a slot exists; otherwise returns 0, sets `pos` to -1 and pins `index` at
 * `size`.
 *
 * The exhaustion test is `>=` rather than `==`: when called on an already
 * exhausted cursor the scan starts beyond `size`, and an equality test
 * would wrongly report success.
 */
static int tvpe_skip_to(TVPosEnum *self, int position)
{
    int i;
    int search_pos = position + self->offset;
    for (i = self->index + 1; i < self->size; i++) {
        if (self->positions[i] >= search_pos) {
            self->pos = self->positions[i] - self->offset;
            break;
        }
    }
    if (i >= self->size) {
        self->index = self->size;
        self->pos = -1;
        return false;
    }
    self->index = i;
    return true;
}

/*
 * Priority-queue ordering predicate: true when tvpe1's current position is
 * smaller than tvpe2's.
 */
static bool tvpe_lt(TVPosEnum *tvpe1, TVPosEnum *tvpe2)
{
    return tvpe1->pos < tvpe2->pos;
}
684
+
685
+ static TVPosEnum *tvpe_new(int *positions, int size, int offset)
686
+ {
687
+ TVPosEnum *self = (TVPosEnum *)emalloc(sizeof(TVPosEnum)
688
+ + size * sizeof(int));
689
+ memcpy(self->positions, positions, size * sizeof(int));
690
+ self->size = size;
691
+ self->offset = offset;
692
+ self->index = -1;
693
+ self->pos = -1;
694
+ return self;
695
+ }
696
+
697
+ static TVPosEnum *tvpe_new_merge(char **terms, int t_cnt, TermVector *tv,
698
+ int offset)
699
+ {
700
+ int i, total_positions = 0;
701
+ PriorityQueue *tvpe_pq = pq_new(t_cnt, (lt_ft)tvpe_lt, &free);
702
+ TVPosEnum *self = NULL;
703
+
704
+ for (i = 0; i < t_cnt; i++) {
705
+ TVTerm *tv_term = tv_get_tv_term(tv, terms[i]);
706
+ if (tv_term) {
707
+ TVPosEnum *tvpe = tvpe_new(tv_term->positions, tv_term->freq, 0);
708
+ if (tvpe_next(tvpe)) {
709
+ pq_push(tvpe_pq, tvpe);
710
+ total_positions += tv_term->freq;
711
+ }
712
+ else {
713
+ free(tvpe);
714
+ }
715
+ }
716
+ }
717
+ if (tvpe_pq->size == 0) {
718
+ pq_destroy(tvpe_pq);
719
+ }
720
+ else {
721
+ int index = 0;
722
+ self = (TVPosEnum *)emalloc(sizeof(TVPosEnum)
723
+ + total_positions * sizeof(int));
724
+ self->size = total_positions;
725
+ self->offset = offset;
726
+ self->index = -1;
727
+ self->pos = -1;
728
+ while (tvpe_pq->size > 0) {
729
+ TVPosEnum *top = (TVPosEnum *)pq_top(tvpe_pq);
730
+ self->positions[index++] = top->pos;
731
+ if (! tvpe_next(top)) {
732
+ pq_pop(tvpe_pq);
733
+ free(top);
734
+ }
735
+ else {
736
+ pq_down(tvpe_pq);
737
+ }
738
+ }
739
+ pq_destroy(tvpe_pq);
740
+ }
741
+ return self;
742
+ }
743
+
744
+ static TVPosEnum *get_tvpe(TermVector *tv, char **terms, int t_cnt, int offset)
745
+ {
746
+ TVPosEnum *tvpe = NULL;
747
+ if (t_cnt == 1) {
748
+ TVTerm *tv_term = tv_get_tv_term(tv, terms[0]);
749
+ if (tv_term) {
750
+ tvpe = tvpe_new(tv_term->positions, tv_term->freq, offset);
751
+ }
752
+ }
753
+ else {
754
+ tvpe = tvpe_new_merge(terms, t_cnt, tv, offset);
755
+ }
756
+ return tvpe;
757
+ }
758
+
759
+ static MatchVector *phq_get_matchv_i(Query *self, MatchVector *mv,
760
+ TermVector *tv)
761
+ {
762
+ if (strcmp(tv->field, PhQ(self)->field) == 0) {
763
+ const int pos_cnt = PhQ(self)->pos_cnt;
764
+ int i;
765
+ int slop = PhQ(self)->slop;
766
+ bool done = false;
767
+
768
+ if (slop > 0) {
769
+ PriorityQueue *tvpe_pq = pq_new(pos_cnt, (lt_ft)tvpe_lt, &free);
770
+ int last_pos = 0;
771
+ for (i = 0; i < pos_cnt; i++) {
772
+ PhrasePosition *pp = &(PhQ(self)->positions[i]);
773
+ const int t_cnt = ary_size(pp->terms);
774
+ TVPosEnum *tvpe = get_tvpe(tv, pp->terms, t_cnt, pp->pos);
775
+ if (tvpe && tvpe_next(tvpe)) {
776
+ if (tvpe->pos > last_pos) {
777
+ last_pos = tvpe->pos;
778
+ }
779
+ pq_push(tvpe_pq, tvpe);
780
+ }
781
+ else {
782
+ done = true;
783
+ free(tvpe);
784
+ break;
785
+ }
786
+ }
787
+ while (! done) {
788
+ TVPosEnum *tvpe = pq_pop(tvpe_pq);
789
+ int pos;
790
+ int start = pos = tvpe->pos;
791
+ int next_pos = ((TVPosEnum *)pq_top(tvpe_pq))->pos;
792
+ while (pos <= next_pos) {
793
+ start = pos;
794
+ if (!tvpe_next(tvpe)) {
795
+ done = true;
796
+ break;
797
+ }
798
+ pos = tvpe->pos;
799
+ }
800
+
801
+ if (last_pos - start <= slop) {
802
+ int min, max = min = start + tvpe->offset;
803
+ for (i = tvpe_pq->size; i > 0; i--) {
804
+ TVPosEnum *t = (TVPosEnum *)tvpe_pq->heap[i];
805
+ int p = t->pos + t->offset;
806
+ max = p > max ? p : max;
807
+ min = p < min ? p : min;
808
+ }
809
+ matchv_add(mv, min, max);
810
+ }
811
+ if (tvpe->pos > last_pos) {
812
+ last_pos = tvpe->pos;
813
+ }
814
+ pq_push(tvpe_pq, tvpe);
815
+ }
816
+
817
+ pq_destroy(tvpe_pq);
818
+ }
819
+ else { /* exact match */
820
+ TVPosEnum **tvpe_a = ALLOC_AND_ZERO_N(TVPosEnum *, pos_cnt);
821
+ TVPosEnum *first, *last;
822
+ int first_index = 0;
823
+ done = false;
824
+ qsort(PhQ(self)->positions, pos_cnt, sizeof(PhrasePosition),
825
+ &phrase_pos_cmp);
826
+ for (i = 0; i < pos_cnt; i++) {
827
+ PhrasePosition *pp = &(PhQ(self)->positions[i]);
828
+ const int t_cnt = ary_size(pp->terms);
829
+ TVPosEnum *tvpe = get_tvpe(tv, pp->terms, t_cnt, pp->pos);
830
+ if (tvpe && ((i == 0 && tvpe_next(tvpe))
831
+ || tvpe_skip_to(tvpe, tvpe_a[i-1]->pos))) {
832
+ tvpe_a[i] = tvpe;
833
+ }
834
+ else {
835
+ done = true;
836
+ free(tvpe);
837
+ break;
838
+ }
839
+ }
840
+
841
+ first = tvpe_a[0];
842
+ last = tvpe_a[pos_cnt - 1];
843
+
844
+ while (!done) {
845
+ while (first->pos < last->pos) {
846
+ if (tvpe_skip_to(first, last->pos)) {
847
+ last = first;
848
+ first_index = NEXT_NUM(first_index, pos_cnt);
849
+ first = tvpe_a[first_index];
850
+ }
851
+ else {
852
+ done = true;
853
+ break;
854
+ }
855
+ }
856
+ if (!done) {
857
+ matchv_add(mv, tvpe_a[0]->pos + tvpe_a[0]->offset,
858
+ tvpe_a[pos_cnt-1]->pos + tvpe_a[pos_cnt-1]->offset);
859
+ }
860
+ if (!tvpe_next(last)) {
861
+ done = true;
862
+ }
863
+ }
864
+ for (i = 0; i < pos_cnt; i++) {
865
+ free(tvpe_a[i]);
866
+ }
867
+ free(tvpe_a);
868
+ }
869
+ }
870
+ return mv;
871
+ }
872
+
873
+
874
+ /* ** PhraseQuery besides highlighting stuff ** */
875
+
876
+ #define PhQ_INIT_CAPA 4
877
+
878
+ static void phq_extract_terms(Query *self, HashSet *term_set)
879
+ {
880
+ PhraseQuery *phq = PhQ(self);
881
+ int i, j;
882
+ for (i = 0; i < phq->pos_cnt; i++) {
883
+ char **terms = phq->positions[i].terms;
884
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
885
+ hs_add(term_set, term_new(phq->field, terms[j]));
886
+ }
887
+ }
888
+ }
889
+
890
+ static char *phq_to_s(Query *self, const char *field)
891
+ {
892
+ PhraseQuery *phq = PhQ(self);
893
+ const int pos_cnt = phq->pos_cnt;
894
+ PhrasePosition *positions = phq->positions;
895
+
896
+ int i, j, buf_index = 0, pos, last_pos;
897
+ size_t len = 0;
898
+ char *buffer;
899
+
900
+ if (phq->pos_cnt == 0) {
901
+ if (strcmp(field, phq->field) != 0) {
902
+ return strfmt("%s:\"\"", phq->field);
903
+ }
904
+ else {
905
+ return estrdup("\"\"");
906
+ }
907
+ }
908
+
909
+ /* sort the phrase positions by position */
910
+ qsort(positions, pos_cnt, sizeof(PhrasePosition), &phrase_pos_cmp);
911
+
912
+ len = strlen(phq->field) + 1;
913
+
914
+ for (i = 0; i < pos_cnt; i++) {
915
+ char **terms = phq->positions[i].terms;
916
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
917
+ len += strlen(terms[j]) + 5;
918
+ }
919
+ }
920
+
921
+ /* add space for extra <> characters and boost and slop */
922
+ len += 100 + 3
923
+ * (phq->positions[phq->pos_cnt - 1].pos - phq->positions[0].pos);
924
+
925
+ buffer = ALLOC_N(char, len);
926
+
927
+ if (strcmp(field, phq->field) != 0) {
928
+ len = strlen(phq->field);
929
+ memcpy(buffer, phq->field, len);
930
+ buffer[len] = ':';
931
+ buf_index += len + 1;
932
+ }
933
+
934
+ buffer[buf_index++] = '"';
935
+
936
+ last_pos = positions[0].pos - 1;
937
+ for (i = 0; i < pos_cnt; i++) {
938
+ char **terms = positions[i].terms;
939
+ const int t_cnt = ary_size(terms);
940
+
941
+ pos = positions[i].pos;
942
+ if (pos == last_pos) {
943
+ buffer[buf_index - 1] = '&';
944
+ }
945
+ else {
946
+ for (j = last_pos; j < pos - 1; j++) {
947
+ memcpy(buffer + buf_index, "<> ", 3);
948
+ buf_index += 3;
949
+ }
950
+ }
951
+
952
+ last_pos = pos;
953
+ for (j = 0; j < t_cnt; j++) {
954
+ char *term = terms[j];
955
+ len = strlen(term);
956
+ memcpy(buffer + buf_index, term, len);
957
+ buf_index += len;
958
+ buffer[buf_index++] = '|';
959
+ }
960
+ buffer[buf_index-1] = ' '; /* change last '|' to ' ' */
961
+ }
962
+
963
+ if (buffer[buf_index-1] == ' ') {
964
+ buf_index--;
965
+ }
966
+
967
+ buffer[buf_index++] = '"';
968
+ buffer[buf_index] = 0;
969
+
970
+ if (phq->slop != 0) {
971
+ sprintf(buffer + buf_index, "~%d", phq->slop);
972
+ buf_index += strlen(buffer + buf_index);
973
+ }
974
+
975
+ if (self->boost != 1.0) {
976
+ buffer[buf_index++] = '^';
977
+ dbl_to_s(buffer + buf_index, self->boost);
978
+ }
979
+
980
+ return buffer;
981
+ }
982
+
983
+ static void phq_destroy(Query *self)
984
+ {
985
+ PhraseQuery *phq = PhQ(self);
986
+ int i;
987
+ free(phq->field);
988
+ for (i = 0; i < phq->pos_cnt; i++) {
989
+ ary_destroy(phq->positions[i].terms, &free);
990
+ }
991
+ free(phq->positions);
992
+ q_destroy_i(self);
993
+ }
994
+
995
+ static Query *phq_rewrite(Query *self, IndexReader *ir)
996
+ {
997
+ PhraseQuery *phq = PhQ(self);
998
+ (void)ir;
999
+ if (phq->pos_cnt == 1) {
1000
+ /* optimize one-position case */
1001
+ char **terms = phq->positions[0].terms;
1002
+ const int t_cnt = ary_size(terms);
1003
+ if (t_cnt == 1) {
1004
+ Query *tq = tq_new(phq->field, terms[0]);
1005
+ tq->boost = self->boost;
1006
+ return tq;
1007
+ }
1008
+ else {
1009
+ Query *q = multi_tq_new(phq->field);
1010
+ int i;
1011
+ for (i = 0; i < t_cnt; i++) {
1012
+ multi_tq_add_term(q, terms[i]);
1013
+ }
1014
+ q->boost = self->boost;
1015
+ return q;
1016
+ }
1017
+ } else {
1018
+ self->ref_cnt++;
1019
+ return self;
1020
+ }
1021
+ }
1022
+
1023
+ static unsigned long phq_hash(Query *self)
1024
+ {
1025
+ int i, j;
1026
+ PhraseQuery *phq = PhQ(self);
1027
+ unsigned long hash = str_hash(phq->field);
1028
+ for (i = 0; i < phq->pos_cnt; i++) {
1029
+ char **terms = phq->positions[i].terms;
1030
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
1031
+ hash = (hash << 1) ^ (str_hash(terms[j])
1032
+ ^ phq->positions[i].pos);
1033
+ }
1034
+ }
1035
+ return (hash ^ phq->slop);
1036
+ }
1037
+
1038
+ static int phq_eq(Query *self, Query *o)
1039
+ {
1040
+ int i, j;
1041
+ PhraseQuery *phq1 = PhQ(self);
1042
+ PhraseQuery *phq2 = PhQ(o);
1043
+ if (phq1->slop != phq2->slop
1044
+ || strcmp(phq1->field, phq2->field) != 0
1045
+ || phq1->pos_cnt != phq2->pos_cnt) {
1046
+ return false;
1047
+ }
1048
+ for (i = 0; i < phq1->pos_cnt; i++) {
1049
+ char **terms1 = phq1->positions[i].terms;
1050
+ char **terms2 = phq2->positions[i].terms;
1051
+ const int t_cnt = ary_size(terms1);
1052
+ if (t_cnt != ary_size(terms2)
1053
+ || phq1->positions[i].pos != phq2->positions[i].pos) {
1054
+ return false;
1055
+ }
1056
+ for (j = 0; j < t_cnt; j++) {
1057
+ if (strcmp(terms1[j], terms2[j]) != 0) {
1058
+ return false;
1059
+ }
1060
+ }
1061
+ }
1062
+ return true;
1063
+ }
1064
+
1065
+ Query *phq_new(const char *field)
1066
+ {
1067
+ Query *self = q_new(PhraseQuery);
1068
+
1069
+ PhQ(self)->field = estrdup(field);
1070
+ PhQ(self)->pos_cnt = 0;
1071
+ PhQ(self)->pos_capa = PhQ_INIT_CAPA;
1072
+ PhQ(self)->positions = ALLOC_N(PhrasePosition, PhQ_INIT_CAPA);
1073
+
1074
+ self->type = PHRASE_QUERY;
1075
+ self->rewrite = &phq_rewrite;
1076
+ self->extract_terms = &phq_extract_terms;
1077
+ self->to_s = &phq_to_s;
1078
+ self->hash = &phq_hash;
1079
+ self->eq = &phq_eq;
1080
+ self->destroy_i = &phq_destroy;
1081
+ self->create_weight_i = &phw_new;
1082
+ self->get_matchv_i = &phq_get_matchv_i;
1083
+ return self;
1084
+ }
1085
+
1086
+ void phq_add_term_abs(Query *self, const char *term, int position)
1087
+ {
1088
+ PhraseQuery *phq = PhQ(self);
1089
+ int index = phq->pos_cnt;
1090
+ PhrasePosition *pp;
1091
+ if (index >= phq->pos_capa) {
1092
+ phq->pos_capa <<= 1;
1093
+ REALLOC_N(phq->positions, PhrasePosition, phq->pos_capa);
1094
+ }
1095
+ pp = &(phq->positions[index]);
1096
+ pp->terms = ary_new_type_capa(char *, 2);
1097
+ ary_push(pp->terms, estrdup(term));
1098
+ pp->pos = position;
1099
+ phq->pos_cnt++;
1100
+ }
1101
+
1102
+ void phq_add_term(Query *self, const char *term, int pos_inc)
1103
+ {
1104
+ PhraseQuery *phq = PhQ(self);
1105
+ int position;
1106
+ if (phq->pos_cnt == 0) {
1107
+ position = 0;
1108
+ }
1109
+ else {
1110
+ position = phq->positions[phq->pos_cnt - 1].pos + pos_inc;
1111
+ }
1112
+ phq_add_term_abs(self, term, position);
1113
+ }
1114
+
1115
+ void phq_append_multi_term(Query *self, const char *term)
1116
+ {
1117
+ PhraseQuery *phq = PhQ(self);
1118
+ int index = phq->pos_cnt - 1;
1119
+
1120
+ if (index < 0) {
1121
+ phq_add_term(self, term, 0);
1122
+ }
1123
+ else {
1124
+ ary_push(phq->positions[index].terms, estrdup(term));
1125
+ }
1126
+ }