ferret 0.3.2 → 0.9.0

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (141)
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/q_phrase.c ADDED
@@ -0,0 +1,657 @@
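+ /* q_phrase.c -- phrase query support for the Ferret C extension: the
+  * PhraseWeight, the PhraseQuery and the exact/sloppy PhraseScorers used to
+  * match terms occurring together in a field. */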
+ #include <string.h>
+ #include "search.h"
+
+ /***************************************************************************
+  *
+  * PhraseWeight
+  *
+  ***************************************************************************/
+
+ Scorer *phw_scorer(Weight *self, IndexReader *ir)
+ {
+   Scorer *phsc;
+   PhraseQuery *phq = (PhraseQuery *)self->query->data;
+   int i;
+   if (phq->t_cnt == 0) return NULL; // optimize zero-term case
+
+   TermDocEnum **tps = ALLOC_N(TermDocEnum *, phq->t_cnt);
+
+   for (i = 0; i < phq->t_cnt; i++) {
+     tps[i] = ir_term_positions_for(ir, phq->terms[i]);
+     if (tps[i] == NULL) {
+       // free everything we just created and return NULL
+       int j;
+       for (j = 0; j < i; j++) {
+         tps[j]->close(tps[j]);
+       }
+       free(tps);
+       return NULL;
+     }
+   }
+
+   if (phq->slop == 0) { // optimize exact case
+     phsc = exact_phrase_scorer_create(self, tps, phq->positions, phq->t_cnt,
+                                       self->similarity,
+                                       ir->get_norms(ir, phq->field));
+   } else {
+     phsc = sloppy_phrase_scorer_create(self, tps, phq->positions, phq->t_cnt,
+                                        self->similarity,
+                                        phq->slop,
+                                        ir->get_norms(ir, phq->field));
+   }
+   free(tps);
+   return phsc;
+ }
+
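+ /* Build the Explanation tree for doc_num: the query weight (boost * idf *
+  * query_norm) multiplied by the field weight (tf * idf * field_norm),
+  * mirroring how the phrase score itself is computed. */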
+ Explanation *phw_explain(Weight *self, IndexReader *ir, int doc_num)
+ {
+   char *query_str = self->query->to_s(self->query, "");
+   PhraseQuery *phq = (PhraseQuery *)self->query->data;
+   int i;
+   char *doc_freqs = NULL;
+   int len = 0, pos = 0;
+
+   Explanation *expl = expl_create(0.0,
+       epstrdup("weight(%s in %d), product of:",
+                strlen(query_str) + 20,
+                query_str, doc_num));
+
+   for (i = 0; i < phq->t_cnt; i++) {
+     len += strlen(phq->terms[i]->text) + 30;
+   }
+   doc_freqs = ALLOC_N(char, len);
+   for (i = 0; i < phq->t_cnt; i++) {
+     Term *term = phq->terms[i];
+     sprintf(doc_freqs + pos, "%s=%d, ", term->text, ir->doc_freq(ir, term));
+     pos += strlen(doc_freqs + pos);
+   }
+   pos -= 2; // remove ", " from the end
+   doc_freqs[pos] = 0;
+
+   Explanation *idf_expl1 = expl_create(self->idf,
+       epstrdup("idf(%s:<%s>)", strlen(phq->field) + pos, phq->field, doc_freqs));
+   Explanation *idf_expl2 = expl_create(self->idf,
+       epstrdup("idf(%s:<%s>)", strlen(phq->field) + pos, phq->field, doc_freqs));
+   free(doc_freqs);
+
+   // explain query weight
+   Explanation *query_expl = expl_create(0.0,
+       epstrdup("query_weight(%s), product of:", strlen(query_str), query_str));
+
+   if (self->query->boost != 1.0) {
+     expl_add_detail(query_expl, expl_create(self->query->boost, estrdup("boost")));
+   }
+   expl_add_detail(query_expl, idf_expl1);
+
+   Explanation *qnorm_expl = expl_create(self->qnorm, estrdup("query_norm"));
+   expl_add_detail(query_expl, qnorm_expl);
+
+   query_expl->value = self->query->boost * self->idf * self->qnorm;
+
+   expl_add_detail(expl, query_expl);
+
+   // explain field weight
+   Explanation *field_expl = expl_create(0.0,
+       epstrdup("field_weight(%s in %d), product of:",
+                strlen(query_str) + 20, query_str, doc_num));
+   free(query_str);
+
+   Scorer *scorer = self->scorer(self, ir);
+   Explanation *tf_expl = scorer->explain(scorer, doc_num);
+   scorer->destroy(scorer);
+   expl_add_detail(field_expl, tf_expl);
+   expl_add_detail(field_expl, idf_expl2);
+
+   uchar *field_norms = ir->get_norms(ir, phq->field);
+   float field_norm = (field_norms != NULL)
+       ? sim_decode_norm(self->similarity, field_norms[doc_num])
+       : 0.0;
+   Explanation *field_norm_expl = expl_create(field_norm,
+       epstrdup("field_norm(field=%s, doc=%d)",
+                strlen(phq->field) + 20, phq->field, doc_num));
+
+   expl_add_detail(field_expl, field_norm_expl);
+
+   field_expl->value = tf_expl->value * self->idf * field_norm;
+
+   // combine them
+   if (query_expl->value == 1.0) {
+     expl_destoy(expl);
+     return field_expl;
+   } else {
+     expl->value = (query_expl->value * field_expl->value);
+     expl_add_detail(expl, field_expl);
+     return expl;
+   }
+ }
+
+ char *phw_to_s(Weight *self)
+ {
+   char dbuf[32];
+   dbl_to_s(dbuf, self->value);
+   return epstrdup("PhraseWeight(%s)", strlen(dbuf), dbuf);
+ }
+
+ Weight *phw_create(Query *query, Searcher *searcher)
+ {
+   PhraseQuery *phq = (PhraseQuery *)query->data;
+   Weight *self = ALLOC(Weight);
+   ZEROSET(self, Weight, 1);
+   self->get_query = &w_get_query;
+   self->get_value = &w_get_value;
+   self->normalize = &w_normalize;
+   self->scorer = &phw_scorer;
+   self->explain = &phw_explain;
+   self->to_s = &phw_to_s;
+   self->destroy = &free;
+   self->sum_of_squared_weights = &w_sum_of_squared_weights;
+
+   self->similarity = query->get_similarity(query, searcher);
+   self->query = query;
+   self->value = query->boost;
+   self->idf = sim_idf_phrase(self->similarity, phq->terms, phq->t_cnt, searcher);
+
+   return self;
+ }
+
+ /***************************************************************************
+  *
+  * PhraseQuery
+  *
+  ***************************************************************************/
+
+ #define GET_PHQ PhraseQuery *phq = (PhraseQuery *)self->data
+
+ void phq_extract_terms(Query *self, Array *terms)
+ {
+   GET_PHQ;
+   int i;
+   for (i = 0; i < phq->t_cnt; i++) {
+     ary_append(terms, phq->terms[i]);
+   }
+ }
+
+ char *phq_to_s(Query *self, char *field)
+ {
+   GET_PHQ;
+   int i, j, buf_index = 0, len = 0, pos, last_pos = -1;
+   char *buffer;
+   if (!phq->t_cnt) return NULL;
+   len = strlen(phq->field) + 1;
+   for (i = 0; i < phq->t_cnt; i++) {
+     len += strlen(phq->terms[i]->text) + 1;
+   }
+   // add space for extra characters and boost and slop
+   len += 100 + 3 * phq->positions[phq->t_cnt - 1];
+
+   buffer = ALLOC_N(char, len);
+
+   if (strcmp(field, phq->field) != 0) {
+     len = strlen(phq->field);
+     memcpy(buffer, phq->field, len);
+     buffer[len] = ':';
+     buf_index += len + 1;
+   }
+   buffer[buf_index++] = '"';
+
+   for (i = 0; i < phq->t_cnt; i++) {
+     Term *term = phq->terms[i];
+     pos = phq->positions[i];
+     for (j = last_pos; j < pos - 1; j++) {
+       memcpy(buffer + buf_index, "<> ", 3);
+       buf_index += 3;
+     }
+     last_pos = pos;
+
+     len = strlen(term->text);
+     memcpy(buffer + buf_index, term->text, len);
+     buf_index += len;
+     buffer[buf_index++] = ' ';
+   }
+   if (buffer[buf_index - 1] == ' ') buf_index--;
+   buffer[buf_index++] = '"';
+   buffer[buf_index] = 0;
+   if (phq->slop != 0) {
+     sprintf(buffer + buf_index, "~%d", phq->slop);
+     buf_index += strlen(buffer + buf_index);
+   }
+   if (self->boost != 1.0) {
+     char dbuf[32];
+     dbl_to_s(dbuf, self->boost);
+     sprintf(buffer + buf_index, "^%s", dbuf);
+   }
+   return buffer;
+ }
+
+ void phq_destroy(void *p)
+ {
+   Query *self = (Query *)p;
+
+   GET_PHQ;
+   int i;
+   if (self->destroy_all) {
+     for (i = 0; i < phq->t_cnt; i++) {
+       term_destroy(phq->terms[i]);
+     }
+   }
+   free(phq->terms);
+   free(phq->positions);
+   free(phq);
+
+   q_destroy(self);
+ }
+
+ Query *phq_rewrite(Query *self, IndexReader *ir)
+ {
+   GET_PHQ;
+   if (phq->t_cnt == 1) { // optimize one-term case
+     Term *term = phq->terms[0];
+     Query *tq = tq_create(term_clone(term));
+     tq->boost = self->boost;
+     if (self->rewritten) self->rewritten->destroy(self->rewritten);
+     return self->rewritten = tq;
+   } else {
+     return self;
+   }
+ }
+
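+ /* Append a term to the phrase. The first term fixes the phrase's field;
+  * every later term must use the same field, and its position is the
+  * previous term's position plus pos_inc. */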
+ void phq_add_term(Query *self, Term *term, int pos_inc)
+ {
+   GET_PHQ;
+   int position, index = phq->t_cnt;
+   if (index >= phq->t_capa) {
+     phq->t_capa *= 2;
+     REALLOC_N(phq->terms, Term *, phq->t_capa);
+     REALLOC_N(phq->positions, int, phq->t_capa);
+   }
+   if (index == 0) {
+     position = 0;
+     phq->field = term->field;
+   } else {
+     position = phq->positions[index - 1] + pos_inc;
+     if (strcmp(term->field, phq->field) != 0) {
+       eprintf(ARG_ERROR, "All phrase terms must be in the same field. Current phrase is %s, tried to add %s\n", phq->field, term->field);
+     }
+   }
+   phq->terms[index] = term;
+   phq->positions[index] = position;
+   phq->t_cnt++;
+ }
+
+ Query *phq_create()
+ {
+   Query *self = q_create();
+   PhraseQuery *phq = ALLOC(PhraseQuery);
+   ZEROSET(phq, PhraseQuery, 1);
+   phq->t_capa = PHQ_INIT_CAPA;
+   phq->terms = ALLOC_N(Term *, PHQ_INIT_CAPA);
+   phq->positions = ALLOC_N(int, PHQ_INIT_CAPA);
+   self->data = phq;
+
+   self->create_weight = &phw_create;
+   self->extract_terms = &phq_extract_terms;
+   self->to_s = &phq_to_s;
+   self->destroy = &phq_destroy;
+   self->rewrite = &phq_rewrite;
+   self->type = PHRASE_QUERY;
+   return self;
+ }
+
+ /***************************************************************************
+  *
+  * PhraseScorer
+  *
+  ***************************************************************************/
+
+ /***************************************************************************
+  * PhrasePosition
+  ***************************************************************************/
+
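+ /* A PhrasePosition pairs one term's position enum (TermDocEnum) with that
+  * term's offset inside the phrase; positions are returned offset-adjusted,
+  * so equal adjusted positions across terms mean the phrase lines up. */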
+ bool pp_next(PhrasePosition *self)
+ {
+   TermDocEnum *tpe = self->tpe;
+   if (!tpe->next(tpe)) {
+     tpe->close(tpe);     // close stream
+     self->tpe = NULL;
+     self->doc = INT_MAX; // sentinel value
+     return false;
+   }
+   self->doc = tpe->doc_num(tpe);
+   self->position = 0;
+   return true;
+ }
+
+ bool pp_skip_to(PhrasePosition *self, int doc_num)
+ {
+   TermDocEnum *tpe = self->tpe;
+   if (!tpe->skip_to(tpe, doc_num)) {
+     tpe->close(tpe);     // close stream
+     self->tpe = NULL;
+     self->doc = INT_MAX; // sentinel value
+     return false;
+   }
+   self->doc = tpe->doc_num(tpe);
+   self->position = 0;
+   return true;
+ }
+
+ bool pp_next_position(PhrasePosition *self)
+ {
+   TermDocEnum *tpe = self->tpe;
+   self->count -= 1;
+   if (self->count >= 0) { // read subsequent pos's
+     self->position = tpe->next_position(tpe) - self->offset;
+     return true;
+   } else {
+     return false;
+   }
+ }
+
+ bool pp_first_position(PhrasePosition *self)
+ {
+   TermDocEnum *tpe = self->tpe;
+   self->count = tpe->freq(tpe); // read first pos
+   return pp_next_position(self);
+ }
+
+ char *pp_to_s(PhrasePosition *self)
+ {
+   return epstrdup("pp->(doc => %d, position => %d)", 40, self->doc, self->position);
+ }
+
+ inline int pp_cmp(const void *const p1, const void *const p2)
+ {
+   PhrasePosition *pp1 = *(PhrasePosition **)p1;
+   PhrasePosition *pp2 = *(PhrasePosition **)p2;
+   int cmp = pp1->doc - pp2->doc;
+   if (cmp == 0) {
+     return pp1->position - pp2->position;
+   } else {
+     return cmp;
+   }
+ }
+
+ bool pp_less_than(void *p1, void *p2)
+ {
+   PhrasePosition *pp1 = (PhrasePosition *)p1;
+   PhrasePosition *pp2 = (PhrasePosition *)p2;
+   if (pp1->doc == pp2->doc) {
+     return pp1->position < pp2->position;
+   } else {
+     return pp1->doc < pp2->doc;
+   }
+ }
+
+ void pp_destroy(void *p)
+ {
+   PhrasePosition *pp = (PhrasePosition *)p;
+   if (pp->tpe) pp->tpe->close(pp->tpe);
+   free(pp);
+ }
+
+ PhrasePosition *pp_create(TermDocEnum *tpe, int offset)
+ {
+   PhrasePosition *self = ALLOC(PhrasePosition);
+   self->tpe = tpe;
+   self->count = self->doc = self->position = -1;
+   self->offset = offset;
+   return self;
+ }
+
+ /***************************************************************************
+  * PhraseScorer
+  ***************************************************************************/
+
+ #define GET_PHSC PhraseScorer *phsc = (PhraseScorer *)self->data;
+
+ void phsc_init(PhraseScorer *phsc)
+ {
+   int i;
+   for (i = phsc->pp_cnt - 1; i >= 0; i--) {
+     if (!(phsc->more = pp_next(phsc->phrase_pos[i]))) break;
+   }
+
+   if (phsc->more) {
+     qsort(phsc->phrase_pos, phsc->pp_cnt, sizeof(PhrasePosition *), &pp_cmp);
+     phsc->pp_first = 0;
+     phsc->pp_last = phsc->pp_cnt - 1;
+   }
+ }
+
+ #define FIRST_TO_LAST() \
+   last = first;\
+   phsc->pp_last = phsc->pp_first;\
+   phsc->pp_first = (phsc->pp_first + 1) % phsc->pp_cnt;\
+   first = phsc->phrase_pos[phsc->pp_first];
+
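+ /* Core document-matching loop: keep skipping the lagging enum (first) up to
+  * the leading enum's document (last) until every term enum sits on the same
+  * document, then verify the phrase with phrase_freq(). A frequency of zero
+  * means the terms co-occur but not as a phrase, so scanning continues. */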
+ bool phsc_do_next(Scorer *self)
+ {
+   GET_PHSC;
+   PhrasePosition *first = phsc->phrase_pos[phsc->pp_first];
+   PhrasePosition *last = phsc->phrase_pos[phsc->pp_last];
+
+   while (phsc->more) {
+     while (phsc->more && first->doc < last->doc) { // find doc w/ all the terms
+       phsc->more = pp_skip_to(first, last->doc);   // skip first upto last
+       FIRST_TO_LAST();                             // and move it to the end
+     }
+
+     if (phsc->more) {
+       // found a doc with all of the terms
+       phsc->freq = phsc->phrase_freq(self); // check for phrase
+       if (phsc->freq == 0.0) {              // no match
+         first = phsc->phrase_pos[phsc->pp_first];
+         last = phsc->phrase_pos[phsc->pp_last];
+         phsc->more = pp_next(last);         // trigger further scanning
+       } else {
+         self->doc = first->doc;
+         return true;                        // found a match
+       }
+     }
+   }
+   return false;
+ }
+
+ float phsc_score(Scorer *self)
+ {
+   GET_PHSC;
+   float raw = sim_tf(self->similarity, phsc->freq) * phsc->value; // raw score
+   // normalize
+   return raw * sim_decode_norm(self->similarity,
+                                phsc->norms[phsc->phrase_pos[phsc->pp_first]->doc]);
+ }
+
+ bool phsc_next(Scorer *self)
+ {
+   GET_PHSC;
+   if (phsc->first_time) {
+     phsc_init(phsc);
+     phsc->first_time = false;
+   } else if (phsc->more) {
+     phsc->more = pp_next(phsc->phrase_pos[phsc->pp_last]); // trigger further scanning
+   }
+   return phsc_do_next(self);
+ }
+
+ bool phsc_skip_to(Scorer *self, int doc_num)
+ {
+   GET_PHSC;
+   int i;
+   for (i = phsc->pp_cnt - 1; i >= 0; i--) {
+     if (!(phsc->more = pp_skip_to(phsc->phrase_pos[i], doc_num))) break;
+   }
+
+   if (phsc->more) {
+     qsort(phsc->phrase_pos, phsc->pp_cnt, sizeof(PhrasePosition *), &pp_cmp);
+     phsc->pp_first = 0;
+     phsc->pp_last = phsc->pp_cnt - 1;
+   }
+   return phsc_do_next(self);
+ }
+
+ Explanation *phsc_explain(Scorer *self, int doc_num)
+ {
+   char dbuf[32];
+   GET_PHSC;
+   while (phsc_next(self) && self->doc < doc_num)
+     ;
+
+   float phrase_freq = (self->doc == doc_num) ? phsc->freq : 0.0;
+   dbl_to_s(dbuf, phrase_freq);
+   return expl_create(sim_tf(self->similarity, phrase_freq),
+                      epstrdup("tf(phrase_freq=%s)", strlen(dbuf), dbuf));
+ }
+
+ void phsc_destroy(void *p)
+ {
+   Scorer *self = (Scorer *)p;
+   GET_PHSC;
+   int i;
+   for (i = phsc->pp_cnt - 1; i >= 0; i--) {
+     pp_destroy(phsc->phrase_pos[i]);
+   }
+   free(phsc->phrase_pos);
+   scorer_destroy(self);
+ }
+
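+ /* Shared constructor for the exact and sloppy scorers: wraps each term's
+  * position enum in a PhrasePosition and leaves phrase_freq to be set by the
+  * concrete scorer type. */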
+ Scorer *phsc_create(Weight *weight, TermDocEnum **term_pos_enum,
+                     int *positions, int t_cnt, Similarity *similarity, uchar *norms)
+ {
+   int i;
+   Scorer *self = scorer_create(similarity);
+   PhraseScorer *phsc = ALLOC(PhraseScorer);
+   ZEROSET(phsc, PhraseScorer, 1);
+
+   phsc->weight = weight;
+   phsc->norms = norms;
+   phsc->value = weight->value;
+
+   phsc->phrase_pos = ALLOC_N(PhrasePosition *, t_cnt);
+   for (i = 0; i < t_cnt; i++) {
+     phsc->phrase_pos[i] = pp_create(term_pos_enum[i], positions[i]);
+   }
+   phsc->pp_first = 0;
+   phsc->pp_last = t_cnt - 1;
+   phsc->pp_cnt = t_cnt;
+
+   phsc->slop = 0;
+
+   phsc->first_time = true;
+   phsc->more = true;
+
+   self->data = phsc;
+   self->score = &phsc_score;
+   self->next = &phsc_next;
+   self->skip_to = &phsc_skip_to;
+   self->explain = &phsc_explain;
+   self->destroy = &phsc_destroy;
+
+   return self;
+ }
+
+ /***************************************************************************
+  * ExactPhraseScorer
+  ***************************************************************************/
+
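+ /* Exact phrase frequency for the current document: initialize and sort the
+  * offset-adjusted positions, then repeatedly advance the trailing term until
+  * it catches up with the leading one; each time all positions coincide, one
+  * occurrence of the phrase is counted. */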
+ float ephsc_phrase_freq(Scorer *self)
+ {
+   GET_PHSC;
+   // sort the phrase positions by (doc, position)
+   int i;
+   float freq = 0.0;
+   PhrasePosition *first;
+   PhrasePosition *last;
+
+   for (i = phsc->pp_cnt - 1; i >= 0; i--) {
+     pp_first_position(phsc->phrase_pos[i]);
+   }
+   qsort(phsc->phrase_pos, phsc->pp_cnt, sizeof(PhrasePosition *), &pp_cmp);
+   phsc->pp_first = 0;
+   phsc->pp_last = phsc->pp_cnt - 1;
+
+   first = phsc->phrase_pos[0];
+   last = phsc->phrase_pos[phsc->pp_last];
+
+   do { // find position w/ all terms
+     while (first->position < last->position) { // scan forward in first
+       do {
+         if (!pp_next_position(first)) return freq;
+       } while (first->position < last->position);
+       FIRST_TO_LAST();
+     }
+     freq += 1.0; // all equal: a match
+   } while (pp_next_position(last));
+
+   return freq;
+ }
+
+ Scorer *exact_phrase_scorer_create(Weight *weight, TermDocEnum **term_pos_enum,
+                                    int *positions, int t_cnt, Similarity *similarity, uchar *norms)
+ {
+   Scorer *self =
+       phsc_create(weight, term_pos_enum, positions, t_cnt, similarity, norms);
+   GET_PHSC;
+   phsc->phrase_freq = &ephsc_phrase_freq;
+   return self;
+ }
+
+ /***************************************************************************
+  * SloppyPhraseScorer
+  ***************************************************************************/
+
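+ /* Sloppy phrase frequency: keep the offset-adjusted positions in a priority
+  * queue and repeatedly tighten the window they span; whenever the window
+  * (last_pos - start) is within the allowed slop, a slop-weighted
+  * contribution from sim_sloppy_freq() is added to the frequency. */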
+ float sphsc_phrase_freq(Scorer *self)
+ {
+   GET_PHSC;
+   PhrasePosition *pp;
+   PriorityQueue *pq = pq_create(phsc->pp_cnt, &pp_less_than);
+
+   int last_pos = 0, pos, next_pos, start, match_length, i;
+   bool done = false;
+   float freq = 0.0;
+
+   for (i = phsc->pp_cnt - 1; i >= 0; i--) {
+     pp = phsc->phrase_pos[i];
+     pp_first_position(pp);
+     if (pp->position > last_pos) last_pos = pp->position;
+     pq_push(pq, pp);
+   }
+
+   do {
+     pp = pq_pop(pq);
+     pos = start = pp->position;
+     next_pos = ((PhrasePosition *)pq_top(pq))->position;
+     while (pos <= next_pos) {
+       start = pos; // advance pp to min window
+       if (!pp_next_position(pp)) {
+         done = true; // ran out of a term -- done
+         break;
+       }
+       pos = pp->position;
+     }
+
+     match_length = last_pos - start;
+     if (match_length <= phsc->slop) {
+       freq += sim_sloppy_freq(self->similarity, match_length); // score match
+     }
+
+     if (pp->position > last_pos) {
+       last_pos = pp->position;
+     }
+     pq_push(pq, pp); // restore pq
+   } while (!done);
+
+   pq_destroy(pq);
+   return freq;
+ }
+
+ Scorer *sloppy_phrase_scorer_create(Weight *weight, TermDocEnum **term_pos_enum,
+                                     int *positions, int t_cnt, Similarity *similarity, int slop, uchar *norms)
+ {
+   Scorer *self =
+       phsc_create(weight, term_pos_enum, positions, t_cnt, similarity, norms);
+   GET_PHSC;
+   phsc->slop = slop;
+   phsc->phrase_freq = &sphsc_phrase_freq;
+   return self;
+ }
+