ferret 0.3.2 → 0.9.0

Files changed (141)
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/q_phrase.c ADDED
@@ -0,0 +1,657 @@
+ #include <string.h>
+ #include "search.h"
+
+ /***************************************************************************
+  *
+  * PhraseWeight
+  *
+  ***************************************************************************/
+
+ Scorer *phw_scorer(Weight *self, IndexReader *ir)
+ {
+   Scorer *phsc;
+   PhraseQuery *phq = (PhraseQuery *)self->query->data;
+   int i;
+   if (phq->t_cnt == 0) return NULL; // optimize zero-term case
+
+   TermDocEnum **tps = ALLOC_N(TermDocEnum *, phq->t_cnt);
+
+   for (i = 0; i < phq->t_cnt; i++) {
+     tps[i] = ir_term_positions_for(ir, phq->terms[i]);
+     if (tps[i] == NULL) {
+       // free everything we just created and return NULL
+       int j;
+       for (j = 0; j < i; j++) {
+         tps[j]->close(tps[j]);
+       }
+       free(tps);
+       return NULL;
+     }
+   }
+
+   if (phq->slop == 0) { // optimize exact case
+     phsc = exact_phrase_scorer_create(self, tps, phq->positions, phq->t_cnt,
+                                       self->similarity,
+                                       ir->get_norms(ir, phq->field));
+   } else {
+     phsc = sloppy_phrase_scorer_create(self, tps, phq->positions, phq->t_cnt,
+                                        self->similarity,
+                                        phq->slop,
+                                        ir->get_norms(ir, phq->field));
+   }
+   free(tps);
+   return phsc;
+ }
+
+ Explanation *phw_explain(Weight *self, IndexReader *ir, int doc_num)
+ {
+   char *query_str = self->query->to_s(self->query, "");
+   PhraseQuery *phq = (PhraseQuery *)self->query->data;
+   int i;
+   char *doc_freqs = NULL;
+   int len = 0, pos = 0;
+
+   Explanation *expl = expl_create(0.0,
+       epstrdup("weight(%s in %d), product of:",
+                strlen(query_str) + 20,
+                query_str, doc_num));
+
+   for (i = 0; i < phq->t_cnt; i++) {
+     len += strlen(phq->terms[i]->text) + 30;
+   }
+   doc_freqs = ALLOC_N(char, len);
+   for (i = 0; i < phq->t_cnt; i++) {
+     Term *term = phq->terms[i];
+     sprintf(doc_freqs + pos, "%s=%d, ", term->text, ir->doc_freq(ir, term));
+     pos += strlen(doc_freqs + pos);
+   }
+   pos -= 2; // remove ", " from the end
+   doc_freqs[pos] = 0;
+
+   Explanation *idf_expl1 = expl_create(self->idf,
+       epstrdup("idf(%s:<%s>)", strlen(phq->field) + pos, phq->field, doc_freqs));
+   Explanation *idf_expl2 = expl_create(self->idf,
+       epstrdup("idf(%s:<%s>)", strlen(phq->field) + pos, phq->field, doc_freqs));
+   free(doc_freqs);
+
+   // explain query weight
+   Explanation *query_expl = expl_create(0.0,
+       epstrdup("query_weight(%s), product of:", strlen(query_str), query_str));
+
+   if (self->query->boost != 1.0) {
+     expl_add_detail(query_expl, expl_create(self->query->boost, estrdup("boost")));
+   }
+   expl_add_detail(query_expl, idf_expl1);
+
+   Explanation *qnorm_expl = expl_create(self->qnorm, estrdup("query_norm"));
+   expl_add_detail(query_expl, qnorm_expl);
+
+   query_expl->value = self->query->boost * self->idf * self->qnorm;
+
+   expl_add_detail(expl, query_expl);
+
+   // explain field weight
+   Explanation *field_expl = expl_create(0.0,
+       epstrdup("field_weight(%s in %d), product of:",
+                strlen(query_str) + 20, query_str, doc_num));
+   free(query_str);
+
+   Scorer *scorer = self->scorer(self, ir);
+   Explanation *tf_expl = scorer->explain(scorer, doc_num);
+   scorer->destroy(scorer);
+   expl_add_detail(field_expl, tf_expl);
+   expl_add_detail(field_expl, idf_expl2);
+
+   uchar *field_norms = ir->get_norms(ir, phq->field);
+   float field_norm = (field_norms != NULL)
+       ? sim_decode_norm(self->similarity, field_norms[doc_num])
+       : 0.0;
+   Explanation *field_norm_expl = expl_create(field_norm,
+       epstrdup("field_norm(field=%s, doc=%d)",
+                strlen(phq->field) + 20, phq->field, doc_num));
+
+   expl_add_detail(field_expl, field_norm_expl);
+
+   field_expl->value = tf_expl->value * self->idf * field_norm;
+
+   // combine them
+   if (query_expl->value == 1.0) {
+     expl_destoy(expl);
+     return field_expl;
+   } else {
+     expl->value = (query_expl->value * field_expl->value);
+     expl_add_detail(expl, field_expl);
+     return expl;
+   }
+ }
+
+ char *phw_to_s(Weight *self)
+ {
+   char dbuf[32];
+   dbl_to_s(dbuf, self->value);
+   return epstrdup("PhraseWeight(%s)", strlen(dbuf), dbuf);
+ }
+
+ Weight *phw_create(Query *query, Searcher *searcher)
+ {
+   PhraseQuery *phq = (PhraseQuery *)query->data;
+   Weight *self = ALLOC(Weight);
+   ZEROSET(self, Weight, 1);
+   self->get_query = &w_get_query;
+   self->get_value = &w_get_value;
+   self->normalize = &w_normalize;
+   self->scorer = &phw_scorer;
+   self->explain = &phw_explain;
+   self->to_s = &phw_to_s;
+   self->destroy = &free;
+   self->sum_of_squared_weights = &w_sum_of_squared_weights;
+
+   self->similarity = query->get_similarity(query, searcher);
+   self->query = query;
+   self->value = query->boost;
+   self->idf = sim_idf_phrase(self->similarity, phq->terms, phq->t_cnt, searcher);
+
+   return self;
+ }
+
+ /***************************************************************************
+  *
+  * PhraseQuery
+  *
+  ***************************************************************************/
+
+ #define GET_PHQ PhraseQuery *phq = (PhraseQuery *)self->data
+
+ void phq_extract_terms(Query *self, Array *terms)
+ {
+   GET_PHQ;
+   int i;
+   for (i = 0; i < phq->t_cnt; i++) {
+     ary_append(terms, phq->terms[i]);
+   }
+ }
+
+ char *phq_to_s(Query *self, char *field)
+ {
+   GET_PHQ;
+   int i, j, buf_index = 0, len = 0, pos, last_pos = -1;
+   char *buffer;
+   if (!phq->t_cnt) return NULL;
+   len = strlen(phq->field) + 1;
+   for (i = 0; i < phq->t_cnt; i++) {
+     len += strlen(phq->terms[i]->text) + 1;
+   }
+   // add space for extra characters and boost and slop
+   len += 100 + 3 * phq->positions[phq->t_cnt - 1];
+
+   buffer = ALLOC_N(char, len);
+
+   if (strcmp(field, phq->field) != 0) {
+     len = strlen(phq->field);
+     memcpy(buffer, phq->field, len);
+     buffer[len] = ':';
+     buf_index += len + 1;
+   }
+   buffer[buf_index++] = '"';
+
+   for (i = 0; i < phq->t_cnt; i++) {
+     Term *term = phq->terms[i];
+     pos = phq->positions[i];
+     for (j = last_pos; j < pos - 1; j++) {
+       memcpy(buffer + buf_index, "<> ", 3);
+       buf_index += 3;
+     }
+     last_pos = pos;
+
+     len = strlen(term->text);
+     memcpy(buffer + buf_index, term->text, len);
+     buf_index += len;
+     buffer[buf_index++] = ' ';
+   }
+   if (buffer[buf_index-1] == ' ') buf_index--;
+   buffer[buf_index++] = '"';
+   buffer[buf_index] = 0;
+   if (phq->slop != 0) {
+     sprintf(buffer + buf_index, "~%d", phq->slop);
+     buf_index += strlen(buffer + buf_index);
+   }
+   if (self->boost != 1.0) {
+     char dbuf[32];
+     dbl_to_s(dbuf, self->boost);
+     sprintf(buffer + buf_index, "^%s", dbuf);
+   }
+   return buffer;
+ }
+
+ void phq_destroy(void *p)
+ {
+   Query *self = (Query *)p;
+
+   GET_PHQ;
+   int i;
+   if (self->destroy_all) {
+     for (i = 0; i < phq->t_cnt; i++) {
+       term_destroy(phq->terms[i]);
+     }
+   }
+   free(phq->terms);
+   free(phq->positions);
+   free(phq);
+
+   q_destroy(self);
+ }
+
+ Query *phq_rewrite(Query *self, IndexReader *ir)
+ {
+   GET_PHQ;
+   if (phq->t_cnt == 1) { // optimize one-term case
+     Term *term = phq->terms[0];
+     Query *tq = tq_create(term_clone(term));
+     tq->boost = self->boost;
+     if (self->rewritten) self->rewritten->destroy(self->rewritten);
+     return self->rewritten = tq;
+   } else {
+     return self;
+   }
+ }
+
+ void phq_add_term(Query *self, Term *term, int pos_inc)
+ {
+   GET_PHQ;
+   int position, index = phq->t_cnt;
+   if (index >= phq->t_capa) {
+     phq->t_capa *= 2;
+     REALLOC_N(phq->terms, Term *, phq->t_capa);
+     REALLOC_N(phq->positions, int, phq->t_capa);
+   }
+   if (index == 0) {
+     position = 0;
+     phq->field = term->field;
+   } else {
+     position = phq->positions[index - 1] + pos_inc;
+     if (strcmp(term->field, phq->field) != 0) {
+       eprintf(ARG_ERROR, "All phrase terms must be in the same field. Current phrase is %s, tried to add %s\n", phq->field, term->field);
+     }
+   }
+   phq->terms[index] = term;
+   phq->positions[index] = position;
+   phq->t_cnt++;
+ }
+
+ Query *phq_create()
+ {
+   Query *self = q_create();
+   PhraseQuery *phq = ALLOC(PhraseQuery);
+   ZEROSET(phq, PhraseQuery, 1);
+   phq->t_capa = PHQ_INIT_CAPA;
+   phq->terms = ALLOC_N(Term *, PHQ_INIT_CAPA);
+   phq->positions = ALLOC_N(int, PHQ_INIT_CAPA);
+   self->data = phq;
+
+   self->create_weight = &phw_create;
+   self->extract_terms = &phq_extract_terms;
+   self->to_s = &phq_to_s;
+   self->destroy = &phq_destroy;
+   self->rewrite = &phq_rewrite;
+   self->type = PHRASE_QUERY;
+   return self;
+ }
+
+ /***************************************************************************
+  *
+  * PhraseScorer
+  *
+  ***************************************************************************/
+
+ /***************************************************************************
+  * PhrasePosition
+  ***************************************************************************/
+
+ bool pp_next(PhrasePosition *self)
+ {
+   TermDocEnum *tpe = self->tpe;
+   if (!tpe->next(tpe)) {
+     tpe->close(tpe); // close stream
+     self->tpe = NULL;
+     self->doc = INT_MAX; // sentinel value
+     return false;
+   }
+   self->doc = tpe->doc_num(tpe);
+   self->position = 0;
+   return true;
+ }
+
+ bool pp_skip_to(PhrasePosition *self, int doc_num)
+ {
+   TermDocEnum *tpe = self->tpe;
+   if (!tpe->skip_to(tpe, doc_num)) {
+     tpe->close(tpe); // close stream
+     self->tpe = NULL;
+     self->doc = INT_MAX; // sentinel value
+     return false;
+   }
+   self->doc = tpe->doc_num(tpe);
+   self->position = 0;
+   return true;
+ }
+
+ bool pp_next_position(PhrasePosition *self)
+ {
+   TermDocEnum *tpe = self->tpe;
+   self->count -= 1;
+   if (self->count >= 0) { // read subsequent pos's
+     self->position = tpe->next_position(tpe) - self->offset;
+     return true;
+   } else {
+     return false;
+   }
+ }
+
+ bool pp_first_position(PhrasePosition *self)
+ {
+   TermDocEnum *tpe = self->tpe;
+   self->count = tpe->freq(tpe); // read first pos
+   return pp_next_position(self);
+ }
+
+ char *pp_to_s(PhrasePosition *self)
+ {
+   return epstrdup("pp->(doc => %d, position => %d)", 40, self->doc, self->position);
+ }
+
+ inline int pp_cmp(const void *const p1, const void *const p2)
+ {
+   PhrasePosition *pp1 = *(PhrasePosition **)p1;
+   PhrasePosition *pp2 = *(PhrasePosition **)p2;
+   int cmp = pp1->doc - pp2->doc;
+   if (cmp == 0) {
+     return pp1->position - pp2->position;
+   } else {
+     return cmp;
+   }
+ }
+
+ bool pp_less_than(void *p1, void *p2)
+ {
+   PhrasePosition *pp1 = (PhrasePosition *)p1;
+   PhrasePosition *pp2 = (PhrasePosition *)p2;
+   if (pp1->doc == pp2->doc) {
+     return pp1->position < pp2->position;
+   } else {
+     return pp1->doc < pp2->doc;
+   }
+ }
+
+ void pp_destroy(void *p)
+ {
+   PhrasePosition *pp = (PhrasePosition *)p;
+   if (pp->tpe) pp->tpe->close(pp->tpe);
+   free(pp);
+ }
+
+ PhrasePosition *pp_create(TermDocEnum *tpe, int offset)
+ {
+   PhrasePosition *self = ALLOC(PhrasePosition);
+   self->tpe = tpe;
+   self->count = self->doc = self->position = -1;
+   self->offset = offset;
+   return self;
+ }
+
+ /***************************************************************************
+  * PhraseScorer
+  ***************************************************************************/
+
+ #define GET_PHSC PhraseScorer *phsc = (PhraseScorer *)self->data;
+
+ void phsc_init(PhraseScorer *phsc)
+ {
+   int i;
+   for (i = phsc->pp_cnt - 1; i >= 0; i--) {
+     if (!(phsc->more = pp_next(phsc->phrase_pos[i]))) break;
+   }
+
+   if (phsc->more) {
+     qsort(phsc->phrase_pos, phsc->pp_cnt, sizeof(PhrasePosition *), &pp_cmp);
+     phsc->pp_first = 0;
+     phsc->pp_last = phsc->pp_cnt - 1;
+   }
+ }
+
+ #define FIRST_TO_LAST() \
+   last = first;\
+   phsc->pp_last = phsc->pp_first;\
+   phsc->pp_first = (phsc->pp_first + 1) % phsc->pp_cnt;\
+   first = phsc->phrase_pos[phsc->pp_first];
+
+ bool phsc_do_next(Scorer *self)
+ {
+   GET_PHSC;
+   PhrasePosition *first = phsc->phrase_pos[phsc->pp_first];
+   PhrasePosition *last = phsc->phrase_pos[phsc->pp_last];
+
+   while (phsc->more) {
+     while (phsc->more && first->doc < last->doc) { // find doc w/ all the terms
+       phsc->more = pp_skip_to(first, last->doc); // skip first upto last
+       FIRST_TO_LAST(); // and move it to the end
+     }
+
+     if (phsc->more) {
+       // found a doc with all of the terms
+       phsc->freq = phsc->phrase_freq(self); // check for phrase
+       if (phsc->freq == 0.0) { // no match
+         first = phsc->phrase_pos[phsc->pp_first];
+         last = phsc->phrase_pos[phsc->pp_last];
+         phsc->more = pp_next(last); // trigger further scanning
+       } else {
+         self->doc = first->doc;
+         return true; // found a match
+       }
+     }
+   }
+   return false;
+ }
+
+ float phsc_score(Scorer *self)
+ {
+   GET_PHSC;
+   float raw = sim_tf(self->similarity, phsc->freq) * phsc->value; // raw score
+   // normalize
+   return raw * sim_decode_norm(self->similarity,
+       phsc->norms[phsc->phrase_pos[phsc->pp_first]->doc]);
+ }
+
+ bool phsc_next(Scorer *self)
+ {
+   GET_PHSC;
+   if (phsc->first_time) {
+     phsc_init(phsc);
+     phsc->first_time = false;
+   } else if (phsc->more) {
+     phsc->more = pp_next(phsc->phrase_pos[phsc->pp_last]); // trigger further scanning
+   }
+   return phsc_do_next(self);
+ }
+
+ bool phsc_skip_to(Scorer *self, int doc_num)
+ {
+   GET_PHSC;
+   int i;
+   for (i = phsc->pp_cnt - 1; i >= 0; i--) {
+     if (!(phsc->more = pp_skip_to(phsc->phrase_pos[i], doc_num))) break;
+   }
+
+   if (phsc->more) {
+     qsort(phsc->phrase_pos, phsc->pp_cnt, sizeof(PhrasePosition *), &pp_cmp);
+     phsc->pp_first = 0;
+     phsc->pp_last = phsc->pp_cnt - 1;
+   }
+   return phsc_do_next(self);
+ }
+
+ Explanation *phsc_explain(Scorer *self, int doc_num)
+ {
+   char dbuf[32];
+   GET_PHSC;
+   while (phsc_next(self) && self->doc < doc_num)
+     ;
+
+   float phrase_freq = (self->doc == doc_num) ? phsc->freq : 0.0;
+   dbl_to_s(dbuf, phrase_freq);
+   return expl_create(sim_tf(self->similarity, phrase_freq),
+       epstrdup("tf(phrase_freq=%s)", strlen(dbuf), dbuf));
+ }
+
+ void phsc_destroy(void *p)
+ {
+   Scorer *self = (Scorer *)p;
+   GET_PHSC;
+   int i;
+   for (i = phsc->pp_cnt - 1; i >= 0; i--) {
+     pp_destroy(phsc->phrase_pos[i]);
+   }
+   free(phsc->phrase_pos);
+   scorer_destroy(self);
+ }
+
+ Scorer *phsc_create(Weight *weight, TermDocEnum **term_pos_enum,
+     int *positions, int t_cnt, Similarity *similarity, uchar *norms)
+ {
+   int i;
+   Scorer *self = scorer_create(similarity);
+   PhraseScorer *phsc = ALLOC(PhraseScorer);
+   ZEROSET(phsc, PhraseScorer, 1);
+
+   phsc->weight = weight;
+   phsc->norms = norms;
+   phsc->value = weight->value;
+
+   phsc->phrase_pos = ALLOC_N(PhrasePosition *, t_cnt);
+   for (i = 0; i < t_cnt; i++) {
+     phsc->phrase_pos[i] = pp_create(term_pos_enum[i], positions[i]);
+   }
+   phsc->pp_first = 0;
+   phsc->pp_last = t_cnt - 1;
+   phsc->pp_cnt = t_cnt;
+
+   phsc->slop = 0;
+
+   phsc->first_time = true;
+   phsc->more = true;
+
+   self->data = phsc;
+   self->score = &phsc_score;
+   self->next = &phsc_next;
+   self->skip_to = &phsc_skip_to;
+   self->explain = &phsc_explain;
+   self->destroy = &phsc_destroy;
+
+   return self;
+ }
+
+ /***************************************************************************
+  * ExactPhraseScorer
+  ***************************************************************************/
+
+ float ephsc_phrase_freq(Scorer *self)
+ {
+   GET_PHSC;
+   // sort list with pq
+   int i;
+   float freq = 0.0;
+   PhrasePosition *first;
+   PhrasePosition *last;
+
+   for (i = phsc->pp_cnt - 1; i >= 0; i--) {
+     pp_first_position(phsc->phrase_pos[i]);
+   }
+   qsort(phsc->phrase_pos, phsc->pp_cnt, sizeof(PhrasePosition *), &pp_cmp);
+   phsc->pp_first = 0;
+   phsc->pp_last = phsc->pp_cnt - 1;
+
+   first = phsc->phrase_pos[0];
+   last = phsc->phrase_pos[phsc->pp_last];
+
+   do { // find position w/ all terms
+     while (first->position < last->position) { // scan forward in first
+       do {
+         if (! pp_next_position(first)) return freq;
+       } while (first->position < last->position);
+       FIRST_TO_LAST();
+     }
+     freq += 1.0; // all equal: a match
+   } while (pp_next_position(last));
+
+   return freq;
+ }
+
+ Scorer *exact_phrase_scorer_create(Weight *weight, TermDocEnum **term_pos_enum,
+     int *positions, int t_cnt, Similarity *similarity, uchar *norms)
+ {
+   Scorer *self =
+       phsc_create(weight, term_pos_enum, positions, t_cnt, similarity, norms);
+   GET_PHSC;
+   phsc->phrase_freq = &ephsc_phrase_freq;
+   return self;
+ }
+
+ /***************************************************************************
+  * SloppyPhraseScorer
+  ***************************************************************************/
+
+ float sphsc_phrase_freq(Scorer *self)
+ {
+   GET_PHSC;
+   PhrasePosition *pp;
+   PriorityQueue *pq = pq_create(phsc->pp_cnt, &pp_less_than);
+
+   int last_pos = 0, pos, next_pos, start, match_length, i;
+   bool done = false;
+   float freq = 0.0;
+
+   for (i = phsc->pp_cnt - 1; i >= 0; i--) {
+     pp = phsc->phrase_pos[i];
+     pp_first_position(pp);
+     if (pp->position > last_pos) last_pos = pp->position;
+     pq_push(pq, pp);
+   }
+
+   do {
+     pp = pq_pop(pq);
+     pos = start = pp->position;
+     next_pos = ((PhrasePosition *)pq_top(pq))->position;
+     while (pos <= next_pos) {
+       start = pos; // advance pp to min window
+       if (!pp_next_position(pp)) {
+         done = true; // ran out of a term -- done
+         break;
+       }
+       pos = pp->position;
+     }
+
+     match_length = last_pos - start;
+     if (match_length <= phsc->slop) {
+       freq += sim_sloppy_freq(self->similarity, match_length); // score match
+     }
+
+     if (pp->position > last_pos) {
+       last_pos = pp->position;
+     }
+     pq_push(pq, pp); // restore pq
+   } while (!done);
+
+   pq_destroy(pq);
+   return freq;
+ }
+
+ Scorer *sloppy_phrase_scorer_create(Weight *weight, TermDocEnum **term_pos_enum,
+     int *positions, int t_cnt, Similarity *similarity, int slop, uchar *norms)
+ {
+   Scorer *self =
+       phsc_create(weight, term_pos_enum, positions, t_cnt, similarity, norms);
+   GET_PHSC;
+   phsc->slop = slop;
+   phsc->phrase_freq = &sphsc_phrase_freq;
+   return self;
+ }
+