ferret 0.9.6 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/q_phrase.c CHANGED
@@ -1,684 +1,1121 @@
1
1
  #include <string.h>
2
+ #include <limits.h>
2
3
  #include "search.h"
4
+ #include "array.h"
5
+
6
+ #define PhQ(query) ((PhraseQuery *)(query))
7
+
8
+ static int phrase_pos_cmp(const void *p1, const void *p2)
9
+ {
10
+ int pos1 = ((PhrasePosition *)p1)->pos;
11
+ int pos2 = ((PhrasePosition *)p2)->pos;
12
+ if (pos1 > pos2) {
13
+ return 1;
14
+ }
15
+ if (pos1 < pos2) {
16
+ return -1;
17
+ }
18
+ return strcmp(((PhrasePosition *)p1)->terms[0],
19
+ ((PhrasePosition *)p2)->terms[0]);
20
+ }
3
21
 
4
- static char * const FIELD_CHANGE_ERROR_MSG = "Field illegally changed in the phrase";
5
22
 
6
23
  /***************************************************************************
7
24
  *
8
- * PhraseWeight
25
+ * PhraseScorer
9
26
  *
10
27
  ***************************************************************************/
11
28
 
12
- Scorer *phw_scorer(Weight *self, IndexReader *ir)
13
- {
14
- Scorer *phsc;
15
- PhraseQuery *phq = (PhraseQuery *)self->query->data;
16
- int i;
17
- TermDocEnum **tps;
18
-
19
- if (phq->t_cnt == 0) {
20
- return NULL; /* optimize zero-term case */
21
- }
22
-
23
- tps = ALLOC_N(TermDocEnum *, phq->t_cnt);
24
-
25
- for (i = 0; i < phq->t_cnt; i++) {
26
- tps[i] = ir_term_positions_for(ir, phq->terms[i]);
27
- if (tps[i] == NULL) {
28
- // free everything we just created and return NULL
29
- int j;
30
- for (j = 0; j < i; j++) {
31
- tps[i]->close(tps[i]);
32
- }
33
- free(tps);
34
- return NULL;
35
- }
36
- }
37
-
38
- if (phq->slop == 0) { // optimize exact case
39
- phsc = exact_phrase_scorer_create(self, tps, phq->positions, phq->t_cnt,
40
- self->similarity,
41
- ir->get_norms(ir, phq->field));
42
- } else {
43
- phsc = sloppy_phrase_scorer_create(self, tps, phq->positions, phq->t_cnt,
44
- self->similarity,
45
- phq->slop,
46
- ir->get_norms(ir, phq->field));
47
- }
48
- free(tps);
49
- return phsc;
29
+ /***************************************************************************
30
+ * PhPos
31
+ ***************************************************************************/
32
+
33
+ #define PP(p) ((PhPos *)(p))
34
+ typedef struct PhPos
35
+ {
36
+ TermDocEnum *tpe;
37
+ int offset;
38
+ int count;
39
+ int doc;
40
+ int position;
41
+ } PhPos;
42
+
43
+ static bool pp_next(PhPos *self)
44
+ {
45
+ TermDocEnum *tpe = self->tpe;
46
+ if (!tpe->next(tpe)) {
47
+ tpe->close(tpe); /* close stream */
48
+ self->tpe = NULL;
49
+ self->doc = INT_MAX; /* sentinel value */
50
+ return false;
51
+ }
52
+ self->doc = tpe->doc_num(tpe);
53
+ self->position = 0;
54
+ return true;
50
55
  }
51
56
 
52
- Explanation *phw_explain(Weight *self, IndexReader *ir, int doc_num)
57
+ static bool pp_skip_to(PhPos *self, int doc_num)
53
58
  {
54
- Explanation *idf_expl1;
55
- Explanation *idf_expl2;
56
- Explanation *query_expl;
57
- Explanation *qnorm_expl;
58
- Explanation *field_expl;
59
- Explanation *tf_expl;
60
- Scorer *scorer;
61
- uchar *field_norms;
62
- float field_norm;
63
- Explanation *field_norm_expl;
64
-
65
- char *query_str = self->query->to_s(self->query, "");
66
- PhraseQuery *phq = (PhraseQuery *)self->query->data;
67
- int i;
68
- char *doc_freqs = NULL;
69
- int len = 0, pos = 0;
70
-
71
- Explanation *expl = expl_create(0.0,
72
- strfmt("weight(%s in %d), product of:", query_str, doc_num));
73
-
74
- for (i = 0; i < phq->t_cnt; i++) {
75
- len += (int)strlen(phq->terms[i]->text) + 30;
76
- }
77
- doc_freqs = ALLOC_N(char, len);
78
- for (i = 0; i < phq->t_cnt; i++) {
79
- Term *term = phq->terms[i];
80
- sprintf(doc_freqs + pos, "%s=%d, ", term->text, ir->doc_freq(ir, term));
81
- pos += (int)strlen(doc_freqs + pos);
82
- }
83
- pos -= 2; // remove ", " from the end
84
- doc_freqs[pos] = 0;
85
-
86
- idf_expl1 = expl_create(self->idf,
87
- strfmt("idf(%s:<%s>)", phq->field, doc_freqs));
88
- idf_expl2 = expl_create(self->idf,
89
- strfmt("idf(%s:<%s>)", phq->field, doc_freqs));
90
- free(doc_freqs);
91
-
92
- /* explain query weight */
93
- query_expl = expl_create(0.0,
94
- strfmt("query_weight(%s), product of:", query_str));
95
-
96
- if (self->query->boost != 1.0) {
97
- expl_add_detail(query_expl, expl_create(self->query->boost, estrdup("boost")));
98
- }
99
- expl_add_detail(query_expl, idf_expl1);
100
-
101
- qnorm_expl = expl_create(self->qnorm, estrdup("query_norm"));
102
- expl_add_detail(query_expl, qnorm_expl);
103
-
104
- query_expl->value = self->query->boost * self->idf * self->qnorm;
105
-
106
- expl_add_detail(expl, query_expl);
107
-
108
- /* explain field weight */
109
- field_expl = expl_create(0.0,
110
- strfmt("field_weight(%s in %d), product of:", query_str, doc_num));
111
- free(query_str);
112
-
113
- scorer = self->scorer(self, ir);
114
- tf_expl = scorer->explain(scorer, doc_num);
115
- scorer->destroy(scorer);
116
- expl_add_detail(field_expl, tf_expl);
117
- expl_add_detail(field_expl, idf_expl2);
118
-
119
- field_norms = ir->get_norms(ir, phq->field);
120
- field_norm = (field_norms != NULL)
121
- ? sim_decode_norm(self->similarity, field_norms[doc_num])
122
- : (float)0.0;
123
- field_norm_expl = expl_create(field_norm,
124
- strfmt("field_norm(field=%s, doc=%d)", phq->field, doc_num));
125
-
126
- expl_add_detail(field_expl, field_norm_expl);
127
-
128
- field_expl->value = tf_expl->value * self->idf * field_norm;
129
-
130
- /* combine them */
131
- if (query_expl->value == 1.0) {
132
- expl_destoy(expl);
133
- return field_expl;
134
- } else {
135
- expl->value = (query_expl->value * field_expl->value);
136
- expl_add_detail(expl, field_expl);
137
- return expl;
138
- }
139
- }
140
-
141
- char *phw_to_s(Weight *self)
142
- {
143
- return strfmt("PhraseWeight(%f)", self->value);
144
- }
145
-
146
- Weight *phw_create(Query *query, Searcher *searcher)
147
- {
148
- Weight *self = w_create(query);
149
- PhraseQuery *phq = (PhraseQuery *)query->data;
150
-
151
- self->scorer = &phw_scorer;
152
- self->explain = &phw_explain;
153
- self->to_s = &phw_to_s;
154
- self->sum_of_squared_weights = &w_sum_of_squared_weights;
155
-
156
- self->similarity = query->get_similarity(query, searcher);
157
- self->value = query->boost;
158
- self->idf = sim_idf_phrase(self->similarity, phq->terms, phq->t_cnt, searcher);
159
-
160
- return self;
59
+ TermDocEnum *tpe = self->tpe;
60
+ if (!tpe->skip_to(tpe, doc_num)) {
61
+ tpe->close(tpe); /* close stream */
62
+ self->tpe = NULL;
63
+ self->doc = INT_MAX; /* sentinel value */
64
+ return false;
65
+ }
66
+ self->doc = tpe->doc_num(tpe);
67
+ self->position = 0;
68
+ return true;
161
69
  }
162
70
 
163
- /***************************************************************************
164
- *
165
- * PhraseQuery
166
- *
167
- ***************************************************************************/
71
+ static bool pp_next_position(PhPos *self)
72
+ {
73
+ TermDocEnum *tpe = self->tpe;
74
+ self->count--;
75
+ if (self->count >= 0) { /* read subsequent pos's */
76
+ self->position = tpe->next_position(tpe) - self->offset;
77
+ return true;
78
+ }
79
+ else {
80
+ return false;
81
+ }
82
+ }
168
83
 
169
- #define GET_PHQ PhraseQuery *phq = (PhraseQuery *)self->data
170
-
171
- void phq_extract_terms(Query *self, HashSet *terms)
172
- {
173
- GET_PHQ;
174
- int i;
175
- for (i = 0; i < phq->t_cnt; i++) {
176
- hs_add(terms, term_clone(phq->terms[i]));
177
- }
178
- }
179
-
180
- char *phq_to_s(Query *self, char *field)
181
- {
182
- GET_PHQ;
183
- int i, j, buf_index = 0, len = 0, pos, last_pos = -1;
184
- char *buffer;
185
- if (!phq->t_cnt) return NULL;
186
- len = (int)strlen(phq->field) + 1;
187
- for (i = 0; i < phq->t_cnt; i++) {
188
- len += (int)strlen(phq->terms[i]->text) + 1;
189
- }
190
- // add space for extra characters and boost and slop
191
- len += 100 + 3 * phq->positions[phq->t_cnt - 1];
192
-
193
- buffer = ALLOC_N(char, len);
194
-
195
- if (strcmp(field, phq->field) != 0) {
196
- len = (int)strlen(phq->field);
197
- memcpy(buffer, phq->field, len);
198
- buffer[len] = ':';
199
- buf_index += len + 1;
200
- }
201
- buffer[buf_index++] = '"';
202
-
203
- for (i = 0; i < phq->t_cnt; i++) {
204
- Term *term = phq->terms[i];
205
- pos = phq->positions[i];
206
- for (j = last_pos; j < pos - 1; j++) {
207
- memcpy(buffer + buf_index, "<> ", 3);
208
- buf_index += 3;
209
- }
210
- last_pos = pos;
211
-
212
- len = (int)strlen(term->text);
213
- memcpy(buffer + buf_index, term->text, len);
214
- buf_index += len;
215
- buffer[buf_index++] = ' ';
216
- }
217
- if (buffer[buf_index-1] == ' ') buf_index--;
218
- buffer[buf_index++] = '"';
219
- buffer[buf_index] = 0;
220
- if (phq->slop != 0) {
221
- sprintf(buffer + buf_index, "~%d", phq->slop);
222
- buf_index += (int)strlen(buffer + buf_index);
223
- }
224
- if (self->boost != 1.0) {
225
- buffer[buf_index++] = '^';
226
- dbl_to_s(buffer + buf_index, self->boost);
227
- }
228
- return buffer;
229
- }
230
-
231
- void phq_destroy(Query *self)
232
- {
233
- GET_PHQ;
234
- int i;
235
- if (self->destroy_all) {
236
- for (i = 0; i < phq->t_cnt; i++) {
237
- term_destroy(phq->terms[i]);
238
- }
239
- }
240
- free(phq->terms);
241
- free(phq->positions);
242
- free(phq);
243
-
244
- q_destroy_i(self);
245
- }
246
-
247
- Query *phq_rewrite(Query *self, IndexReader *ir)
248
- {
249
- GET_PHQ;
250
- if (phq->t_cnt == 1) { // optimize one-term case
251
- Term *term = phq->terms[0];
252
- Query *tq = tq_create(term_clone(term));
253
- tq->boost = self->boost;
254
- return tq;
255
- } else {
256
- self->ref_cnt++;
257
- return self;
258
- }
84
+ static bool pp_first_position(PhPos *self)
85
+ {
86
+ TermDocEnum *tpe = self->tpe;
87
+ self->count = tpe->freq(tpe); /* read first pos */
88
+ return pp_next_position(self);
259
89
  }
260
90
 
261
- void phq_add_term(Query *self, Term *term, int pos_inc)
91
+ /*
92
+ static char *pp_to_s(PhPos *self)
262
93
  {
263
- GET_PHQ;
264
- int position, index = phq->t_cnt;
265
- if (index >= phq->t_capa) {
266
- phq->t_capa *= 2;
267
- REALLOC_N(phq->terms, Term *, phq->t_capa);
268
- REALLOC_N(phq->positions, int, phq->t_capa);
269
- }
270
- if (index == 0) {
271
- position = 0;
272
- phq->field = term->field;
273
- } else {
274
- position = phq->positions[index - 1] + pos_inc;
275
- if (strcmp(term->field, phq->field) != 0) {
276
- RAISE(ARG_ERROR, FIELD_CHANGE_ERROR_MSG);
94
+ return strfmt("pp->(doc => %d, position => %d)", self->doc, self->position);
95
+ }
96
+ */
97
+
98
+ #define PP_pp(p) (*(PhPos **)p)
99
+ static int pp_cmp(const void *const p1, const void *const p2)
100
+ {
101
+ int cmp = PP_pp(p1)->doc - PP_pp(p2)->doc;
102
+ if (cmp == 0) {
103
+ return PP_pp(p1)->position - PP_pp(p2)->position;
104
+ }
105
+ else {
106
+ return cmp;
277
107
  }
278
- }
279
- phq->terms[index] = term;
280
- phq->positions[index] = position;
281
- phq->t_cnt++;
282
108
  }
283
109
 
284
- static uint phq_hash(Query *self)
110
+ static int pp_pos_cmp(const void *const p1, const void *const p2)
285
111
  {
286
- int i;
287
- uint hash = 0;
288
- PhraseQuery *phq = (PhraseQuery *)self->data;
289
- for (i = 0; i < phq->t_cnt; i++) {
290
- hash = (hash << 1) ^ (term_hash(phq->terms[i]) ^ phq->positions[i]);
291
- }
292
- return (hash ^ phq->slop);
112
+ return PP_pp(p1)->position - PP_pp(p2)->position;
293
113
  }
294
114
 
295
- static int phq_eq(Query *self, Query *o)
115
+ static bool pp_less_than(const PhPos *pp1, const PhPos *pp2)
116
+ {
117
+ /* docs will all be equal when this method is used */
118
+ return pp1->position < pp2->position;
119
+ /*
120
+ if (PP(p)->doc == PP(p)->doc) {
121
+ return PP(p)->position < PP(p)->position;
122
+ }
123
+ else {
124
+ return PP(p)->doc < PP(p)->doc;
125
+ }
126
+ */
127
+ }
128
+
129
+ void pp_destroy(PhPos *pp)
296
130
  {
297
- int i;
298
- PhraseQuery *phq1 = (PhraseQuery *)self->data;
299
- PhraseQuery *phq2 = (PhraseQuery *)o->data;
300
- if (phq1->slop != phq2->slop) return false;
301
- for (i = 0; i < phq1->t_cnt; i++) {
302
- if (!term_eq(phq1->terms[i], phq2->terms[i]) ||
303
- (phq1->positions[i] != phq2->positions[i])) return false;
304
- }
305
- return true;
131
+ if (pp->tpe) {
132
+ pp->tpe->close(pp->tpe);
133
+ }
134
+ free(pp);
306
135
  }
307
136
 
308
- Query *phq_create()
137
+ PhPos *pp_new(TermDocEnum *tpe, int offset)
309
138
  {
310
- Query *self = q_create();
311
- PhraseQuery *phq = ALLOC_AND_ZERO_N(PhraseQuery, 1);
139
+ PhPos *self = ALLOC(PhPos);
312
140
 
313
- phq->t_capa = PHQ_INIT_CAPA;
314
- phq->terms = ALLOC_N(Term *, PHQ_INIT_CAPA);
315
- phq->positions = ALLOC_N(int, PHQ_INIT_CAPA);
316
- self->data = phq;
141
+ self->tpe = tpe;
142
+ self->count = self->doc = self->position = -1;
143
+ self->offset = offset;
317
144
 
318
- self->type = PHRASE_QUERY;
319
- self->rewrite = &phq_rewrite;
320
- self->extract_terms = &phq_extract_terms;
321
- self->to_s = &phq_to_s;
322
- self->hash = &phq_hash;
323
- self->eq = &phq_eq;
324
- self->destroy_i = &phq_destroy;
325
- self->create_weight_i = &phw_create;
326
- return self;
145
+ return self;
327
146
  }
328
147
 
329
148
  /***************************************************************************
330
- *
331
149
  * PhraseScorer
332
- *
333
150
  ***************************************************************************/
334
151
 
335
- /***************************************************************************
336
- * PhrasePosition
337
- ***************************************************************************/
152
+ #define PhSc(scorer) ((PhraseScorer *)(scorer))
338
153
 
339
- bool pp_next(PhrasePosition *self)
154
+ typedef struct PhraseScorer
340
155
  {
341
- TermDocEnum *tpe = self->tpe;
342
- if (!tpe->next(tpe)) {
343
- tpe->close(tpe); // close stream
344
- self->tpe = NULL;
345
- self->doc = INT_MAX; // sentinel value
346
- return false;
347
- }
348
- self->doc = tpe->doc_num(tpe);
349
- self->position = 0;
350
- return true;
156
+ Scorer super;
157
+ float (*phrase_freq)(Scorer *self);
158
+ float freq;
159
+ uchar *norms;
160
+ float value;
161
+ Weight *weight;
162
+ PhPos **phrase_pos;
163
+ int pp_first_idx;
164
+ int pp_cnt;
165
+ int slop;
166
+ bool first_time : 1;
167
+ bool more : 1;
168
+ } PhraseScorer;
169
+
170
+ static void phsc_init(PhraseScorer *phsc)
171
+ {
172
+ int i;
173
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
174
+ if (!(phsc->more = pp_next(phsc->phrase_pos[i]))) break;
175
+ }
176
+
177
+ if (phsc->more) {
178
+ qsort(phsc->phrase_pos, phsc->pp_cnt,
179
+ sizeof(PhPos *), &pp_cmp);
180
+ phsc->pp_first_idx = 0;
181
+ }
351
182
  }
352
183
 
353
- bool pp_skip_to(PhrasePosition *self, int doc_num)
184
+ static bool phsc_do_next(Scorer *self)
354
185
  {
355
- TermDocEnum *tpe = self->tpe;
356
- if (!tpe->skip_to(tpe, doc_num)) {
357
- tpe->close(tpe); // close stream
358
- self->tpe = NULL;
359
- self->doc = INT_MAX; // sentinel value
186
+ PhraseScorer *phsc = PhSc(self);
187
+ const int pp_cnt = phsc->pp_cnt;
188
+ int pp_first_idx = phsc->pp_first_idx;
189
+ PhPos **phrase_positions = phsc->phrase_pos;
190
+
191
+ PhPos *first = phrase_positions[pp_first_idx];
192
+ PhPos *last = phrase_positions[PREV_NUM(pp_first_idx, pp_cnt)];
193
+
194
+ while (phsc->more) {
195
+ /* find doc with all the terms */
196
+ while (phsc->more && first->doc < last->doc) {
197
+ /* skip first upto last */
198
+ phsc->more = pp_skip_to(first, last->doc);
199
+ last = first;
200
+ pp_first_idx = NEXT_NUM(pp_first_idx, pp_cnt);
201
+ first = phrase_positions[pp_first_idx];
202
+ }
203
+
204
+ if (phsc->more) {
205
+ /* pp_first_idx will be used by phrase_freq */
206
+ phsc->pp_first_idx = pp_first_idx;
207
+
208
+ /* found a doc with all of the terms */
209
+ phsc->freq = phsc->phrase_freq(self);
210
+
211
+ if (phsc->freq == 0.0) { /* no match */
212
+ /* continuing search so re-set first and last */
213
+ pp_first_idx = phsc->pp_first_idx;
214
+ first = phrase_positions[pp_first_idx];
215
+ last = phrase_positions[PREV_NUM(pp_first_idx, pp_cnt)];
216
+ phsc->more = pp_next(last); /* trigger further scanning */
217
+ }
218
+ else {
219
+ self->doc = first->doc;
220
+ return true; /* found a match */
221
+ }
222
+
223
+ }
224
+ }
360
225
  return false;
361
- }
362
- self->doc = tpe->doc_num(tpe);
363
- self->position = 0;
364
- return true;
365
226
  }
366
227
 
367
- bool pp_next_position(PhrasePosition *self)
228
+ static float phsc_score(Scorer *self)
368
229
  {
369
- TermDocEnum *tpe = self->tpe;
370
- self->count -= 1;
371
- if (self->count >= 0) {// read subsequent pos's
372
- self->position = tpe->next_position(tpe) - self->offset;
373
- return true;
374
- } else {
375
- return false;
376
- }
230
+ PhraseScorer *phsc = PhSc(self);
231
+ float raw_score = sim_tf(self->similarity, phsc->freq) * phsc->value;
232
+ /* normalize */
233
+ return raw_score * sim_decode_norm(
234
+ self->similarity,
235
+ phsc->norms[phsc->phrase_pos[phsc->pp_first_idx]->doc]);
377
236
  }
378
237
 
379
- bool pp_first_position(PhrasePosition *self)
238
+ static bool phsc_next(Scorer *self)
380
239
  {
381
- TermDocEnum *tpe = self->tpe;
382
- self->count = tpe->freq(tpe); // read first pos
383
- return pp_next_position(self);
240
+ PhraseScorer *phsc = PhSc(self);
241
+ if (phsc->first_time) {
242
+ phsc_init(phsc);
243
+ phsc->first_time = false;
244
+ }
245
+ else if (phsc->more) {
246
+ /* trigger further scanning */
247
+ phsc->more = pp_next(
248
+ phsc->phrase_pos[PREV_NUM(phsc->pp_first_idx, phsc->pp_cnt)]);
249
+ }
250
+
251
+ return phsc_do_next(self);
252
+ }
253
+
254
+ static bool phsc_skip_to(Scorer *self, int doc_num)
255
+ {
256
+ PhraseScorer *phsc = PhSc(self);
257
+ int i;
258
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
259
+ if (!(phsc->more = pp_skip_to(phsc->phrase_pos[i], doc_num))) {
260
+ break;
261
+ }
262
+ }
263
+
264
+ if (phsc->more) {
265
+ qsort(phsc->phrase_pos, phsc->pp_cnt,
266
+ sizeof(PhPos *), &pp_cmp);
267
+ phsc->pp_first_idx = 0;
268
+ }
269
+ return phsc_do_next(self);
384
270
  }
385
271
 
386
- char *pp_to_s(PhrasePosition *self)
272
+ static Explanation *phsc_explain(Scorer *self, int doc_num)
387
273
  {
388
- return strfmt("pp->(doc => %d, position => %d)", self->doc, self->position);
274
+ PhraseScorer *phsc = PhSc(self);
275
+ float phrase_freq;
276
+
277
+ phsc_skip_to(self, doc_num);
278
+
279
+ phrase_freq = (self->doc == doc_num) ? phsc->freq : (float)0.0;
280
+ return expl_new(sim_tf(self->similarity, phrase_freq),
281
+ "tf(phrase_freq=%f)", phrase_freq);
389
282
  }
390
283
 
391
- inline int pp_cmp(const void *const p1, const void *const p2)
284
+ static void phsc_destroy(Scorer *self)
392
285
  {
393
- PhrasePosition *pp1 = *(PhrasePosition **)p1;
394
- PhrasePosition *pp2 = *(PhrasePosition **)p2;
395
- int cmp = pp1->doc - pp2->doc;
396
- if (cmp == 0) {
397
- return pp1->position - pp2->position;
398
- } else {
399
- return cmp;
400
- }
286
+ PhraseScorer *phsc = PhSc(self);
287
+ int i;
288
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
289
+ pp_destroy(phsc->phrase_pos[i]);
290
+ }
291
+ free(phsc->phrase_pos);
292
+ scorer_destroy_i(self);
401
293
  }
402
- bool pp_less_than(void *p1, void *p2)
294
+
295
+ static Scorer *phsc_new(Weight *weight, TermDocEnum **term_pos_enum,
296
+ PhrasePosition *positions, int pos_cnt,
297
+ Similarity *similarity, uchar *norms)
403
298
  {
404
- PhrasePosition *pp1 = (PhrasePosition *)p1;
405
- PhrasePosition *pp2 = (PhrasePosition *)p2;
406
- if (pp1->doc == pp2->doc) {
407
- return pp1->position < pp2->position;
408
- } else {
409
- return pp1->doc < pp2->doc;
410
- }
299
+ int i;
300
+ Scorer *self = scorer_new(PhraseScorer, similarity);
301
+
302
+ PhSc(self)->weight = weight;
303
+ PhSc(self)->norms = norms;
304
+ PhSc(self)->value = weight->value;
305
+ PhSc(self)->phrase_pos = ALLOC_N(PhPos *, pos_cnt);
306
+ PhSc(self)->pp_first_idx = 0;
307
+ PhSc(self)->pp_cnt = pos_cnt;
308
+ PhSc(self)->slop = 0;
309
+ PhSc(self)->first_time = true;
310
+ PhSc(self)->more = true;
311
+
312
+ for (i = 0; i < pos_cnt; i++) {
313
+ PhSc(self)->phrase_pos[i] = pp_new(term_pos_enum[i], positions[i].pos);
314
+ }
315
+
316
+ self->score = &phsc_score;
317
+ self->next = &phsc_next;
318
+ self->skip_to = &phsc_skip_to;
319
+ self->explain = &phsc_explain;
320
+ self->destroy = &phsc_destroy;
321
+
322
+ return self;
411
323
  }
412
324
 
413
- void pp_destroy(PhrasePosition *pp)
325
+ /***************************************************************************
326
+ * ExactPhraseScorer
327
+ ***************************************************************************/
328
+
329
+ static float ephsc_phrase_freq(Scorer *self)
414
330
  {
415
- if (pp->tpe) pp->tpe->close(pp->tpe);
416
- free(pp);
331
+ PhraseScorer *phsc = PhSc(self);
332
+ int i;
333
+ int pp_first_idx = 0;
334
+ const int pp_cnt = phsc->pp_cnt;
335
+ float freq = 0.0;
336
+ PhPos **phrase_positions = phsc->phrase_pos;
337
+ PhPos *first;
338
+ PhPos *last;
339
+
340
+ for (i = 0; i < pp_cnt; i++) {
341
+ pp_first_position(phrase_positions[i]);
342
+ }
343
+ qsort(phrase_positions, pp_cnt, sizeof(PhPos *), &pp_pos_cmp);
344
+
345
+ first = phrase_positions[0];
346
+ last = phrase_positions[pp_cnt - 1];
347
+
348
+ /* scan to position with all terms */
349
+ do {
350
+ /* scan forward in first */
351
+ while (first->position < last->position) {
352
+ do {
353
+ if (! pp_next_position(first)) {
354
+ /* maintain first position */
355
+ phsc->pp_first_idx = pp_first_idx;
356
+ return freq;
357
+ }
358
+ } while (first->position < last->position);
359
+ last = first;
360
+ pp_first_idx = NEXT_NUM(pp_first_idx, pp_cnt);
361
+ first = phrase_positions[pp_first_idx];
362
+ }
363
+ freq += 1.0; /* all equal: a match */
364
+ } while (pp_next_position(last));
365
+
366
+ /* maintain first position */
367
+ phsc->pp_first_idx = pp_first_idx;
368
+ return freq;
417
369
  }
418
370
 
419
- PhrasePosition *pp_create(TermDocEnum *tpe, int offset)
371
+ static Scorer *exact_phrase_scorer_new(Weight *weight,
372
+ TermDocEnum **term_pos_enum,
373
+ PhrasePosition *positions, int pp_cnt,
374
+ Similarity *similarity, uchar *norms)
420
375
  {
421
- PhrasePosition *self = ALLOC(PhrasePosition);
422
- self->tpe = tpe;
423
- self->count = self->doc = self->position = -1;
424
- self->offset = offset;
425
- return self;
376
+ Scorer *self =
377
+ phsc_new(weight, term_pos_enum, positions, pp_cnt, similarity, norms);
378
+
379
+ PhSc(self)->phrase_freq = &ephsc_phrase_freq;
380
+ return self;
426
381
  }
427
382
 
428
383
  /***************************************************************************
429
- * PhraseScorer
384
+ * SloppyPhraseScorer
430
385
  ***************************************************************************/
431
386
 
432
- #define GET_PHSC PhraseScorer *phsc = (PhraseScorer *)self->data
387
+ static float sphsc_phrase_freq(Scorer *self)
388
+ {
389
+ PhraseScorer *phsc = PhSc(self);
390
+ PhPos *pp;
391
+ PriorityQueue *pq = pq_new(phsc->pp_cnt, (lt_ft)&pp_less_than, NULL);
392
+ const int pp_cnt = phsc->pp_cnt;
393
+
394
+ int last_pos = 0, pos, next_pos, start, match_length, i;
395
+ bool done = false;
396
+ float freq = 0.0;
397
+
398
+ for (i = 0; i < pp_cnt; i++) {
399
+ pp = phsc->phrase_pos[i];
400
+ pp_first_position(pp);
401
+ if (pp->position > last_pos) {
402
+ last_pos = pp->position;
403
+ }
404
+ pq_push(pq, pp);
405
+ }
433
406
 
407
+ do {
408
+ pp = pq_pop(pq);
409
+ pos = start = pp->position;
410
+ next_pos = PP(pq_top(pq))->position;
411
+ while (pos <= next_pos) {
412
+ start = pos; /* advance pp to min window */
413
+ if (!pp_next_position(pp)) {
414
+ done = true; /* ran out of a positions for a term - done */
415
+ break;
416
+ }
417
+ pos = pp->position;
418
+ }
419
+
420
+ match_length = last_pos - start;
421
+ if (match_length <= phsc->slop) {
422
+ /* score match */
423
+ freq += sim_sloppy_freq(self->similarity, match_length);
424
+ }
425
+
426
+ if (pp->position > last_pos) {
427
+ last_pos = pp->position;
428
+ }
429
+ pq_push(pq, pp); /* restore pq */
430
+ } while (!done);
431
+
432
+ pq_destroy(pq);
433
+ return freq;
434
+ }
434
435
 
435
- void phsc_init(PhraseScorer *phsc)
436
+ static Scorer *sloppy_phrase_scorer_new(Weight *weight,
437
+ TermDocEnum **term_pos_enum,
438
+ PhrasePosition *positions,
439
+ int pp_cnt, Similarity *similarity,
440
+ int slop, uchar *norms)
436
441
  {
437
- int i;
438
- for (i = phsc->pp_cnt - 1; i >= 0; i--) {
439
- if (!(phsc->more = pp_next(phsc->phrase_pos[i]))) break;
440
- }
442
+ Scorer *self =
443
+ phsc_new(weight, term_pos_enum, positions, pp_cnt, similarity, norms);
441
444
 
442
- if (phsc->more) {
443
- qsort(phsc->phrase_pos, phsc->pp_cnt, sizeof(PhrasePosition *), &pp_cmp);
444
- phsc->pp_first = 0;
445
- phsc->pp_last = phsc->pp_cnt - 1;
446
- }
445
+ PhSc(self)->slop = slop;
446
+ PhSc(self)->phrase_freq = &sphsc_phrase_freq;
447
+ return self;
447
448
  }
448
449
 
449
- #define FIRST_TO_LAST() \
450
- last = first;\
451
- phsc->pp_last = phsc->pp_first;\
452
- phsc->pp_first = (phsc->pp_first + 1) % phsc->pp_cnt;\
453
- first = phsc->phrase_pos[phsc->pp_first];
450
+ /***************************************************************************
451
+ *
452
+ * PhraseWeight
453
+ *
454
+ ***************************************************************************/
455
+
456
+ static char *phw_to_s(Weight *self)
457
+ {
458
+ return strfmt("PhraseWeight(%f)", self->value);
459
+ }
454
460
 
455
- bool phsc_do_next(Scorer *self)
461
+ static Scorer *phw_scorer(Weight *self, IndexReader *ir)
456
462
  {
457
- GET_PHSC;
458
- PhrasePosition *first = phsc->phrase_pos[phsc->pp_first];
459
- PhrasePosition *last = phsc->phrase_pos[phsc->pp_last];
463
+ int i;
464
+ Scorer *phsc = NULL;
465
+ PhraseQuery *phq = PhQ(self->query);
466
+ TermDocEnum **tps, *tpe;
467
+ PhrasePosition *positions = phq->positions;
468
+ const int pos_cnt = phq->pos_cnt;
469
+ const int field_num = fis_get_field_num(ir->fis, phq->field);
470
+
471
+ if (pos_cnt == 0 || field_num < 0) {
472
+ return NULL;
473
+ }
460
474
 
461
- while (phsc->more) {
462
- while (phsc->more && first->doc < last->doc) { // find doc w/ all the terms
463
- phsc->more = pp_skip_to(first, last->doc); // skip first upto last
464
- FIRST_TO_LAST(); // and move it to the end
475
+ tps = ALLOC_N(TermDocEnum *, pos_cnt);
476
+
477
+ for (i = 0; i < pos_cnt; i++) {
478
+ char **terms = positions[i].terms;
479
+ const int t_cnt = ary_size(terms);
480
+ if (t_cnt == 1) {
481
+ tpe = tps[i] = ir->term_positions(ir);
482
+ tpe->seek(tpe, field_num, terms[0]);
483
+ }
484
+ else {
485
+ tps[i] = mtdpe_new(ir, field_num, terms, t_cnt);
486
+ }
487
+ if (tps[i] == NULL) {
488
+ /* free everything we just created and return NULL */
489
+ int j;
490
+ for (j = 0; j < i; j++) {
491
+ tps[i]->close(tps[i]);
492
+ }
493
+ free(tps);
494
+ return NULL;
495
+ }
465
496
  }
466
497
 
467
- if (phsc->more) {
468
- // found a doc with all of the terms
469
- phsc->freq = phsc->phrase_freq(self); // check for phrase
470
- if (phsc->freq == 0.0) { // no match
471
- first = phsc->phrase_pos[phsc->pp_first];
472
- last = phsc->phrase_pos[phsc->pp_last];
473
- phsc->more = pp_next(last); // trigger further scanning
474
- } else {
475
- self->doc = first->doc;
476
- return true; // found a match
477
- }
498
+ if (phq->slop == 0) { /* optimize exact (common) case */
499
+ phsc = exact_phrase_scorer_new(self, tps, positions, pos_cnt,
500
+ self->similarity,
501
+ ir->get_norms(ir, field_num));
502
+ }
503
+ else {
504
+ phsc = sloppy_phrase_scorer_new(self, tps, positions, pos_cnt,
505
+ self->similarity, phq->slop,
506
+ ir->get_norms(ir, field_num));
507
+ }
508
+ free(tps);
509
+ return phsc;
510
+ }
511
+
512
+ Explanation *phw_explain(Weight *self, IndexReader *ir, int doc_num)
513
+ {
514
+ Explanation *expl;
515
+ Explanation *idf_expl1;
516
+ Explanation *idf_expl2;
517
+ Explanation *query_expl;
518
+ Explanation *qnorm_expl;
519
+ Explanation *field_expl;
520
+ Explanation *tf_expl;
521
+ Scorer *scorer;
522
+ uchar *field_norms;
523
+ float field_norm;
524
+ Explanation *field_norm_expl;
525
+ char *query_str;
526
+ PhraseQuery *phq = PhQ(self->query);
527
+ const int pos_cnt = phq->pos_cnt;
528
+ PhrasePosition *positions = phq->positions;
529
+ int i, j;
530
+ char *doc_freqs = NULL;
531
+ size_t len = 0, pos = 0;
532
+ const int field_num = fis_get_field_num(ir->fis, phq->field);
533
+
534
+ if (field_num < 0) {
535
+ return expl_new(0.0, "field \"%s\" does not exist in the index", phq->field);
536
+ }
537
+
538
+ query_str = self->query->to_s(self->query, "");
539
+
540
+ expl = expl_new(0.0, "weight(%s in %d), product of:", query_str, doc_num);
541
+
542
+ /* ensure the phrase positions are in order for explanation */
543
+ qsort(positions, pos_cnt, sizeof(PhrasePosition), &phrase_pos_cmp);
544
+
545
+ for (i = 0; i < phq->pos_cnt; i++) {
546
+ char **terms = phq->positions[i].terms;
547
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
548
+ len += strlen(terms[j]) + 30;
549
+ }
550
+ }
551
+ doc_freqs = ALLOC_N(char, len);
552
+ for (i = 0; i < phq->pos_cnt; i++) {
553
+ char **terms = phq->positions[i].terms;
554
+ const int t_cnt = ary_size(terms);
555
+ for (j = 0; j < t_cnt; j++) {
556
+ char *term = terms[j];
557
+ sprintf(doc_freqs + pos, "%s=%d, ",
558
+ term, ir->doc_freq(ir, field_num, term));
559
+ pos += strlen(doc_freqs + pos);
560
+ }
561
+ }
562
+ pos -= 2; /* remove ", " from the end */
563
+ doc_freqs[pos] = 0;
564
+
565
+ idf_expl1 = expl_new(self->idf, "idf(%s:<%s>)", phq->field, doc_freqs);
566
+ idf_expl2 = expl_new(self->idf, "idf(%s:<%s>)", phq->field, doc_freqs);
567
+ free(doc_freqs);
568
+
569
+ /* explain query weight */
570
+ query_expl = expl_new(0.0, "query_weight(%s), product of:", query_str);
571
+
572
+ if (self->query->boost != 1.0) {
573
+ expl_add_detail(query_expl, expl_new(self->query->boost, "boost"));
574
+ }
575
+ expl_add_detail(query_expl, idf_expl1);
576
+
577
+ qnorm_expl = expl_new(self->qnorm, "query_norm");
578
+ expl_add_detail(query_expl, qnorm_expl);
579
+
580
+ query_expl->value = self->query->boost * self->idf * self->qnorm;
581
+
582
+ expl_add_detail(expl, query_expl);
583
+
584
+ /* explain field weight */
585
+ field_expl = expl_new(0.0, "field_weight(%s in %d), product of:",
586
+ query_str, doc_num);
587
+ free(query_str);
588
+
589
+ scorer = self->scorer(self, ir);
590
+ tf_expl = scorer->explain(scorer, doc_num);
591
+ scorer->destroy(scorer);
592
+ expl_add_detail(field_expl, tf_expl);
593
+ expl_add_detail(field_expl, idf_expl2);
594
+
595
+ field_norms = ir->get_norms(ir, field_num);
596
+ field_norm = (field_norms != NULL)
597
+ ? sim_decode_norm(self->similarity, field_norms[doc_num])
598
+ : (float)0.0;
599
+ field_norm_expl = expl_new(field_norm, "field_norm(field=%s, doc=%d)",
600
+ phq->field, doc_num);
601
+
602
+ expl_add_detail(field_expl, field_norm_expl);
603
+
604
+ field_expl->value = tf_expl->value * self->idf * field_norm;
605
+
606
+ /* combine them */
607
+ if (query_expl->value == 1.0) {
608
+ expl_destroy(expl);
609
+ return field_expl;
610
+ }
611
+ else {
612
+ expl->value = (query_expl->value * field_expl->value);
613
+ expl_add_detail(expl, field_expl);
614
+ return expl;
478
615
  }
479
- }
480
- return false;
481
616
  }
482
617
 
483
- float phsc_score(Scorer *self)
618
+ static Weight *phw_new(Query *query, Searcher *searcher)
484
619
  {
485
- GET_PHSC;
486
- float raw = sim_tf(self->similarity, phsc->freq) * phsc->value; // raw score
487
- // normalize
488
- return raw * sim_decode_norm(self->similarity,
489
- phsc->norms[phsc->phrase_pos[phsc->pp_first]->doc]);
620
+ Weight *self = w_new(Weight, query);
621
+
622
+ self->scorer = &phw_scorer;
623
+ self->explain = &phw_explain;
624
+ self->to_s = &phw_to_s;
625
+
626
+ self->similarity = query->get_similarity(query, searcher);
627
+ self->value = query->boost;
628
+ self->idf = sim_idf_phrase(self->similarity, PhQ(query)->field,
629
+ PhQ(query)->positions,
630
+ PhQ(query)->pos_cnt, searcher);
631
+ return self;
490
632
  }
491
633
 
492
- bool phsc_next(Scorer *self)
634
+ /***************************************************************************
635
+ *
636
+ * PhraseQuery
637
+ *
638
+ ***************************************************************************/
639
+
640
+ /* ** TVPosEnum ** */
641
/*
 * Enumerator over the positions of one phrase position inside a term
 * vector. Allocated as a single chunk with the positions stored in the
 * trailing flexible array.
 */
typedef struct TVPosEnum
{
    int index;          /* cursor into positions[]; -1 before the first call
                         * to tvpe_next() */
    int size;           /* number of entries in positions[] */
    int offset;         /* subtracted from a raw position to produce pos */
    int pos;            /* current offset-adjusted position, -1 once the
                         * enumerator is exhausted */
    int positions[];    /* raw positions */
} TVPosEnum;
649
+
650
+ static bool tvpe_next(TVPosEnum *self)
493
651
  {
494
- GET_PHSC;
495
- if (phsc->first_time) {
496
- phsc_init(phsc);
497
- phsc->first_time = false;
498
- } else if (phsc->more) {
499
- phsc->more = pp_next(phsc->phrase_pos[phsc->pp_last]); // trigger further scanning
500
- }
501
- return phsc_do_next(self);
652
+ if (++(self->index) < self->size) {
653
+ self->pos = self->positions[self->index] - self->offset;
654
+ return true;
655
+ }
656
+ else {
657
+ self->pos = -1;
658
+ return false;
659
+ }
502
660
  }
503
661
 
504
- bool phsc_skip_to(Scorer *self, int doc_num)
662
+ static int tvpe_skip_to(TVPosEnum *self, int position)
505
663
  {
506
- GET_PHSC;
507
- int i;
508
- for (i = phsc->pp_cnt - 1; i >= 0; i--) {
509
- if (!(phsc->more = pp_skip_to(phsc->phrase_pos[i], doc_num))) break;
510
- }
664
+ int i;
665
+ int search_pos = position + self->offset;
666
+ for (i = self->index + 1; i < self->size; i++) {
667
+ if (self->positions[i] >= search_pos) {
668
+ self->pos = self->positions[i] - self->offset;
669
+ break;
670
+ }
671
+ }
672
+ self->index = i;
673
+ if (i == self->size) {
674
+ self->pos = -1;
675
+ return false;
676
+ }
677
+ return true;
678
+ }
511
679
 
512
- if (phsc->more) {
513
- qsort(phsc->phrase_pos, phsc->pp_cnt, sizeof(PhrasePosition *), &pp_cmp);
514
- phsc->pp_first = 0;
515
- phsc->pp_last = phsc->pp_cnt - 1;
516
- }
517
- return phsc_do_next(self);
680
+ static bool tvpe_lt(TVPosEnum *tvpe1, TVPosEnum *tvpe2)
681
+ {
682
+ return tvpe1->pos < tvpe2->pos;
518
683
  }
519
684
 
520
- static Explanation *phsc_explain(Scorer *self, int doc_num)
685
+ static TVPosEnum *tvpe_new(int *positions, int size, int offset)
521
686
  {
522
- GET_PHSC;
523
- float phrase_freq;
687
+ TVPosEnum *self = (TVPosEnum *)emalloc(sizeof(TVPosEnum)
688
+ + size * sizeof(int));
689
+ memcpy(self->positions, positions, size * sizeof(int));
690
+ self->size = size;
691
+ self->offset = offset;
692
+ self->index = -1;
693
+ self->pos = -1;
694
+ return self;
695
+ }
524
696
 
525
- while (phsc_next(self) && self->doc < doc_num)
526
- ;
697
+ static TVPosEnum *tvpe_new_merge(char **terms, int t_cnt, TermVector *tv,
698
+ int offset)
699
+ {
700
+ int i, total_positions = 0;
701
+ PriorityQueue *tvpe_pq = pq_new(t_cnt, (lt_ft)tvpe_lt, &free);
702
+ TVPosEnum *self = NULL;
703
+
704
+ for (i = 0; i < t_cnt; i++) {
705
+ TVTerm *tv_term = tv_get_tv_term(tv, terms[i]);
706
+ if (tv_term) {
707
+ TVPosEnum *tvpe = tvpe_new(tv_term->positions, tv_term->freq, 0);
708
+ if (tvpe_next(tvpe)) {
709
+ pq_push(tvpe_pq, tvpe);
710
+ total_positions += tv_term->freq;
711
+ }
712
+ else {
713
+ free(tvpe);
714
+ }
715
+ }
716
+ }
717
+ if (tvpe_pq->size == 0) {
718
+ pq_destroy(tvpe_pq);
719
+ }
720
+ else {
721
+ int index = 0;
722
+ self = (TVPosEnum *)emalloc(sizeof(TVPosEnum)
723
+ + total_positions * sizeof(int));
724
+ self->size = total_positions;
725
+ self->offset = offset;
726
+ self->index = -1;
727
+ self->pos = -1;
728
+ while (tvpe_pq->size > 0) {
729
+ TVPosEnum *top = (TVPosEnum *)pq_top(tvpe_pq);
730
+ self->positions[index++] = top->pos;
731
+ if (! tvpe_next(top)) {
732
+ pq_pop(tvpe_pq);
733
+ free(top);
734
+ }
735
+ else {
736
+ pq_down(tvpe_pq);
737
+ }
738
+ }
739
+ pq_destroy(tvpe_pq);
740
+ }
741
+ return self;
742
+ }
527
743
 
528
- phrase_freq = (self->doc == doc_num) ? phsc->freq : (float)0.0;
529
- return expl_create(sim_tf(self->similarity, phrase_freq),
530
- strfmt("tf(phrase_freq=%f)", phrase_freq));
744
+ static TVPosEnum *get_tvpe(TermVector *tv, char **terms, int t_cnt, int offset)
745
+ {
746
+ TVPosEnum *tvpe = NULL;
747
+ if (t_cnt == 1) {
748
+ TVTerm *tv_term = tv_get_tv_term(tv, terms[0]);
749
+ if (tv_term) {
750
+ tvpe = tvpe_new(tv_term->positions, tv_term->freq, offset);
751
+ }
752
+ }
753
+ else {
754
+ tvpe = tvpe_new_merge(terms, t_cnt, tv, offset);
755
+ }
756
+ return tvpe;
531
757
  }
532
758
 
533
- static void phsc_destroy(Scorer *self)
759
+ static MatchVector *phq_get_matchv_i(Query *self, MatchVector *mv,
760
+ TermVector *tv)
534
761
  {
535
- GET_PHSC;
536
- int i;
537
- for (i = phsc->pp_cnt - 1; i >= 0; i--) {
538
- pp_destroy(phsc->phrase_pos[i]);
539
- }
540
- free(phsc->phrase_pos);
541
- scorer_destroy_i(self);
762
+ if (strcmp(tv->field, PhQ(self)->field) == 0) {
763
+ const int pos_cnt = PhQ(self)->pos_cnt;
764
+ int i;
765
+ int slop = PhQ(self)->slop;
766
+ bool done = false;
767
+
768
+ if (slop > 0) {
769
+ PriorityQueue *tvpe_pq = pq_new(pos_cnt, (lt_ft)tvpe_lt, &free);
770
+ int last_pos = 0;
771
+ for (i = 0; i < pos_cnt; i++) {
772
+ PhrasePosition *pp = &(PhQ(self)->positions[i]);
773
+ const int t_cnt = ary_size(pp->terms);
774
+ TVPosEnum *tvpe = get_tvpe(tv, pp->terms, t_cnt, pp->pos);
775
+ if (tvpe && tvpe_next(tvpe)) {
776
+ if (tvpe->pos > last_pos) {
777
+ last_pos = tvpe->pos;
778
+ }
779
+ pq_push(tvpe_pq, tvpe);
780
+ }
781
+ else {
782
+ done = true;
783
+ free(tvpe);
784
+ break;
785
+ }
786
+ }
787
+ while (! done) {
788
+ TVPosEnum *tvpe = pq_pop(tvpe_pq);
789
+ int pos;
790
+ int start = pos = tvpe->pos;
791
+ int next_pos = ((TVPosEnum *)pq_top(tvpe_pq))->pos;
792
+ while (pos <= next_pos) {
793
+ start = pos;
794
+ if (!tvpe_next(tvpe)) {
795
+ done = true;
796
+ break;
797
+ }
798
+ pos = tvpe->pos;
799
+ }
800
+
801
+ if (last_pos - start <= slop) {
802
+ int min, max = min = start + tvpe->offset;
803
+ for (i = tvpe_pq->size; i > 0; i--) {
804
+ TVPosEnum *t = (TVPosEnum *)tvpe_pq->heap[i];
805
+ int p = t->pos + t->offset;
806
+ max = p > max ? p : max;
807
+ min = p < min ? p : min;
808
+ }
809
+ matchv_add(mv, min, max);
810
+ }
811
+ if (tvpe->pos > last_pos) {
812
+ last_pos = tvpe->pos;
813
+ }
814
+ pq_push(tvpe_pq, tvpe);
815
+ }
816
+
817
+ pq_destroy(tvpe_pq);
818
+ }
819
+ else { /* exact match */
820
+ TVPosEnum **tvpe_a = ALLOC_AND_ZERO_N(TVPosEnum *, pos_cnt);
821
+ TVPosEnum *first, *last;
822
+ int first_index = 0;
823
+ done = false;
824
+ qsort(PhQ(self)->positions, pos_cnt, sizeof(PhrasePosition),
825
+ &phrase_pos_cmp);
826
+ for (i = 0; i < pos_cnt; i++) {
827
+ PhrasePosition *pp = &(PhQ(self)->positions[i]);
828
+ const int t_cnt = ary_size(pp->terms);
829
+ TVPosEnum *tvpe = get_tvpe(tv, pp->terms, t_cnt, pp->pos);
830
+ if (tvpe && ((i == 0 && tvpe_next(tvpe))
831
+ || tvpe_skip_to(tvpe, tvpe_a[i-1]->pos))) {
832
+ tvpe_a[i] = tvpe;
833
+ }
834
+ else {
835
+ done = true;
836
+ free(tvpe);
837
+ break;
838
+ }
839
+ }
840
+
841
+ first = tvpe_a[0];
842
+ last = tvpe_a[pos_cnt - 1];
843
+
844
+ while (!done) {
845
+ while (first->pos < last->pos) {
846
+ if (tvpe_skip_to(first, last->pos)) {
847
+ last = first;
848
+ first_index = NEXT_NUM(first_index, pos_cnt);
849
+ first = tvpe_a[first_index];
850
+ }
851
+ else {
852
+ done = true;
853
+ break;
854
+ }
855
+ }
856
+ if (!done) {
857
+ matchv_add(mv, tvpe_a[0]->pos + tvpe_a[0]->offset,
858
+ tvpe_a[pos_cnt-1]->pos + tvpe_a[pos_cnt-1]->offset);
859
+ }
860
+ if (!tvpe_next(last)) {
861
+ done = true;
862
+ }
863
+ }
864
+ for (i = 0; i < pos_cnt; i++) {
865
+ free(tvpe_a[i]);
866
+ }
867
+ free(tvpe_a);
868
+ }
869
+ }
870
+ return mv;
542
871
  }
543
872
 
544
- Scorer *phsc_create(Weight *weight, TermDocEnum **term_pos_enum,
545
- int *positions, int t_cnt, Similarity *similarity, uchar *norms)
873
+
874
+ /* ** PhraseQuery besides highlighting stuff ** */
875
+
876
+ #define PhQ_INIT_CAPA 4
877
+
878
+ static void phq_extract_terms(Query *self, HashSet *term_set)
546
879
  {
547
- int i;
548
- Scorer *self = scorer_create(similarity);
549
- PhraseScorer *phsc = ALLOC(PhraseScorer);
550
- ZEROSET(phsc, PhraseScorer, 1);
880
+ PhraseQuery *phq = PhQ(self);
881
+ int i, j;
882
+ for (i = 0; i < phq->pos_cnt; i++) {
883
+ char **terms = phq->positions[i].terms;
884
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
885
+ hs_add(term_set, term_new(phq->field, terms[j]));
886
+ }
887
+ }
888
+ }
551
889
 
552
- phsc->weight = weight;
553
- phsc->norms = norms;
554
- phsc->value = weight->value;
890
+ static char *phq_to_s(Query *self, const char *field)
891
+ {
892
+ PhraseQuery *phq = PhQ(self);
893
+ const int pos_cnt = phq->pos_cnt;
894
+ PhrasePosition *positions = phq->positions;
555
895
 
556
- phsc->phrase_pos = ALLOC_N(PhrasePosition *, t_cnt);
557
- for (i = 0; i < t_cnt; i++) {
558
- phsc->phrase_pos[i] = pp_create(term_pos_enum[i], positions[i]);
559
- }
560
- phsc->pp_first = 0;
561
- phsc->pp_last = t_cnt - 1;
562
- phsc->pp_cnt = t_cnt;
896
+ int i, j, buf_index = 0, pos, last_pos;
897
+ size_t len = 0;
898
+ char *buffer;
563
899
 
564
- phsc->slop = 0;
900
+ if (phq->pos_cnt == 0) {
901
+ return NULL;
902
+ }
565
903
 
566
- phsc->first_time = true;
567
- phsc->more = true;
904
+ /* sort the phrase positions by position */
905
+ qsort(positions, pos_cnt, sizeof(PhrasePosition), &phrase_pos_cmp);
568
906
 
569
- self->data = phsc;
570
- self->score = &phsc_score;
571
- self->next = &phsc_next;
572
- self->skip_to = &phsc_skip_to;
573
- self->explain = &phsc_explain;
574
- self->destroy = &phsc_destroy;
907
+ len = strlen(phq->field) + 1;
575
908
 
576
- return self;
577
- }
909
+ for (i = 0; i < pos_cnt; i++) {
910
+ char **terms = phq->positions[i].terms;
911
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
912
+ len += strlen(terms[j]) + 5;
913
+ }
914
+ }
578
915
 
579
- /***************************************************************************
580
- * ExactPhraseScorer
581
- ***************************************************************************/
916
+ /* add space for extra <> characters and boost and slop */
917
+ len += 100 + 3
918
+ * (phq->positions[phq->pos_cnt - 1].pos - phq->positions[0].pos);
582
919
 
583
- float ephsc_phrase_freq(Scorer *self)
584
- {
585
- GET_PHSC;
586
- // sort list with pq
587
- int i;
588
- float freq = 0.0;
589
- PhrasePosition *first;
590
- PhrasePosition *last;
920
+ buffer = ALLOC_N(char, len);
921
+
922
+ if (strcmp(field, phq->field) != 0) {
923
+ len = strlen(phq->field);
924
+ memcpy(buffer, phq->field, len);
925
+ buffer[len] = ':';
926
+ buf_index += len + 1;
927
+ }
591
928
 
592
- for (i = phsc->pp_cnt - 1; i >= 0; i--) {
593
- pp_first_position(phsc->phrase_pos[i]);
594
- }
595
- qsort(phsc->phrase_pos, phsc->pp_cnt, sizeof(PhrasePosition *), &pp_cmp);
596
- phsc->pp_first = 0;
597
- phsc->pp_last = phsc->pp_cnt - 1;
929
+ buffer[buf_index++] = '"';
930
+
931
+ last_pos = positions[0].pos - 1;
932
+ for (i = 0; i < pos_cnt; i++) {
933
+ char **terms = positions[i].terms;
934
+ const int t_cnt = ary_size(terms);
935
+
936
+ pos = positions[i].pos;
937
+ if (pos == last_pos) {
938
+ buffer[buf_index - 1] = '&';
939
+ }
940
+ else {
941
+ for (j = last_pos; j < pos - 1; j++) {
942
+ memcpy(buffer + buf_index, "<> ", 3);
943
+ buf_index += 3;
944
+ }
945
+ }
946
+
947
+ last_pos = pos;
948
+ for (j = 0; j < t_cnt; j++) {
949
+ char *term = terms[j];
950
+ len = strlen(term);
951
+ memcpy(buffer + buf_index, term, len);
952
+ buf_index += len;
953
+ buffer[buf_index++] = '|';
954
+ }
955
+ buffer[buf_index-1] = ' '; /* change last '|' to ' ' */
956
+ }
957
+
958
+ if (buffer[buf_index-1] == ' ') {
959
+ buf_index--;
960
+ }
598
961
 
599
- first = phsc->phrase_pos[0];
600
- last = phsc->phrase_pos[phsc->pp_last];
962
+ buffer[buf_index++] = '"';
963
+ buffer[buf_index] = 0;
601
964
 
602
- do { // find position w/ all terms
603
- while (first->position < last->position) { // scan forward in first
604
- do {
605
- if (! pp_next_position(first)) return freq;
606
- } while (first->position < last->position);
607
- FIRST_TO_LAST();
965
+ if (phq->slop != 0) {
966
+ sprintf(buffer + buf_index, "~%d", phq->slop);
967
+ buf_index += strlen(buffer + buf_index);
608
968
  }
609
- freq += 1.0; // all equal: a match
610
- } while (pp_next_position(last));
611
969
 
612
- return freq;
970
+ if (self->boost != 1.0) {
971
+ buffer[buf_index++] = '^';
972
+ dbl_to_s(buffer + buf_index, self->boost);
973
+ }
974
+
975
+ return buffer;
613
976
  }
614
977
 
615
- Scorer *exact_phrase_scorer_create(Weight *weight, TermDocEnum **term_pos_enum,
616
- int *positions, int t_cnt, Similarity *similarity, uchar *norms)
978
+ static void phq_destroy(Query *self)
617
979
  {
618
- Scorer *self =
619
- phsc_create(weight, term_pos_enum, positions, t_cnt, similarity, norms);
620
- GET_PHSC;
621
- phsc->phrase_freq = &ephsc_phrase_freq;
622
- return self;
980
+ PhraseQuery *phq = PhQ(self);
981
+ int i;
982
+ free(phq->field);
983
+ for (i = 0; i < phq->pos_cnt; i++) {
984
+ ary_destroy(phq->positions[i].terms, &free);
985
+ }
986
+ free(phq->positions);
987
+ q_destroy_i(self);
623
988
  }
624
989
 
625
- /***************************************************************************
626
- * SloppyPhraseScorer
627
- ***************************************************************************/
628
-
629
- float sphsc_phrase_freq(Scorer *self)
990
+ static Query *phq_rewrite(Query *self, IndexReader *ir)
630
991
  {
631
- GET_PHSC;
632
- PhrasePosition *pp;
633
- PriorityQueue *pq = pq_create(phsc->pp_cnt, &pp_less_than);
634
-
635
- int last_pos = 0, pos, next_pos, start, match_length, i;
636
- bool done = false;
637
- float freq = 0.0;
638
-
639
- for (i = phsc->pp_cnt - 1; i >= 0; i--) {
640
- pp = phsc->phrase_pos[i];
641
- pp_first_position(pp);
642
- if (pp->position > last_pos) last_pos = pp->position;
643
- pq_push(pq, pp);
644
- }
645
-
646
- do {
647
- pp = pq_pop(pq);
648
- pos = start = pp->position;
649
- next_pos = ((PhrasePosition *)pq_top(pq))->position;
650
- while (pos <= next_pos) {
651
- start = pos; // advance pp to min window
652
- if (!pp_next_position(pp)) {
653
- done = true; // ran out of a term -- done
654
- break;
655
- }
656
- pos = pp->position;
992
+ PhraseQuery *phq = PhQ(self);
993
+ (void)ir;
994
+ if (phq->pos_cnt == 1) {
995
+ /* optimize one-position case */
996
+ char **terms = phq->positions[0].terms;
997
+ const int t_cnt = ary_size(terms);
998
+ if (t_cnt == 1) {
999
+ Query *tq = tq_new(phq->field, terms[0]);
1000
+ tq->boost = self->boost;
1001
+ return tq;
1002
+ }
1003
+ else {
1004
+ Query *q = multi_tq_new(phq->field);
1005
+ int i;
1006
+ for (i = 0; i < t_cnt; i++) {
1007
+ multi_tq_add_term(q, terms[i]);
1008
+ }
1009
+ q->boost = self->boost;
1010
+ return q;
1011
+ }
1012
+ } else {
1013
+ self->ref_cnt++;
1014
+ return self;
657
1015
  }
1016
+ }
658
1017
 
659
- match_length = last_pos - start;
660
- if (match_length <= phsc->slop) {
661
- freq += sim_sloppy_freq(self->similarity, match_length); // score match
1018
+ static ulong phq_hash(Query *self)
1019
+ {
1020
+ int i, j;
1021
+ PhraseQuery *phq = PhQ(self);
1022
+ ulong hash = str_hash(phq->field);
1023
+ for (i = 0; i < phq->pos_cnt; i++) {
1024
+ char **terms = phq->positions[i].terms;
1025
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
1026
+ hash = (hash << 1) ^ (str_hash(terms[j])
1027
+ ^ phq->positions[i].pos);
1028
+ }
662
1029
  }
1030
+ return (hash ^ phq->slop);
1031
+ }
663
1032
 
664
- if (pp->position > last_pos) {
665
- last_pos = pp->position;
1033
+ static int phq_eq(Query *self, Query *o)
1034
+ {
1035
+ int i, j;
1036
+ PhraseQuery *phq1 = PhQ(self);
1037
+ PhraseQuery *phq2 = PhQ(o);
1038
+ if (phq1->slop != phq2->slop
1039
+ || strcmp(phq1->field, phq2->field) != 0
1040
+ || phq1->pos_cnt != phq2->pos_cnt) {
1041
+ return false;
1042
+ }
1043
+ for (i = 0; i < phq1->pos_cnt; i++) {
1044
+ char **terms1 = phq1->positions[i].terms;
1045
+ char **terms2 = phq2->positions[i].terms;
1046
+ const int t_cnt = ary_size(terms1);
1047
+ if (t_cnt != ary_size(terms2)
1048
+ || phq1->positions[i].pos != phq2->positions[i].pos) {
1049
+ return false;
1050
+ }
1051
+ for (j = 0; j < t_cnt; j++) {
1052
+ if (strcmp(terms1[j], terms2[j]) != 0) {
1053
+ return false;
1054
+ }
1055
+ }
666
1056
  }
667
- pq_push(pq, pp); // restore pq
668
- } while (!done);
1057
+ return true;
1058
+ }
1059
+
1060
+ Query *phq_new(const char *field)
1061
+ {
1062
+ Query *self = q_new(PhraseQuery);
1063
+
1064
+ PhQ(self)->field = estrdup(field);
1065
+ PhQ(self)->pos_cnt = 0;
1066
+ PhQ(self)->pos_capa = PhQ_INIT_CAPA;
1067
+ PhQ(self)->positions = ALLOC_N(PhrasePosition, PhQ_INIT_CAPA);
1068
+
1069
+ self->type = PHRASE_QUERY;
1070
+ self->rewrite = &phq_rewrite;
1071
+ self->extract_terms = &phq_extract_terms;
1072
+ self->to_s = &phq_to_s;
1073
+ self->hash = &phq_hash;
1074
+ self->eq = &phq_eq;
1075
+ self->destroy_i = &phq_destroy;
1076
+ self->create_weight_i = &phw_new;
1077
+ self->get_matchv_i = &phq_get_matchv_i;
1078
+ return self;
1079
+ }
669
1080
 
670
- pq_destroy(pq);
671
- return freq;
1081
+ void phq_add_term_abs(Query *self, const char *term, int position)
1082
+ {
1083
+ PhraseQuery *phq = PhQ(self);
1084
+ int index = phq->pos_cnt;
1085
+ PhrasePosition *pp;
1086
+ if (index >= phq->pos_capa) {
1087
+ phq->pos_capa <<= 1;
1088
+ REALLOC_N(phq->positions, PhrasePosition, phq->pos_capa);
1089
+ }
1090
+ pp = &(phq->positions[index]);
1091
+ pp->terms = ary_new_type_capa(char *, 2);
1092
+ ary_push(pp->terms, estrdup(term));
1093
+ pp->pos = position;
1094
+ phq->pos_cnt++;
672
1095
  }
673
-
674
- Scorer *sloppy_phrase_scorer_create(Weight *weight, TermDocEnum **term_pos_enum,
675
- int *positions, int t_cnt, Similarity *similarity, int slop, uchar *norms)
1096
+
1097
+ void phq_add_term(Query *self, const char *term, int pos_inc)
676
1098
  {
677
- Scorer *self =
678
- phsc_create(weight, term_pos_enum, positions, t_cnt, similarity, norms);
679
- GET_PHSC;
680
- phsc->slop = slop;
681
- phsc->phrase_freq = &sphsc_phrase_freq;
682
- return self;
1099
+ PhraseQuery *phq = PhQ(self);
1100
+ int position;
1101
+ if (phq->pos_cnt == 0) {
1102
+ position = 0;
1103
+ }
1104
+ else {
1105
+ position = phq->positions[phq->pos_cnt - 1].pos + pos_inc;
1106
+ }
1107
+ phq_add_term_abs(self, term, position);
683
1108
  }
684
1109
 
1110
+ void phq_append_multi_term(Query *self, const char *term)
1111
+ {
1112
+ PhraseQuery *phq = PhQ(self);
1113
+ int index = phq->pos_cnt - 1;
1114
+
1115
+ if (index < 0) {
1116
+ phq_add_term(self, term, 0);
1117
+ }
1118
+ else {
1119
+ ary_push(phq->positions[index].terms, estrdup(term));
1120
+ }
1121
+ }