ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/q_phrase.c CHANGED
@@ -1,684 +1,1121 @@
1
1
  #include <string.h>
2
+ #include <limits.h>
2
3
  #include "search.h"
4
+ #include "array.h"
5
+
6
+ #define PhQ(query) ((PhraseQuery *)(query))
7
+
8
+ static int phrase_pos_cmp(const void *p1, const void *p2)
9
+ {
10
+ int pos1 = ((PhrasePosition *)p1)->pos;
11
+ int pos2 = ((PhrasePosition *)p2)->pos;
12
+ if (pos1 > pos2) {
13
+ return 1;
14
+ }
15
+ if (pos1 < pos2) {
16
+ return -1;
17
+ }
18
+ return strcmp(((PhrasePosition *)p1)->terms[0],
19
+ ((PhrasePosition *)p2)->terms[0]);
20
+ }
3
21
 
4
- static char * const FIELD_CHANGE_ERROR_MSG = "Field illegally changed in the phrase";
5
22
 
6
23
  /***************************************************************************
7
24
  *
8
- * PhraseWeight
25
+ * PhraseScorer
9
26
  *
10
27
  ***************************************************************************/
11
28
 
12
- Scorer *phw_scorer(Weight *self, IndexReader *ir)
13
- {
14
- Scorer *phsc;
15
- PhraseQuery *phq = (PhraseQuery *)self->query->data;
16
- int i;
17
- TermDocEnum **tps;
18
-
19
- if (phq->t_cnt == 0) {
20
- return NULL; /* optimize zero-term case */
21
- }
22
-
23
- tps = ALLOC_N(TermDocEnum *, phq->t_cnt);
24
-
25
- for (i = 0; i < phq->t_cnt; i++) {
26
- tps[i] = ir_term_positions_for(ir, phq->terms[i]);
27
- if (tps[i] == NULL) {
28
- // free everything we just created and return NULL
29
- int j;
30
- for (j = 0; j < i; j++) {
31
- tps[i]->close(tps[i]);
32
- }
33
- free(tps);
34
- return NULL;
35
- }
36
- }
37
-
38
- if (phq->slop == 0) { // optimize exact case
39
- phsc = exact_phrase_scorer_create(self, tps, phq->positions, phq->t_cnt,
40
- self->similarity,
41
- ir->get_norms(ir, phq->field));
42
- } else {
43
- phsc = sloppy_phrase_scorer_create(self, tps, phq->positions, phq->t_cnt,
44
- self->similarity,
45
- phq->slop,
46
- ir->get_norms(ir, phq->field));
47
- }
48
- free(tps);
49
- return phsc;
29
+ /***************************************************************************
30
+ * PhPos
31
+ ***************************************************************************/
32
+
33
+ #define PP(p) ((PhPos *)(p))
34
+ typedef struct PhPos
35
+ {
36
+ TermDocEnum *tpe;
37
+ int offset;
38
+ int count;
39
+ int doc;
40
+ int position;
41
+ } PhPos;
42
+
43
+ static bool pp_next(PhPos *self)
44
+ {
45
+ TermDocEnum *tpe = self->tpe;
46
+ if (!tpe->next(tpe)) {
47
+ tpe->close(tpe); /* close stream */
48
+ self->tpe = NULL;
49
+ self->doc = INT_MAX; /* sentinel value */
50
+ return false;
51
+ }
52
+ self->doc = tpe->doc_num(tpe);
53
+ self->position = 0;
54
+ return true;
50
55
  }
51
56
 
52
- Explanation *phw_explain(Weight *self, IndexReader *ir, int doc_num)
57
+ static bool pp_skip_to(PhPos *self, int doc_num)
53
58
  {
54
- Explanation *idf_expl1;
55
- Explanation *idf_expl2;
56
- Explanation *query_expl;
57
- Explanation *qnorm_expl;
58
- Explanation *field_expl;
59
- Explanation *tf_expl;
60
- Scorer *scorer;
61
- uchar *field_norms;
62
- float field_norm;
63
- Explanation *field_norm_expl;
64
-
65
- char *query_str = self->query->to_s(self->query, "");
66
- PhraseQuery *phq = (PhraseQuery *)self->query->data;
67
- int i;
68
- char *doc_freqs = NULL;
69
- int len = 0, pos = 0;
70
-
71
- Explanation *expl = expl_create(0.0,
72
- strfmt("weight(%s in %d), product of:", query_str, doc_num));
73
-
74
- for (i = 0; i < phq->t_cnt; i++) {
75
- len += (int)strlen(phq->terms[i]->text) + 30;
76
- }
77
- doc_freqs = ALLOC_N(char, len);
78
- for (i = 0; i < phq->t_cnt; i++) {
79
- Term *term = phq->terms[i];
80
- sprintf(doc_freqs + pos, "%s=%d, ", term->text, ir->doc_freq(ir, term));
81
- pos += (int)strlen(doc_freqs + pos);
82
- }
83
- pos -= 2; // remove ", " from the end
84
- doc_freqs[pos] = 0;
85
-
86
- idf_expl1 = expl_create(self->idf,
87
- strfmt("idf(%s:<%s>)", phq->field, doc_freqs));
88
- idf_expl2 = expl_create(self->idf,
89
- strfmt("idf(%s:<%s>)", phq->field, doc_freqs));
90
- free(doc_freqs);
91
-
92
- /* explain query weight */
93
- query_expl = expl_create(0.0,
94
- strfmt("query_weight(%s), product of:", query_str));
95
-
96
- if (self->query->boost != 1.0) {
97
- expl_add_detail(query_expl, expl_create(self->query->boost, estrdup("boost")));
98
- }
99
- expl_add_detail(query_expl, idf_expl1);
100
-
101
- qnorm_expl = expl_create(self->qnorm, estrdup("query_norm"));
102
- expl_add_detail(query_expl, qnorm_expl);
103
-
104
- query_expl->value = self->query->boost * self->idf * self->qnorm;
105
-
106
- expl_add_detail(expl, query_expl);
107
-
108
- /* explain field weight */
109
- field_expl = expl_create(0.0,
110
- strfmt("field_weight(%s in %d), product of:", query_str, doc_num));
111
- free(query_str);
112
-
113
- scorer = self->scorer(self, ir);
114
- tf_expl = scorer->explain(scorer, doc_num);
115
- scorer->destroy(scorer);
116
- expl_add_detail(field_expl, tf_expl);
117
- expl_add_detail(field_expl, idf_expl2);
118
-
119
- field_norms = ir->get_norms(ir, phq->field);
120
- field_norm = (field_norms != NULL)
121
- ? sim_decode_norm(self->similarity, field_norms[doc_num])
122
- : (float)0.0;
123
- field_norm_expl = expl_create(field_norm,
124
- strfmt("field_norm(field=%s, doc=%d)", phq->field, doc_num));
125
-
126
- expl_add_detail(field_expl, field_norm_expl);
127
-
128
- field_expl->value = tf_expl->value * self->idf * field_norm;
129
-
130
- /* combine them */
131
- if (query_expl->value == 1.0) {
132
- expl_destoy(expl);
133
- return field_expl;
134
- } else {
135
- expl->value = (query_expl->value * field_expl->value);
136
- expl_add_detail(expl, field_expl);
137
- return expl;
138
- }
139
- }
140
-
141
- char *phw_to_s(Weight *self)
142
- {
143
- return strfmt("PhraseWeight(%f)", self->value);
144
- }
145
-
146
- Weight *phw_create(Query *query, Searcher *searcher)
147
- {
148
- Weight *self = w_create(query);
149
- PhraseQuery *phq = (PhraseQuery *)query->data;
150
-
151
- self->scorer = &phw_scorer;
152
- self->explain = &phw_explain;
153
- self->to_s = &phw_to_s;
154
- self->sum_of_squared_weights = &w_sum_of_squared_weights;
155
-
156
- self->similarity = query->get_similarity(query, searcher);
157
- self->value = query->boost;
158
- self->idf = sim_idf_phrase(self->similarity, phq->terms, phq->t_cnt, searcher);
159
-
160
- return self;
59
+ TermDocEnum *tpe = self->tpe;
60
+ if (!tpe->skip_to(tpe, doc_num)) {
61
+ tpe->close(tpe); /* close stream */
62
+ self->tpe = NULL;
63
+ self->doc = INT_MAX; /* sentinel value */
64
+ return false;
65
+ }
66
+ self->doc = tpe->doc_num(tpe);
67
+ self->position = 0;
68
+ return true;
161
69
  }
162
70
 
163
- /***************************************************************************
164
- *
165
- * PhraseQuery
166
- *
167
- ***************************************************************************/
71
+ static bool pp_next_position(PhPos *self)
72
+ {
73
+ TermDocEnum *tpe = self->tpe;
74
+ self->count--;
75
+ if (self->count >= 0) { /* read subsequent pos's */
76
+ self->position = tpe->next_position(tpe) - self->offset;
77
+ return true;
78
+ }
79
+ else {
80
+ return false;
81
+ }
82
+ }
168
83
 
169
- #define GET_PHQ PhraseQuery *phq = (PhraseQuery *)self->data
170
-
171
- void phq_extract_terms(Query *self, HashSet *terms)
172
- {
173
- GET_PHQ;
174
- int i;
175
- for (i = 0; i < phq->t_cnt; i++) {
176
- hs_add(terms, term_clone(phq->terms[i]));
177
- }
178
- }
179
-
180
- char *phq_to_s(Query *self, char *field)
181
- {
182
- GET_PHQ;
183
- int i, j, buf_index = 0, len = 0, pos, last_pos = -1;
184
- char *buffer;
185
- if (!phq->t_cnt) return NULL;
186
- len = (int)strlen(phq->field) + 1;
187
- for (i = 0; i < phq->t_cnt; i++) {
188
- len += (int)strlen(phq->terms[i]->text) + 1;
189
- }
190
- // add space for extra characters and boost and slop
191
- len += 100 + 3 * phq->positions[phq->t_cnt - 1];
192
-
193
- buffer = ALLOC_N(char, len);
194
-
195
- if (strcmp(field, phq->field) != 0) {
196
- len = (int)strlen(phq->field);
197
- memcpy(buffer, phq->field, len);
198
- buffer[len] = ':';
199
- buf_index += len + 1;
200
- }
201
- buffer[buf_index++] = '"';
202
-
203
- for (i = 0; i < phq->t_cnt; i++) {
204
- Term *term = phq->terms[i];
205
- pos = phq->positions[i];
206
- for (j = last_pos; j < pos - 1; j++) {
207
- memcpy(buffer + buf_index, "<> ", 3);
208
- buf_index += 3;
209
- }
210
- last_pos = pos;
211
-
212
- len = (int)strlen(term->text);
213
- memcpy(buffer + buf_index, term->text, len);
214
- buf_index += len;
215
- buffer[buf_index++] = ' ';
216
- }
217
- if (buffer[buf_index-1] == ' ') buf_index--;
218
- buffer[buf_index++] = '"';
219
- buffer[buf_index] = 0;
220
- if (phq->slop != 0) {
221
- sprintf(buffer + buf_index, "~%d", phq->slop);
222
- buf_index += (int)strlen(buffer + buf_index);
223
- }
224
- if (self->boost != 1.0) {
225
- buffer[buf_index++] = '^';
226
- dbl_to_s(buffer + buf_index, self->boost);
227
- }
228
- return buffer;
229
- }
230
-
231
- void phq_destroy(Query *self)
232
- {
233
- GET_PHQ;
234
- int i;
235
- if (self->destroy_all) {
236
- for (i = 0; i < phq->t_cnt; i++) {
237
- term_destroy(phq->terms[i]);
238
- }
239
- }
240
- free(phq->terms);
241
- free(phq->positions);
242
- free(phq);
243
-
244
- q_destroy_i(self);
245
- }
246
-
247
- Query *phq_rewrite(Query *self, IndexReader *ir)
248
- {
249
- GET_PHQ;
250
- if (phq->t_cnt == 1) { // optimize one-term case
251
- Term *term = phq->terms[0];
252
- Query *tq = tq_create(term_clone(term));
253
- tq->boost = self->boost;
254
- return tq;
255
- } else {
256
- self->ref_cnt++;
257
- return self;
258
- }
84
+ static bool pp_first_position(PhPos *self)
85
+ {
86
+ TermDocEnum *tpe = self->tpe;
87
+ self->count = tpe->freq(tpe); /* read first pos */
88
+ return pp_next_position(self);
259
89
  }
260
90
 
261
- void phq_add_term(Query *self, Term *term, int pos_inc)
91
+ /*
92
+ static char *pp_to_s(PhPos *self)
262
93
  {
263
- GET_PHQ;
264
- int position, index = phq->t_cnt;
265
- if (index >= phq->t_capa) {
266
- phq->t_capa *= 2;
267
- REALLOC_N(phq->terms, Term *, phq->t_capa);
268
- REALLOC_N(phq->positions, int, phq->t_capa);
269
- }
270
- if (index == 0) {
271
- position = 0;
272
- phq->field = term->field;
273
- } else {
274
- position = phq->positions[index - 1] + pos_inc;
275
- if (strcmp(term->field, phq->field) != 0) {
276
- RAISE(ARG_ERROR, FIELD_CHANGE_ERROR_MSG);
94
+ return strfmt("pp->(doc => %d, position => %d)", self->doc, self->position);
95
+ }
96
+ */
97
+
98
+ #define PP_pp(p) (*(PhPos **)p)
99
+ static int pp_cmp(const void *const p1, const void *const p2)
100
+ {
101
+ int cmp = PP_pp(p1)->doc - PP_pp(p2)->doc;
102
+ if (cmp == 0) {
103
+ return PP_pp(p1)->position - PP_pp(p2)->position;
104
+ }
105
+ else {
106
+ return cmp;
277
107
  }
278
- }
279
- phq->terms[index] = term;
280
- phq->positions[index] = position;
281
- phq->t_cnt++;
282
108
  }
283
109
 
284
- static uint phq_hash(Query *self)
110
+ static int pp_pos_cmp(const void *const p1, const void *const p2)
285
111
  {
286
- int i;
287
- uint hash = 0;
288
- PhraseQuery *phq = (PhraseQuery *)self->data;
289
- for (i = 0; i < phq->t_cnt; i++) {
290
- hash = (hash << 1) ^ (term_hash(phq->terms[i]) ^ phq->positions[i]);
291
- }
292
- return (hash ^ phq->slop);
112
+ return PP_pp(p1)->position - PP_pp(p2)->position;
293
113
  }
294
114
 
295
- static int phq_eq(Query *self, Query *o)
115
+ static bool pp_less_than(const PhPos *pp1, const PhPos *pp2)
116
+ {
117
+ /* docs will all be equal when this method is used */
118
+ return pp1->position < pp2->position;
119
+ /*
120
+ if (PP(p)->doc == PP(p)->doc) {
121
+ return PP(p)->position < PP(p)->position;
122
+ }
123
+ else {
124
+ return PP(p)->doc < PP(p)->doc;
125
+ }
126
+ */
127
+ }
128
+
129
+ void pp_destroy(PhPos *pp)
296
130
  {
297
- int i;
298
- PhraseQuery *phq1 = (PhraseQuery *)self->data;
299
- PhraseQuery *phq2 = (PhraseQuery *)o->data;
300
- if (phq1->slop != phq2->slop) return false;
301
- for (i = 0; i < phq1->t_cnt; i++) {
302
- if (!term_eq(phq1->terms[i], phq2->terms[i]) ||
303
- (phq1->positions[i] != phq2->positions[i])) return false;
304
- }
305
- return true;
131
+ if (pp->tpe) {
132
+ pp->tpe->close(pp->tpe);
133
+ }
134
+ free(pp);
306
135
  }
307
136
 
308
- Query *phq_create()
137
+ PhPos *pp_new(TermDocEnum *tpe, int offset)
309
138
  {
310
- Query *self = q_create();
311
- PhraseQuery *phq = ALLOC_AND_ZERO_N(PhraseQuery, 1);
139
+ PhPos *self = ALLOC(PhPos);
312
140
 
313
- phq->t_capa = PHQ_INIT_CAPA;
314
- phq->terms = ALLOC_N(Term *, PHQ_INIT_CAPA);
315
- phq->positions = ALLOC_N(int, PHQ_INIT_CAPA);
316
- self->data = phq;
141
+ self->tpe = tpe;
142
+ self->count = self->doc = self->position = -1;
143
+ self->offset = offset;
317
144
 
318
- self->type = PHRASE_QUERY;
319
- self->rewrite = &phq_rewrite;
320
- self->extract_terms = &phq_extract_terms;
321
- self->to_s = &phq_to_s;
322
- self->hash = &phq_hash;
323
- self->eq = &phq_eq;
324
- self->destroy_i = &phq_destroy;
325
- self->create_weight_i = &phw_create;
326
- return self;
145
+ return self;
327
146
  }
328
147
 
329
148
  /***************************************************************************
330
- *
331
149
  * PhraseScorer
332
- *
333
150
  ***************************************************************************/
334
151
 
335
- /***************************************************************************
336
- * PhrasePosition
337
- ***************************************************************************/
152
+ #define PhSc(scorer) ((PhraseScorer *)(scorer))
338
153
 
339
- bool pp_next(PhrasePosition *self)
154
+ typedef struct PhraseScorer
340
155
  {
341
- TermDocEnum *tpe = self->tpe;
342
- if (!tpe->next(tpe)) {
343
- tpe->close(tpe); // close stream
344
- self->tpe = NULL;
345
- self->doc = INT_MAX; // sentinel value
346
- return false;
347
- }
348
- self->doc = tpe->doc_num(tpe);
349
- self->position = 0;
350
- return true;
156
+ Scorer super;
157
+ float (*phrase_freq)(Scorer *self);
158
+ float freq;
159
+ uchar *norms;
160
+ float value;
161
+ Weight *weight;
162
+ PhPos **phrase_pos;
163
+ int pp_first_idx;
164
+ int pp_cnt;
165
+ int slop;
166
+ bool first_time : 1;
167
+ bool more : 1;
168
+ } PhraseScorer;
169
+
170
+ static void phsc_init(PhraseScorer *phsc)
171
+ {
172
+ int i;
173
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
174
+ if (!(phsc->more = pp_next(phsc->phrase_pos[i]))) break;
175
+ }
176
+
177
+ if (phsc->more) {
178
+ qsort(phsc->phrase_pos, phsc->pp_cnt,
179
+ sizeof(PhPos *), &pp_cmp);
180
+ phsc->pp_first_idx = 0;
181
+ }
351
182
  }
352
183
 
353
- bool pp_skip_to(PhrasePosition *self, int doc_num)
184
+ static bool phsc_do_next(Scorer *self)
354
185
  {
355
- TermDocEnum *tpe = self->tpe;
356
- if (!tpe->skip_to(tpe, doc_num)) {
357
- tpe->close(tpe); // close stream
358
- self->tpe = NULL;
359
- self->doc = INT_MAX; // sentinel value
186
+ PhraseScorer *phsc = PhSc(self);
187
+ const int pp_cnt = phsc->pp_cnt;
188
+ int pp_first_idx = phsc->pp_first_idx;
189
+ PhPos **phrase_positions = phsc->phrase_pos;
190
+
191
+ PhPos *first = phrase_positions[pp_first_idx];
192
+ PhPos *last = phrase_positions[PREV_NUM(pp_first_idx, pp_cnt)];
193
+
194
+ while (phsc->more) {
195
+ /* find doc with all the terms */
196
+ while (phsc->more && first->doc < last->doc) {
197
+ /* skip first upto last */
198
+ phsc->more = pp_skip_to(first, last->doc);
199
+ last = first;
200
+ pp_first_idx = NEXT_NUM(pp_first_idx, pp_cnt);
201
+ first = phrase_positions[pp_first_idx];
202
+ }
203
+
204
+ if (phsc->more) {
205
+ /* pp_first_idx will be used by phrase_freq */
206
+ phsc->pp_first_idx = pp_first_idx;
207
+
208
+ /* found a doc with all of the terms */
209
+ phsc->freq = phsc->phrase_freq(self);
210
+
211
+ if (phsc->freq == 0.0) { /* no match */
212
+ /* continuing search so re-set first and last */
213
+ pp_first_idx = phsc->pp_first_idx;
214
+ first = phrase_positions[pp_first_idx];
215
+ last = phrase_positions[PREV_NUM(pp_first_idx, pp_cnt)];
216
+ phsc->more = pp_next(last); /* trigger further scanning */
217
+ }
218
+ else {
219
+ self->doc = first->doc;
220
+ return true; /* found a match */
221
+ }
222
+
223
+ }
224
+ }
360
225
  return false;
361
- }
362
- self->doc = tpe->doc_num(tpe);
363
- self->position = 0;
364
- return true;
365
226
  }
366
227
 
367
- bool pp_next_position(PhrasePosition *self)
228
+ static float phsc_score(Scorer *self)
368
229
  {
369
- TermDocEnum *tpe = self->tpe;
370
- self->count -= 1;
371
- if (self->count >= 0) {// read subsequent pos's
372
- self->position = tpe->next_position(tpe) - self->offset;
373
- return true;
374
- } else {
375
- return false;
376
- }
230
+ PhraseScorer *phsc = PhSc(self);
231
+ float raw_score = sim_tf(self->similarity, phsc->freq) * phsc->value;
232
+ /* normalize */
233
+ return raw_score * sim_decode_norm(
234
+ self->similarity,
235
+ phsc->norms[phsc->phrase_pos[phsc->pp_first_idx]->doc]);
377
236
  }
378
237
 
379
- bool pp_first_position(PhrasePosition *self)
238
+ static bool phsc_next(Scorer *self)
380
239
  {
381
- TermDocEnum *tpe = self->tpe;
382
- self->count = tpe->freq(tpe); // read first pos
383
- return pp_next_position(self);
240
+ PhraseScorer *phsc = PhSc(self);
241
+ if (phsc->first_time) {
242
+ phsc_init(phsc);
243
+ phsc->first_time = false;
244
+ }
245
+ else if (phsc->more) {
246
+ /* trigger further scanning */
247
+ phsc->more = pp_next(
248
+ phsc->phrase_pos[PREV_NUM(phsc->pp_first_idx, phsc->pp_cnt)]);
249
+ }
250
+
251
+ return phsc_do_next(self);
252
+ }
253
+
254
+ static bool phsc_skip_to(Scorer *self, int doc_num)
255
+ {
256
+ PhraseScorer *phsc = PhSc(self);
257
+ int i;
258
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
259
+ if (!(phsc->more = pp_skip_to(phsc->phrase_pos[i], doc_num))) {
260
+ break;
261
+ }
262
+ }
263
+
264
+ if (phsc->more) {
265
+ qsort(phsc->phrase_pos, phsc->pp_cnt,
266
+ sizeof(PhPos *), &pp_cmp);
267
+ phsc->pp_first_idx = 0;
268
+ }
269
+ return phsc_do_next(self);
384
270
  }
385
271
 
386
- char *pp_to_s(PhrasePosition *self)
272
+ static Explanation *phsc_explain(Scorer *self, int doc_num)
387
273
  {
388
- return strfmt("pp->(doc => %d, position => %d)", self->doc, self->position);
274
+ PhraseScorer *phsc = PhSc(self);
275
+ float phrase_freq;
276
+
277
+ phsc_skip_to(self, doc_num);
278
+
279
+ phrase_freq = (self->doc == doc_num) ? phsc->freq : (float)0.0;
280
+ return expl_new(sim_tf(self->similarity, phrase_freq),
281
+ "tf(phrase_freq=%f)", phrase_freq);
389
282
  }
390
283
 
391
- inline int pp_cmp(const void *const p1, const void *const p2)
284
+ static void phsc_destroy(Scorer *self)
392
285
  {
393
- PhrasePosition *pp1 = *(PhrasePosition **)p1;
394
- PhrasePosition *pp2 = *(PhrasePosition **)p2;
395
- int cmp = pp1->doc - pp2->doc;
396
- if (cmp == 0) {
397
- return pp1->position - pp2->position;
398
- } else {
399
- return cmp;
400
- }
286
+ PhraseScorer *phsc = PhSc(self);
287
+ int i;
288
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
289
+ pp_destroy(phsc->phrase_pos[i]);
290
+ }
291
+ free(phsc->phrase_pos);
292
+ scorer_destroy_i(self);
401
293
  }
402
- bool pp_less_than(void *p1, void *p2)
294
+
295
+ static Scorer *phsc_new(Weight *weight, TermDocEnum **term_pos_enum,
296
+ PhrasePosition *positions, int pos_cnt,
297
+ Similarity *similarity, uchar *norms)
403
298
  {
404
- PhrasePosition *pp1 = (PhrasePosition *)p1;
405
- PhrasePosition *pp2 = (PhrasePosition *)p2;
406
- if (pp1->doc == pp2->doc) {
407
- return pp1->position < pp2->position;
408
- } else {
409
- return pp1->doc < pp2->doc;
410
- }
299
+ int i;
300
+ Scorer *self = scorer_new(PhraseScorer, similarity);
301
+
302
+ PhSc(self)->weight = weight;
303
+ PhSc(self)->norms = norms;
304
+ PhSc(self)->value = weight->value;
305
+ PhSc(self)->phrase_pos = ALLOC_N(PhPos *, pos_cnt);
306
+ PhSc(self)->pp_first_idx = 0;
307
+ PhSc(self)->pp_cnt = pos_cnt;
308
+ PhSc(self)->slop = 0;
309
+ PhSc(self)->first_time = true;
310
+ PhSc(self)->more = true;
311
+
312
+ for (i = 0; i < pos_cnt; i++) {
313
+ PhSc(self)->phrase_pos[i] = pp_new(term_pos_enum[i], positions[i].pos);
314
+ }
315
+
316
+ self->score = &phsc_score;
317
+ self->next = &phsc_next;
318
+ self->skip_to = &phsc_skip_to;
319
+ self->explain = &phsc_explain;
320
+ self->destroy = &phsc_destroy;
321
+
322
+ return self;
411
323
  }
412
324
 
413
- void pp_destroy(PhrasePosition *pp)
325
+ /***************************************************************************
326
+ * ExactPhraseScorer
327
+ ***************************************************************************/
328
+
329
+ static float ephsc_phrase_freq(Scorer *self)
414
330
  {
415
- if (pp->tpe) pp->tpe->close(pp->tpe);
416
- free(pp);
331
+ PhraseScorer *phsc = PhSc(self);
332
+ int i;
333
+ int pp_first_idx = 0;
334
+ const int pp_cnt = phsc->pp_cnt;
335
+ float freq = 0.0;
336
+ PhPos **phrase_positions = phsc->phrase_pos;
337
+ PhPos *first;
338
+ PhPos *last;
339
+
340
+ for (i = 0; i < pp_cnt; i++) {
341
+ pp_first_position(phrase_positions[i]);
342
+ }
343
+ qsort(phrase_positions, pp_cnt, sizeof(PhPos *), &pp_pos_cmp);
344
+
345
+ first = phrase_positions[0];
346
+ last = phrase_positions[pp_cnt - 1];
347
+
348
+ /* scan to position with all terms */
349
+ do {
350
+ /* scan forward in first */
351
+ while (first->position < last->position) {
352
+ do {
353
+ if (! pp_next_position(first)) {
354
+ /* maintain first position */
355
+ phsc->pp_first_idx = pp_first_idx;
356
+ return freq;
357
+ }
358
+ } while (first->position < last->position);
359
+ last = first;
360
+ pp_first_idx = NEXT_NUM(pp_first_idx, pp_cnt);
361
+ first = phrase_positions[pp_first_idx];
362
+ }
363
+ freq += 1.0; /* all equal: a match */
364
+ } while (pp_next_position(last));
365
+
366
+ /* maintain first position */
367
+ phsc->pp_first_idx = pp_first_idx;
368
+ return freq;
417
369
  }
418
370
 
419
- PhrasePosition *pp_create(TermDocEnum *tpe, int offset)
371
+ static Scorer *exact_phrase_scorer_new(Weight *weight,
372
+ TermDocEnum **term_pos_enum,
373
+ PhrasePosition *positions, int pp_cnt,
374
+ Similarity *similarity, uchar *norms)
420
375
  {
421
- PhrasePosition *self = ALLOC(PhrasePosition);
422
- self->tpe = tpe;
423
- self->count = self->doc = self->position = -1;
424
- self->offset = offset;
425
- return self;
376
+ Scorer *self =
377
+ phsc_new(weight, term_pos_enum, positions, pp_cnt, similarity, norms);
378
+
379
+ PhSc(self)->phrase_freq = &ephsc_phrase_freq;
380
+ return self;
426
381
  }
427
382
 
428
383
  /***************************************************************************
429
- * PhraseScorer
384
+ * SloppyPhraseScorer
430
385
  ***************************************************************************/
431
386
 
432
- #define GET_PHSC PhraseScorer *phsc = (PhraseScorer *)self->data
387
+ static float sphsc_phrase_freq(Scorer *self)
388
+ {
389
+ PhraseScorer *phsc = PhSc(self);
390
+ PhPos *pp;
391
+ PriorityQueue *pq = pq_new(phsc->pp_cnt, (lt_ft)&pp_less_than, NULL);
392
+ const int pp_cnt = phsc->pp_cnt;
393
+
394
+ int last_pos = 0, pos, next_pos, start, match_length, i;
395
+ bool done = false;
396
+ float freq = 0.0;
397
+
398
+ for (i = 0; i < pp_cnt; i++) {
399
+ pp = phsc->phrase_pos[i];
400
+ pp_first_position(pp);
401
+ if (pp->position > last_pos) {
402
+ last_pos = pp->position;
403
+ }
404
+ pq_push(pq, pp);
405
+ }
433
406
 
407
+ do {
408
+ pp = pq_pop(pq);
409
+ pos = start = pp->position;
410
+ next_pos = PP(pq_top(pq))->position;
411
+ while (pos <= next_pos) {
412
+ start = pos; /* advance pp to min window */
413
+ if (!pp_next_position(pp)) {
414
+ done = true; /* ran out of a positions for a term - done */
415
+ break;
416
+ }
417
+ pos = pp->position;
418
+ }
419
+
420
+ match_length = last_pos - start;
421
+ if (match_length <= phsc->slop) {
422
+ /* score match */
423
+ freq += sim_sloppy_freq(self->similarity, match_length);
424
+ }
425
+
426
+ if (pp->position > last_pos) {
427
+ last_pos = pp->position;
428
+ }
429
+ pq_push(pq, pp); /* restore pq */
430
+ } while (!done);
431
+
432
+ pq_destroy(pq);
433
+ return freq;
434
+ }
434
435
 
435
- void phsc_init(PhraseScorer *phsc)
436
+ static Scorer *sloppy_phrase_scorer_new(Weight *weight,
437
+ TermDocEnum **term_pos_enum,
438
+ PhrasePosition *positions,
439
+ int pp_cnt, Similarity *similarity,
440
+ int slop, uchar *norms)
436
441
  {
437
- int i;
438
- for (i = phsc->pp_cnt - 1; i >= 0; i--) {
439
- if (!(phsc->more = pp_next(phsc->phrase_pos[i]))) break;
440
- }
442
+ Scorer *self =
443
+ phsc_new(weight, term_pos_enum, positions, pp_cnt, similarity, norms);
441
444
 
442
- if (phsc->more) {
443
- qsort(phsc->phrase_pos, phsc->pp_cnt, sizeof(PhrasePosition *), &pp_cmp);
444
- phsc->pp_first = 0;
445
- phsc->pp_last = phsc->pp_cnt - 1;
446
- }
445
+ PhSc(self)->slop = slop;
446
+ PhSc(self)->phrase_freq = &sphsc_phrase_freq;
447
+ return self;
447
448
  }
448
449
 
449
- #define FIRST_TO_LAST() \
450
- last = first;\
451
- phsc->pp_last = phsc->pp_first;\
452
- phsc->pp_first = (phsc->pp_first + 1) % phsc->pp_cnt;\
453
- first = phsc->phrase_pos[phsc->pp_first];
450
+ /***************************************************************************
451
+ *
452
+ * PhraseWeight
453
+ *
454
+ ***************************************************************************/
455
+
456
+ static char *phw_to_s(Weight *self)
457
+ {
458
+ return strfmt("PhraseWeight(%f)", self->value);
459
+ }
454
460
 
455
- bool phsc_do_next(Scorer *self)
461
+ static Scorer *phw_scorer(Weight *self, IndexReader *ir)
456
462
  {
457
- GET_PHSC;
458
- PhrasePosition *first = phsc->phrase_pos[phsc->pp_first];
459
- PhrasePosition *last = phsc->phrase_pos[phsc->pp_last];
463
+ int i;
464
+ Scorer *phsc = NULL;
465
+ PhraseQuery *phq = PhQ(self->query);
466
+ TermDocEnum **tps, *tpe;
467
+ PhrasePosition *positions = phq->positions;
468
+ const int pos_cnt = phq->pos_cnt;
469
+ const int field_num = fis_get_field_num(ir->fis, phq->field);
470
+
471
+ if (pos_cnt == 0 || field_num < 0) {
472
+ return NULL;
473
+ }
460
474
 
461
- while (phsc->more) {
462
- while (phsc->more && first->doc < last->doc) { // find doc w/ all the terms
463
- phsc->more = pp_skip_to(first, last->doc); // skip first upto last
464
- FIRST_TO_LAST(); // and move it to the end
475
+ tps = ALLOC_N(TermDocEnum *, pos_cnt);
476
+
477
+ for (i = 0; i < pos_cnt; i++) {
478
+ char **terms = positions[i].terms;
479
+ const int t_cnt = ary_size(terms);
480
+ if (t_cnt == 1) {
481
+ tpe = tps[i] = ir->term_positions(ir);
482
+ tpe->seek(tpe, field_num, terms[0]);
483
+ }
484
+ else {
485
+ tps[i] = mtdpe_new(ir, field_num, terms, t_cnt);
486
+ }
487
+ if (tps[i] == NULL) {
488
+ /* free everything we just created and return NULL */
489
+ int j;
490
+ for (j = 0; j < i; j++) {
491
+ tps[i]->close(tps[i]);
492
+ }
493
+ free(tps);
494
+ return NULL;
495
+ }
465
496
  }
466
497
 
467
- if (phsc->more) {
468
- // found a doc with all of the terms
469
- phsc->freq = phsc->phrase_freq(self); // check for phrase
470
- if (phsc->freq == 0.0) { // no match
471
- first = phsc->phrase_pos[phsc->pp_first];
472
- last = phsc->phrase_pos[phsc->pp_last];
473
- phsc->more = pp_next(last); // trigger further scanning
474
- } else {
475
- self->doc = first->doc;
476
- return true; // found a match
477
- }
498
+ if (phq->slop == 0) { /* optimize exact (common) case */
499
+ phsc = exact_phrase_scorer_new(self, tps, positions, pos_cnt,
500
+ self->similarity,
501
+ ir->get_norms(ir, field_num));
502
+ }
503
+ else {
504
+ phsc = sloppy_phrase_scorer_new(self, tps, positions, pos_cnt,
505
+ self->similarity, phq->slop,
506
+ ir->get_norms(ir, field_num));
507
+ }
508
+ free(tps);
509
+ return phsc;
510
+ }
511
+
512
+ Explanation *phw_explain(Weight *self, IndexReader *ir, int doc_num)
513
+ {
514
+ Explanation *expl;
515
+ Explanation *idf_expl1;
516
+ Explanation *idf_expl2;
517
+ Explanation *query_expl;
518
+ Explanation *qnorm_expl;
519
+ Explanation *field_expl;
520
+ Explanation *tf_expl;
521
+ Scorer *scorer;
522
+ uchar *field_norms;
523
+ float field_norm;
524
+ Explanation *field_norm_expl;
525
+ char *query_str;
526
+ PhraseQuery *phq = PhQ(self->query);
527
+ const int pos_cnt = phq->pos_cnt;
528
+ PhrasePosition *positions = phq->positions;
529
+ int i, j;
530
+ char *doc_freqs = NULL;
531
+ size_t len = 0, pos = 0;
532
+ const int field_num = fis_get_field_num(ir->fis, phq->field);
533
+
534
+ if (field_num < 0) {
535
+ return expl_new(0.0, "field \"%s\" does not exist in the index", phq->field);
536
+ }
537
+
538
+ query_str = self->query->to_s(self->query, "");
539
+
540
+ expl = expl_new(0.0, "weight(%s in %d), product of:", query_str, doc_num);
541
+
542
+ /* ensure the phrase positions are in order for explanation */
543
+ qsort(positions, pos_cnt, sizeof(PhrasePosition), &phrase_pos_cmp);
544
+
545
+ for (i = 0; i < phq->pos_cnt; i++) {
546
+ char **terms = phq->positions[i].terms;
547
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
548
+ len += strlen(terms[j]) + 30;
549
+ }
550
+ }
551
+ doc_freqs = ALLOC_N(char, len);
552
+ for (i = 0; i < phq->pos_cnt; i++) {
553
+ char **terms = phq->positions[i].terms;
554
+ const int t_cnt = ary_size(terms);
555
+ for (j = 0; j < t_cnt; j++) {
556
+ char *term = terms[j];
557
+ sprintf(doc_freqs + pos, "%s=%d, ",
558
+ term, ir->doc_freq(ir, field_num, term));
559
+ pos += strlen(doc_freqs + pos);
560
+ }
561
+ }
562
+ pos -= 2; /* remove ", " from the end */
563
+ doc_freqs[pos] = 0;
564
+
565
+ idf_expl1 = expl_new(self->idf, "idf(%s:<%s>)", phq->field, doc_freqs);
566
+ idf_expl2 = expl_new(self->idf, "idf(%s:<%s>)", phq->field, doc_freqs);
567
+ free(doc_freqs);
568
+
569
+ /* explain query weight */
570
+ query_expl = expl_new(0.0, "query_weight(%s), product of:", query_str);
571
+
572
+ if (self->query->boost != 1.0) {
573
+ expl_add_detail(query_expl, expl_new(self->query->boost, "boost"));
574
+ }
575
+ expl_add_detail(query_expl, idf_expl1);
576
+
577
+ qnorm_expl = expl_new(self->qnorm, "query_norm");
578
+ expl_add_detail(query_expl, qnorm_expl);
579
+
580
+ query_expl->value = self->query->boost * self->idf * self->qnorm;
581
+
582
+ expl_add_detail(expl, query_expl);
583
+
584
+ /* explain field weight */
585
+ field_expl = expl_new(0.0, "field_weight(%s in %d), product of:",
586
+ query_str, doc_num);
587
+ free(query_str);
588
+
589
+ scorer = self->scorer(self, ir);
590
+ tf_expl = scorer->explain(scorer, doc_num);
591
+ scorer->destroy(scorer);
592
+ expl_add_detail(field_expl, tf_expl);
593
+ expl_add_detail(field_expl, idf_expl2);
594
+
595
+ field_norms = ir->get_norms(ir, field_num);
596
+ field_norm = (field_norms != NULL)
597
+ ? sim_decode_norm(self->similarity, field_norms[doc_num])
598
+ : (float)0.0;
599
+ field_norm_expl = expl_new(field_norm, "field_norm(field=%s, doc=%d)",
600
+ phq->field, doc_num);
601
+
602
+ expl_add_detail(field_expl, field_norm_expl);
603
+
604
+ field_expl->value = tf_expl->value * self->idf * field_norm;
605
+
606
+ /* combine them */
607
+ if (query_expl->value == 1.0) {
608
+ expl_destroy(expl);
609
+ return field_expl;
610
+ }
611
+ else {
612
+ expl->value = (query_expl->value * field_expl->value);
613
+ expl_add_detail(expl, field_expl);
614
+ return expl;
478
615
  }
479
- }
480
- return false;
481
616
  }
482
617
 
483
- float phsc_score(Scorer *self)
618
+ static Weight *phw_new(Query *query, Searcher *searcher)
484
619
  {
485
- GET_PHSC;
486
- float raw = sim_tf(self->similarity, phsc->freq) * phsc->value; // raw score
487
- // normalize
488
- return raw * sim_decode_norm(self->similarity,
489
- phsc->norms[phsc->phrase_pos[phsc->pp_first]->doc]);
620
+ Weight *self = w_new(Weight, query);
621
+
622
+ self->scorer = &phw_scorer;
623
+ self->explain = &phw_explain;
624
+ self->to_s = &phw_to_s;
625
+
626
+ self->similarity = query->get_similarity(query, searcher);
627
+ self->value = query->boost;
628
+ self->idf = sim_idf_phrase(self->similarity, PhQ(query)->field,
629
+ PhQ(query)->positions,
630
+ PhQ(query)->pos_cnt, searcher);
631
+ return self;
490
632
  }
491
633
 
492
- bool phsc_next(Scorer *self)
634
+ /***************************************************************************
635
+ *
636
+ * PhraseQuery
637
+ *
638
+ ***************************************************************************/
639
+
640
+ /* ** TVPosEnum ** */
641
/*
 * Cursor over a list of term-vector positions belonging to one phrase slot.
 * Positions are reported relative to the slot's offset within the phrase,
 * i.e. pos == positions[index] - offset.
 */
typedef struct TVPosEnum
{
    int index;       /* current entry in positions[]; -1 before first next */
    int size;        /* number of entries in positions[] */
    int offset;      /* subtracted from raw positions to obtain pos */
    int pos;         /* current offset-adjusted position, -1 when exhausted */
    int positions[]; /* raw positions (flexible array member) */
} TVPosEnum;

/*
 * Step to the next position. Returns true and updates pos on success;
 * returns false and sets pos to -1 once the list is exhausted.
 */
static bool tvpe_next(TVPosEnum *self)
{
    self->index++;
    if (self->index < self->size) {
        self->pos = self->positions[self->index] - self->offset;
        return true;
    }
    self->pos = -1;
    return false;
}

/*
 * Advance to the first entry after the current one whose offset-adjusted
 * position is >= `position`. Returns non-zero on success; on failure the
 * enum is left exhausted (index == size, pos == -1) and 0 is returned.
 *
 * Fix: the previous version detected exhaustion with `i == size`. When the
 * enum was already exhausted (index == size) the scan started at size + 1,
 * so the test failed and the call reported success with a stale pos. The
 * not-found path is now reached whenever the scan finds nothing.
 */
static int tvpe_skip_to(TVPosEnum *self, int position)
{
    const int search_pos = position + self->offset;
    int i;

    for (i = self->index + 1; i < self->size; i++) {
        if (self->positions[i] >= search_pos) {
            self->index = i;
            self->pos = self->positions[i] - self->offset;
            return true;
        }
    }
    self->index = self->size;
    self->pos = -1;
    return false;
}

/* Priority-queue ordering: the enum with the smaller current pos is first. */
static bool tvpe_lt(TVPosEnum *tvpe1, TVPosEnum *tvpe2)
{
    return tvpe1->pos < tvpe2->pos;
}
519
684
 
520
- static Explanation *phsc_explain(Scorer *self, int doc_num)
685
+ static TVPosEnum *tvpe_new(int *positions, int size, int offset)
521
686
  {
522
- GET_PHSC;
523
- float phrase_freq;
687
+ TVPosEnum *self = (TVPosEnum *)emalloc(sizeof(TVPosEnum)
688
+ + size * sizeof(int));
689
+ memcpy(self->positions, positions, size * sizeof(int));
690
+ self->size = size;
691
+ self->offset = offset;
692
+ self->index = -1;
693
+ self->pos = -1;
694
+ return self;
695
+ }
524
696
 
525
- while (phsc_next(self) && self->doc < doc_num)
526
- ;
697
+ static TVPosEnum *tvpe_new_merge(char **terms, int t_cnt, TermVector *tv,
698
+ int offset)
699
+ {
700
+ int i, total_positions = 0;
701
+ PriorityQueue *tvpe_pq = pq_new(t_cnt, (lt_ft)tvpe_lt, &free);
702
+ TVPosEnum *self = NULL;
703
+
704
+ for (i = 0; i < t_cnt; i++) {
705
+ TVTerm *tv_term = tv_get_tv_term(tv, terms[i]);
706
+ if (tv_term) {
707
+ TVPosEnum *tvpe = tvpe_new(tv_term->positions, tv_term->freq, 0);
708
+ if (tvpe_next(tvpe)) {
709
+ pq_push(tvpe_pq, tvpe);
710
+ total_positions += tv_term->freq;
711
+ }
712
+ else {
713
+ free(tvpe);
714
+ }
715
+ }
716
+ }
717
+ if (tvpe_pq->size == 0) {
718
+ pq_destroy(tvpe_pq);
719
+ }
720
+ else {
721
+ int index = 0;
722
+ self = (TVPosEnum *)emalloc(sizeof(TVPosEnum)
723
+ + total_positions * sizeof(int));
724
+ self->size = total_positions;
725
+ self->offset = offset;
726
+ self->index = -1;
727
+ self->pos = -1;
728
+ while (tvpe_pq->size > 0) {
729
+ TVPosEnum *top = (TVPosEnum *)pq_top(tvpe_pq);
730
+ self->positions[index++] = top->pos;
731
+ if (! tvpe_next(top)) {
732
+ pq_pop(tvpe_pq);
733
+ free(top);
734
+ }
735
+ else {
736
+ pq_down(tvpe_pq);
737
+ }
738
+ }
739
+ pq_destroy(tvpe_pq);
740
+ }
741
+ return self;
742
+ }
527
743
 
528
- phrase_freq = (self->doc == doc_num) ? phsc->freq : (float)0.0;
529
- return expl_create(sim_tf(self->similarity, phrase_freq),
530
- strfmt("tf(phrase_freq=%f)", phrase_freq));
744
+ static TVPosEnum *get_tvpe(TermVector *tv, char **terms, int t_cnt, int offset)
745
+ {
746
+ TVPosEnum *tvpe = NULL;
747
+ if (t_cnt == 1) {
748
+ TVTerm *tv_term = tv_get_tv_term(tv, terms[0]);
749
+ if (tv_term) {
750
+ tvpe = tvpe_new(tv_term->positions, tv_term->freq, offset);
751
+ }
752
+ }
753
+ else {
754
+ tvpe = tvpe_new_merge(terms, t_cnt, tv, offset);
755
+ }
756
+ return tvpe;
531
757
  }
532
758
 
533
- static void phsc_destroy(Scorer *self)
759
+ static MatchVector *phq_get_matchv_i(Query *self, MatchVector *mv,
760
+ TermVector *tv)
534
761
  {
535
- GET_PHSC;
536
- int i;
537
- for (i = phsc->pp_cnt - 1; i >= 0; i--) {
538
- pp_destroy(phsc->phrase_pos[i]);
539
- }
540
- free(phsc->phrase_pos);
541
- scorer_destroy_i(self);
762
+ if (strcmp(tv->field, PhQ(self)->field) == 0) {
763
+ const int pos_cnt = PhQ(self)->pos_cnt;
764
+ int i;
765
+ int slop = PhQ(self)->slop;
766
+ bool done = false;
767
+
768
+ if (slop > 0) {
769
+ PriorityQueue *tvpe_pq = pq_new(pos_cnt, (lt_ft)tvpe_lt, &free);
770
+ int last_pos = 0;
771
+ for (i = 0; i < pos_cnt; i++) {
772
+ PhrasePosition *pp = &(PhQ(self)->positions[i]);
773
+ const int t_cnt = ary_size(pp->terms);
774
+ TVPosEnum *tvpe = get_tvpe(tv, pp->terms, t_cnt, pp->pos);
775
+ if (tvpe && tvpe_next(tvpe)) {
776
+ if (tvpe->pos > last_pos) {
777
+ last_pos = tvpe->pos;
778
+ }
779
+ pq_push(tvpe_pq, tvpe);
780
+ }
781
+ else {
782
+ done = true;
783
+ free(tvpe);
784
+ break;
785
+ }
786
+ }
787
+ while (! done) {
788
+ TVPosEnum *tvpe = pq_pop(tvpe_pq);
789
+ int pos;
790
+ int start = pos = tvpe->pos;
791
+ int next_pos = ((TVPosEnum *)pq_top(tvpe_pq))->pos;
792
+ while (pos <= next_pos) {
793
+ start = pos;
794
+ if (!tvpe_next(tvpe)) {
795
+ done = true;
796
+ break;
797
+ }
798
+ pos = tvpe->pos;
799
+ }
800
+
801
+ if (last_pos - start <= slop) {
802
+ int min, max = min = start + tvpe->offset;
803
+ for (i = tvpe_pq->size; i > 0; i--) {
804
+ TVPosEnum *t = (TVPosEnum *)tvpe_pq->heap[i];
805
+ int p = t->pos + t->offset;
806
+ max = p > max ? p : max;
807
+ min = p < min ? p : min;
808
+ }
809
+ matchv_add(mv, min, max);
810
+ }
811
+ if (tvpe->pos > last_pos) {
812
+ last_pos = tvpe->pos;
813
+ }
814
+ pq_push(tvpe_pq, tvpe);
815
+ }
816
+
817
+ pq_destroy(tvpe_pq);
818
+ }
819
+ else { /* exact match */
820
+ TVPosEnum **tvpe_a = ALLOC_AND_ZERO_N(TVPosEnum *, pos_cnt);
821
+ TVPosEnum *first, *last;
822
+ int first_index = 0;
823
+ done = false;
824
+ qsort(PhQ(self)->positions, pos_cnt, sizeof(PhrasePosition),
825
+ &phrase_pos_cmp);
826
+ for (i = 0; i < pos_cnt; i++) {
827
+ PhrasePosition *pp = &(PhQ(self)->positions[i]);
828
+ const int t_cnt = ary_size(pp->terms);
829
+ TVPosEnum *tvpe = get_tvpe(tv, pp->terms, t_cnt, pp->pos);
830
+ if (tvpe && ((i == 0 && tvpe_next(tvpe))
831
+ || tvpe_skip_to(tvpe, tvpe_a[i-1]->pos))) {
832
+ tvpe_a[i] = tvpe;
833
+ }
834
+ else {
835
+ done = true;
836
+ free(tvpe);
837
+ break;
838
+ }
839
+ }
840
+
841
+ first = tvpe_a[0];
842
+ last = tvpe_a[pos_cnt - 1];
843
+
844
+ while (!done) {
845
+ while (first->pos < last->pos) {
846
+ if (tvpe_skip_to(first, last->pos)) {
847
+ last = first;
848
+ first_index = NEXT_NUM(first_index, pos_cnt);
849
+ first = tvpe_a[first_index];
850
+ }
851
+ else {
852
+ done = true;
853
+ break;
854
+ }
855
+ }
856
+ if (!done) {
857
+ matchv_add(mv, tvpe_a[0]->pos + tvpe_a[0]->offset,
858
+ tvpe_a[pos_cnt-1]->pos + tvpe_a[pos_cnt-1]->offset);
859
+ }
860
+ if (!tvpe_next(last)) {
861
+ done = true;
862
+ }
863
+ }
864
+ for (i = 0; i < pos_cnt; i++) {
865
+ free(tvpe_a[i]);
866
+ }
867
+ free(tvpe_a);
868
+ }
869
+ }
870
+ return mv;
542
871
  }
543
872
 
544
- Scorer *phsc_create(Weight *weight, TermDocEnum **term_pos_enum,
545
- int *positions, int t_cnt, Similarity *similarity, uchar *norms)
873
+
874
+ /* ** PhraseQuery besides highlighting stuff ** */
875
+
876
+ #define PhQ_INIT_CAPA 4
877
+
878
+ static void phq_extract_terms(Query *self, HashSet *term_set)
546
879
  {
547
- int i;
548
- Scorer *self = scorer_create(similarity);
549
- PhraseScorer *phsc = ALLOC(PhraseScorer);
550
- ZEROSET(phsc, PhraseScorer, 1);
880
+ PhraseQuery *phq = PhQ(self);
881
+ int i, j;
882
+ for (i = 0; i < phq->pos_cnt; i++) {
883
+ char **terms = phq->positions[i].terms;
884
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
885
+ hs_add(term_set, term_new(phq->field, terms[j]));
886
+ }
887
+ }
888
+ }
551
889
 
552
- phsc->weight = weight;
553
- phsc->norms = norms;
554
- phsc->value = weight->value;
890
+ static char *phq_to_s(Query *self, const char *field)
891
+ {
892
+ PhraseQuery *phq = PhQ(self);
893
+ const int pos_cnt = phq->pos_cnt;
894
+ PhrasePosition *positions = phq->positions;
555
895
 
556
- phsc->phrase_pos = ALLOC_N(PhrasePosition *, t_cnt);
557
- for (i = 0; i < t_cnt; i++) {
558
- phsc->phrase_pos[i] = pp_create(term_pos_enum[i], positions[i]);
559
- }
560
- phsc->pp_first = 0;
561
- phsc->pp_last = t_cnt - 1;
562
- phsc->pp_cnt = t_cnt;
896
+ int i, j, buf_index = 0, pos, last_pos;
897
+ size_t len = 0;
898
+ char *buffer;
563
899
 
564
- phsc->slop = 0;
900
+ if (phq->pos_cnt == 0) {
901
+ return NULL;
902
+ }
565
903
 
566
- phsc->first_time = true;
567
- phsc->more = true;
904
+ /* sort the phrase positions by position */
905
+ qsort(positions, pos_cnt, sizeof(PhrasePosition), &phrase_pos_cmp);
568
906
 
569
- self->data = phsc;
570
- self->score = &phsc_score;
571
- self->next = &phsc_next;
572
- self->skip_to = &phsc_skip_to;
573
- self->explain = &phsc_explain;
574
- self->destroy = &phsc_destroy;
907
+ len = strlen(phq->field) + 1;
575
908
 
576
- return self;
577
- }
909
+ for (i = 0; i < pos_cnt; i++) {
910
+ char **terms = phq->positions[i].terms;
911
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
912
+ len += strlen(terms[j]) + 5;
913
+ }
914
+ }
578
915
 
579
- /***************************************************************************
580
- * ExactPhraseScorer
581
- ***************************************************************************/
916
+ /* add space for extra <> characters and boost and slop */
917
+ len += 100 + 3
918
+ * (phq->positions[phq->pos_cnt - 1].pos - phq->positions[0].pos);
582
919
 
583
- float ephsc_phrase_freq(Scorer *self)
584
- {
585
- GET_PHSC;
586
- // sort list with pq
587
- int i;
588
- float freq = 0.0;
589
- PhrasePosition *first;
590
- PhrasePosition *last;
920
+ buffer = ALLOC_N(char, len);
921
+
922
+ if (strcmp(field, phq->field) != 0) {
923
+ len = strlen(phq->field);
924
+ memcpy(buffer, phq->field, len);
925
+ buffer[len] = ':';
926
+ buf_index += len + 1;
927
+ }
591
928
 
592
- for (i = phsc->pp_cnt - 1; i >= 0; i--) {
593
- pp_first_position(phsc->phrase_pos[i]);
594
- }
595
- qsort(phsc->phrase_pos, phsc->pp_cnt, sizeof(PhrasePosition *), &pp_cmp);
596
- phsc->pp_first = 0;
597
- phsc->pp_last = phsc->pp_cnt - 1;
929
+ buffer[buf_index++] = '"';
930
+
931
+ last_pos = positions[0].pos - 1;
932
+ for (i = 0; i < pos_cnt; i++) {
933
+ char **terms = positions[i].terms;
934
+ const int t_cnt = ary_size(terms);
935
+
936
+ pos = positions[i].pos;
937
+ if (pos == last_pos) {
938
+ buffer[buf_index - 1] = '&';
939
+ }
940
+ else {
941
+ for (j = last_pos; j < pos - 1; j++) {
942
+ memcpy(buffer + buf_index, "<> ", 3);
943
+ buf_index += 3;
944
+ }
945
+ }
946
+
947
+ last_pos = pos;
948
+ for (j = 0; j < t_cnt; j++) {
949
+ char *term = terms[j];
950
+ len = strlen(term);
951
+ memcpy(buffer + buf_index, term, len);
952
+ buf_index += len;
953
+ buffer[buf_index++] = '|';
954
+ }
955
+ buffer[buf_index-1] = ' '; /* change last '|' to ' ' */
956
+ }
957
+
958
+ if (buffer[buf_index-1] == ' ') {
959
+ buf_index--;
960
+ }
598
961
 
599
- first = phsc->phrase_pos[0];
600
- last = phsc->phrase_pos[phsc->pp_last];
962
+ buffer[buf_index++] = '"';
963
+ buffer[buf_index] = 0;
601
964
 
602
- do { // find position w/ all terms
603
- while (first->position < last->position) { // scan forward in first
604
- do {
605
- if (! pp_next_position(first)) return freq;
606
- } while (first->position < last->position);
607
- FIRST_TO_LAST();
965
+ if (phq->slop != 0) {
966
+ sprintf(buffer + buf_index, "~%d", phq->slop);
967
+ buf_index += strlen(buffer + buf_index);
608
968
  }
609
- freq += 1.0; // all equal: a match
610
- } while (pp_next_position(last));
611
969
 
612
- return freq;
970
+ if (self->boost != 1.0) {
971
+ buffer[buf_index++] = '^';
972
+ dbl_to_s(buffer + buf_index, self->boost);
973
+ }
974
+
975
+ return buffer;
613
976
  }
614
977
 
615
- Scorer *exact_phrase_scorer_create(Weight *weight, TermDocEnum **term_pos_enum,
616
- int *positions, int t_cnt, Similarity *similarity, uchar *norms)
978
+ static void phq_destroy(Query *self)
617
979
  {
618
- Scorer *self =
619
- phsc_create(weight, term_pos_enum, positions, t_cnt, similarity, norms);
620
- GET_PHSC;
621
- phsc->phrase_freq = &ephsc_phrase_freq;
622
- return self;
980
+ PhraseQuery *phq = PhQ(self);
981
+ int i;
982
+ free(phq->field);
983
+ for (i = 0; i < phq->pos_cnt; i++) {
984
+ ary_destroy(phq->positions[i].terms, &free);
985
+ }
986
+ free(phq->positions);
987
+ q_destroy_i(self);
623
988
  }
624
989
 
625
- /***************************************************************************
626
- * SloppyPhraseScorer
627
- ***************************************************************************/
628
-
629
- float sphsc_phrase_freq(Scorer *self)
990
+ static Query *phq_rewrite(Query *self, IndexReader *ir)
630
991
  {
631
- GET_PHSC;
632
- PhrasePosition *pp;
633
- PriorityQueue *pq = pq_create(phsc->pp_cnt, &pp_less_than);
634
-
635
- int last_pos = 0, pos, next_pos, start, match_length, i;
636
- bool done = false;
637
- float freq = 0.0;
638
-
639
- for (i = phsc->pp_cnt - 1; i >= 0; i--) {
640
- pp = phsc->phrase_pos[i];
641
- pp_first_position(pp);
642
- if (pp->position > last_pos) last_pos = pp->position;
643
- pq_push(pq, pp);
644
- }
645
-
646
- do {
647
- pp = pq_pop(pq);
648
- pos = start = pp->position;
649
- next_pos = ((PhrasePosition *)pq_top(pq))->position;
650
- while (pos <= next_pos) {
651
- start = pos; // advance pp to min window
652
- if (!pp_next_position(pp)) {
653
- done = true; // ran out of a term -- done
654
- break;
655
- }
656
- pos = pp->position;
992
+ PhraseQuery *phq = PhQ(self);
993
+ (void)ir;
994
+ if (phq->pos_cnt == 1) {
995
+ /* optimize one-position case */
996
+ char **terms = phq->positions[0].terms;
997
+ const int t_cnt = ary_size(terms);
998
+ if (t_cnt == 1) {
999
+ Query *tq = tq_new(phq->field, terms[0]);
1000
+ tq->boost = self->boost;
1001
+ return tq;
1002
+ }
1003
+ else {
1004
+ Query *q = multi_tq_new(phq->field);
1005
+ int i;
1006
+ for (i = 0; i < t_cnt; i++) {
1007
+ multi_tq_add_term(q, terms[i]);
1008
+ }
1009
+ q->boost = self->boost;
1010
+ return q;
1011
+ }
1012
+ } else {
1013
+ self->ref_cnt++;
1014
+ return self;
657
1015
  }
1016
+ }
658
1017
 
659
- match_length = last_pos - start;
660
- if (match_length <= phsc->slop) {
661
- freq += sim_sloppy_freq(self->similarity, match_length); // score match
1018
+ static ulong phq_hash(Query *self)
1019
+ {
1020
+ int i, j;
1021
+ PhraseQuery *phq = PhQ(self);
1022
+ ulong hash = str_hash(phq->field);
1023
+ for (i = 0; i < phq->pos_cnt; i++) {
1024
+ char **terms = phq->positions[i].terms;
1025
+ for (j = ary_size(terms) - 1; j >= 0; j--) {
1026
+ hash = (hash << 1) ^ (str_hash(terms[j])
1027
+ ^ phq->positions[i].pos);
1028
+ }
662
1029
  }
1030
+ return (hash ^ phq->slop);
1031
+ }
663
1032
 
664
- if (pp->position > last_pos) {
665
- last_pos = pp->position;
1033
+ static int phq_eq(Query *self, Query *o)
1034
+ {
1035
+ int i, j;
1036
+ PhraseQuery *phq1 = PhQ(self);
1037
+ PhraseQuery *phq2 = PhQ(o);
1038
+ if (phq1->slop != phq2->slop
1039
+ || strcmp(phq1->field, phq2->field) != 0
1040
+ || phq1->pos_cnt != phq2->pos_cnt) {
1041
+ return false;
1042
+ }
1043
+ for (i = 0; i < phq1->pos_cnt; i++) {
1044
+ char **terms1 = phq1->positions[i].terms;
1045
+ char **terms2 = phq2->positions[i].terms;
1046
+ const int t_cnt = ary_size(terms1);
1047
+ if (t_cnt != ary_size(terms2)
1048
+ || phq1->positions[i].pos != phq2->positions[i].pos) {
1049
+ return false;
1050
+ }
1051
+ for (j = 0; j < t_cnt; j++) {
1052
+ if (strcmp(terms1[j], terms2[j]) != 0) {
1053
+ return false;
1054
+ }
1055
+ }
666
1056
  }
667
- pq_push(pq, pp); // restore pq
668
- } while (!done);
1057
+ return true;
1058
+ }
1059
+
1060
+ Query *phq_new(const char *field)
1061
+ {
1062
+ Query *self = q_new(PhraseQuery);
1063
+
1064
+ PhQ(self)->field = estrdup(field);
1065
+ PhQ(self)->pos_cnt = 0;
1066
+ PhQ(self)->pos_capa = PhQ_INIT_CAPA;
1067
+ PhQ(self)->positions = ALLOC_N(PhrasePosition, PhQ_INIT_CAPA);
1068
+
1069
+ self->type = PHRASE_QUERY;
1070
+ self->rewrite = &phq_rewrite;
1071
+ self->extract_terms = &phq_extract_terms;
1072
+ self->to_s = &phq_to_s;
1073
+ self->hash = &phq_hash;
1074
+ self->eq = &phq_eq;
1075
+ self->destroy_i = &phq_destroy;
1076
+ self->create_weight_i = &phw_new;
1077
+ self->get_matchv_i = &phq_get_matchv_i;
1078
+ return self;
1079
+ }
669
1080
 
670
- pq_destroy(pq);
671
- return freq;
1081
+ void phq_add_term_abs(Query *self, const char *term, int position)
1082
+ {
1083
+ PhraseQuery *phq = PhQ(self);
1084
+ int index = phq->pos_cnt;
1085
+ PhrasePosition *pp;
1086
+ if (index >= phq->pos_capa) {
1087
+ phq->pos_capa <<= 1;
1088
+ REALLOC_N(phq->positions, PhrasePosition, phq->pos_capa);
1089
+ }
1090
+ pp = &(phq->positions[index]);
1091
+ pp->terms = ary_new_type_capa(char *, 2);
1092
+ ary_push(pp->terms, estrdup(term));
1093
+ pp->pos = position;
1094
+ phq->pos_cnt++;
672
1095
  }
673
-
674
- Scorer *sloppy_phrase_scorer_create(Weight *weight, TermDocEnum **term_pos_enum,
675
- int *positions, int t_cnt, Similarity *similarity, int slop, uchar *norms)
1096
+
1097
+ void phq_add_term(Query *self, const char *term, int pos_inc)
676
1098
  {
677
- Scorer *self =
678
- phsc_create(weight, term_pos_enum, positions, t_cnt, similarity, norms);
679
- GET_PHSC;
680
- phsc->slop = slop;
681
- phsc->phrase_freq = &sphsc_phrase_freq;
682
- return self;
1099
+ PhraseQuery *phq = PhQ(self);
1100
+ int position;
1101
+ if (phq->pos_cnt == 0) {
1102
+ position = 0;
1103
+ }
1104
+ else {
1105
+ position = phq->positions[phq->pos_cnt - 1].pos + pos_inc;
1106
+ }
1107
+ phq_add_term_abs(self, term, position);
683
1108
  }
684
1109
 
1110
+ void phq_append_multi_term(Query *self, const char *term)
1111
+ {
1112
+ PhraseQuery *phq = PhQ(self);
1113
+ int index = phq->pos_cnt - 1;
1114
+
1115
+ if (index < 0) {
1116
+ phq_add_term(self, term, 0);
1117
+ }
1118
+ else {
1119
+ ary_push(phq->positions[index].terms, estrdup(term));
1120
+ }
1121
+ }