isomorfeus-ferret 0.12.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (222) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +612 -0
  3. data/README.md +44 -0
  4. data/ext/isomorfeus_ferret_ext/benchmark.c +223 -0
  5. data/ext/isomorfeus_ferret_ext/benchmark.h +45 -0
  6. data/ext/isomorfeus_ferret_ext/benchmarks_all.h +25 -0
  7. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +123 -0
  8. data/ext/isomorfeus_ferret_ext/bm_hash.c +118 -0
  9. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +40 -0
  10. data/ext/isomorfeus_ferret_ext/bm_store.c +93 -0
  11. data/ext/isomorfeus_ferret_ext/email.rl +21 -0
  12. data/ext/isomorfeus_ferret_ext/extconf.rb +5 -0
  13. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -0
  14. data/ext/isomorfeus_ferret_ext/frb_analysis.c +2577 -0
  15. data/ext/isomorfeus_ferret_ext/frb_index.c +3457 -0
  16. data/ext/isomorfeus_ferret_ext/frb_lang.c +9 -0
  17. data/ext/isomorfeus_ferret_ext/frb_lang.h +17 -0
  18. data/ext/isomorfeus_ferret_ext/frb_qparser.c +629 -0
  19. data/ext/isomorfeus_ferret_ext/frb_search.c +4460 -0
  20. data/ext/isomorfeus_ferret_ext/frb_store.c +515 -0
  21. data/ext/isomorfeus_ferret_ext/frb_threading.h +30 -0
  22. data/ext/isomorfeus_ferret_ext/frb_utils.c +1127 -0
  23. data/ext/isomorfeus_ferret_ext/frt_analysis.c +1644 -0
  24. data/ext/isomorfeus_ferret_ext/frt_analysis.h +247 -0
  25. data/ext/isomorfeus_ferret_ext/frt_array.c +124 -0
  26. data/ext/isomorfeus_ferret_ext/frt_array.h +54 -0
  27. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +95 -0
  28. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +586 -0
  29. data/ext/isomorfeus_ferret_ext/frt_compound_io.c +374 -0
  30. data/ext/isomorfeus_ferret_ext/frt_config.h +44 -0
  31. data/ext/isomorfeus_ferret_ext/frt_document.c +134 -0
  32. data/ext/isomorfeus_ferret_ext/frt_document.h +52 -0
  33. data/ext/isomorfeus_ferret_ext/frt_except.c +95 -0
  34. data/ext/isomorfeus_ferret_ext/frt_except.h +188 -0
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +233 -0
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +42 -0
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +157 -0
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +502 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +427 -0
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +290 -0
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +518 -0
  42. data/ext/isomorfeus_ferret_ext/frt_hash.h +466 -0
  43. data/ext/isomorfeus_ferret_ext/frt_hashset.c +191 -0
  44. data/ext/isomorfeus_ferret_ext/frt_hashset.h +206 -0
  45. data/ext/isomorfeus_ferret_ext/frt_helper.c +62 -0
  46. data/ext/isomorfeus_ferret_ext/frt_helper.h +13 -0
  47. data/ext/isomorfeus_ferret_ext/frt_ind.c +353 -0
  48. data/ext/isomorfeus_ferret_ext/frt_ind.h +54 -0
  49. data/ext/isomorfeus_ferret_ext/frt_index.c +6377 -0
  50. data/ext/isomorfeus_ferret_ext/frt_index.h +880 -0
  51. data/ext/isomorfeus_ferret_ext/frt_lang.c +104 -0
  52. data/ext/isomorfeus_ferret_ext/frt_lang.h +44 -0
  53. data/ext/isomorfeus_ferret_ext/frt_mempool.c +87 -0
  54. data/ext/isomorfeus_ferret_ext/frt_mempool.h +33 -0
  55. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +349 -0
  56. data/ext/isomorfeus_ferret_ext/frt_multimapper.h +52 -0
  57. data/ext/isomorfeus_ferret_ext/frt_posh.c +1006 -0
  58. data/ext/isomorfeus_ferret_ext/frt_posh.h +973 -0
  59. data/ext/isomorfeus_ferret_ext/frt_priorityqueue.c +147 -0
  60. data/ext/isomorfeus_ferret_ext/frt_priorityqueue.h +147 -0
  61. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +1612 -0
  62. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +157 -0
  63. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +209 -0
  64. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +281 -0
  65. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +147 -0
  66. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +672 -0
  67. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +3084 -0
  68. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +1182 -0
  69. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +98 -0
  70. data/ext/isomorfeus_ferret_ext/frt_q_range.c +665 -0
  71. data/ext/isomorfeus_ferret_ext/frt_q_span.c +2386 -0
  72. data/ext/isomorfeus_ferret_ext/frt_q_term.c +311 -0
  73. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +166 -0
  74. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +460 -0
  75. data/ext/isomorfeus_ferret_ext/frt_scanner.c +899 -0
  76. data/ext/isomorfeus_ferret_ext/frt_scanner.h +28 -0
  77. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +6705 -0
  78. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +4419 -0
  79. data/ext/isomorfeus_ferret_ext/frt_search.c +1824 -0
  80. data/ext/isomorfeus_ferret_ext/frt_search.h +924 -0
  81. data/ext/isomorfeus_ferret_ext/frt_similarity.c +150 -0
  82. data/ext/isomorfeus_ferret_ext/frt_similarity.h +79 -0
  83. data/ext/isomorfeus_ferret_ext/frt_sort.c +796 -0
  84. data/ext/isomorfeus_ferret_ext/frt_stopwords.c +395 -0
  85. data/ext/isomorfeus_ferret_ext/frt_store.c +680 -0
  86. data/ext/isomorfeus_ferret_ext/frt_store.h +789 -0
  87. data/ext/isomorfeus_ferret_ext/frt_term_vectors.c +72 -0
  88. data/ext/isomorfeus_ferret_ext/frt_threading.h +23 -0
  89. data/ext/isomorfeus_ferret_ext/frt_win32.h +54 -0
  90. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +409 -0
  91. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +95 -0
  92. data/ext/isomorfeus_ferret_ext/libstemmer.c +93 -0
  93. data/ext/isomorfeus_ferret_ext/libstemmer.h +73 -0
  94. data/ext/isomorfeus_ferret_ext/q_parser.y +1366 -0
  95. data/ext/isomorfeus_ferret_ext/scanner.h +28 -0
  96. data/ext/isomorfeus_ferret_ext/scanner.in +43 -0
  97. data/ext/isomorfeus_ferret_ext/scanner.rl +84 -0
  98. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +200 -0
  99. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +85 -0
  100. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +324 -0
  101. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +7 -0
  102. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +610 -0
  103. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +6 -0
  104. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +1104 -0
  105. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +6 -0
  106. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +749 -0
  107. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +7 -0
  108. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +1233 -0
  109. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +6 -0
  110. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +490 -0
  111. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +6 -0
  112. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +1217 -0
  113. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +7 -0
  114. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +1052 -0
  115. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +6 -0
  116. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +283 -0
  117. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +6 -0
  118. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +735 -0
  119. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +6 -0
  120. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +1003 -0
  121. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +7 -0
  122. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +1079 -0
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +6 -0
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +293 -0
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +6 -0
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +984 -0
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +6 -0
  128. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +686 -0
  129. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +6 -0
  130. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.c +325 -0
  131. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.h +6 -0
  132. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.c +620 -0
  133. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.h +6 -0
  134. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.c +1111 -0
  135. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.h +6 -0
  136. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.c +754 -0
  137. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.h +6 -0
  138. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.c +1242 -0
  139. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.h +6 -0
  140. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.c +495 -0
  141. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.h +6 -0
  142. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.c +1220 -0
  143. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.h +6 -0
  144. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.c +1059 -0
  145. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.h +6 -0
  146. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.c +285 -0
  147. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.h +6 -0
  148. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.c +741 -0
  149. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.h +6 -0
  150. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.c +1009 -0
  151. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.h +6 -0
  152. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.c +990 -0
  153. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.h +6 -0
  154. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.c +680 -0
  155. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.h +6 -0
  156. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.c +1083 -0
  157. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.h +6 -0
  158. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.c +294 -0
  159. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.h +6 -0
  160. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.c +2191 -0
  161. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.h +6 -0
  162. data/ext/isomorfeus_ferret_ext/stem_api.c +66 -0
  163. data/ext/isomorfeus_ferret_ext/stem_api.h +26 -0
  164. data/ext/isomorfeus_ferret_ext/stem_header.h +57 -0
  165. data/ext/isomorfeus_ferret_ext/stem_modules.h +190 -0
  166. data/ext/isomorfeus_ferret_ext/stem_modules.txt +50 -0
  167. data/ext/isomorfeus_ferret_ext/stem_utilities.c +478 -0
  168. data/ext/isomorfeus_ferret_ext/test.c +850 -0
  169. data/ext/isomorfeus_ferret_ext/test.h +416 -0
  170. data/ext/isomorfeus_ferret_ext/test_1710.c +63 -0
  171. data/ext/isomorfeus_ferret_ext/test_analysis.c +1221 -0
  172. data/ext/isomorfeus_ferret_ext/test_array.c +272 -0
  173. data/ext/isomorfeus_ferret_ext/test_bitvector.c +600 -0
  174. data/ext/isomorfeus_ferret_ext/test_compound_io.c +170 -0
  175. data/ext/isomorfeus_ferret_ext/test_document.c +156 -0
  176. data/ext/isomorfeus_ferret_ext/test_except.c +244 -0
  177. data/ext/isomorfeus_ferret_ext/test_fields.c +522 -0
  178. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +185 -0
  179. data/ext/isomorfeus_ferret_ext/test_filter.c +331 -0
  180. data/ext/isomorfeus_ferret_ext/test_fs_store.c +25 -0
  181. data/ext/isomorfeus_ferret_ext/test_global.c +299 -0
  182. data/ext/isomorfeus_ferret_ext/test_hash.c +485 -0
  183. data/ext/isomorfeus_ferret_ext/test_hashset.c +288 -0
  184. data/ext/isomorfeus_ferret_ext/test_helper.c +47 -0
  185. data/ext/isomorfeus_ferret_ext/test_highlighter.c +548 -0
  186. data/ext/isomorfeus_ferret_ext/test_index.c +2323 -0
  187. data/ext/isomorfeus_ferret_ext/test_lang.c +74 -0
  188. data/ext/isomorfeus_ferret_ext/test_mempool.c +102 -0
  189. data/ext/isomorfeus_ferret_ext/test_multimapper.c +64 -0
  190. data/ext/isomorfeus_ferret_ext/test_priorityqueue.c +213 -0
  191. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +84 -0
  192. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +61 -0
  193. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +241 -0
  194. data/ext/isomorfeus_ferret_ext/test_q_parser.c +464 -0
  195. data/ext/isomorfeus_ferret_ext/test_q_span.c +575 -0
  196. data/ext/isomorfeus_ferret_ext/test_ram_store.c +77 -0
  197. data/ext/isomorfeus_ferret_ext/test_search.c +1874 -0
  198. data/ext/isomorfeus_ferret_ext/test_segments.c +167 -0
  199. data/ext/isomorfeus_ferret_ext/test_similarity.c +25 -0
  200. data/ext/isomorfeus_ferret_ext/test_sort.c +333 -0
  201. data/ext/isomorfeus_ferret_ext/test_store.c +591 -0
  202. data/ext/isomorfeus_ferret_ext/test_store.h +3 -0
  203. data/ext/isomorfeus_ferret_ext/test_term.c +351 -0
  204. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +373 -0
  205. data/ext/isomorfeus_ferret_ext/test_test.c +83 -0
  206. data/ext/isomorfeus_ferret_ext/test_threading.c +188 -0
  207. data/ext/isomorfeus_ferret_ext/testhelper.c +561 -0
  208. data/ext/isomorfeus_ferret_ext/testhelper.h +25 -0
  209. data/ext/isomorfeus_ferret_ext/tests_all.h +87 -0
  210. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +1854 -0
  211. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +1999 -0
  212. data/ext/isomorfeus_ferret_ext/url.rl +27 -0
  213. data/ext/isomorfeus_ferret_ext/word_list.h +15156 -0
  214. data/lib/isomorfeus/ferret/document.rb +132 -0
  215. data/lib/isomorfeus/ferret/field_symbol.rb +85 -0
  216. data/lib/isomorfeus/ferret/index/field_infos.rb +48 -0
  217. data/lib/isomorfeus/ferret/index/index.rb +970 -0
  218. data/lib/isomorfeus/ferret/monitor.rb +323 -0
  219. data/lib/isomorfeus/ferret/stdlib_patches.rb +151 -0
  220. data/lib/isomorfeus/ferret/version.rb +5 -0
  221. data/lib/isomorfeus-ferret.rb +8 -0
  222. metadata +307 -0
@@ -0,0 +1,1182 @@
1
+ #include <string.h>
2
+ #include <limits.h>
3
+ #include "frt_global.h"
4
+ #include "frt_search.h"
5
+ #include "frt_array.h"
6
+
7
+ #define PhQ(query) ((FrtPhraseQuery *)(query))
8
+
9
+ /**
10
+ * Used to sort the phrase positions into positional order. For phrase
11
+ * positions matching at the same position (a very unusual case) we order by
12
+ * first terms. The only real reason for the sorting by first terms is to get
13
+ consistent order of positions when testing. Functionally it makes no
14
+ * difference.
15
+ */
16
+ static int phrase_pos_cmp(const void *p1, const void *p2)
17
+ {
18
+ int pos1 = ((FrtPhrasePosition *)p1)->pos;
19
+ int pos2 = ((FrtPhrasePosition *)p2)->pos;
20
+ if (pos1 > pos2) {
21
+ return 1;
22
+ }
23
+ if (pos1 < pos2) {
24
+ return -1;
25
+ }
26
+ return strcmp(((FrtPhrasePosition *)p1)->terms[0], ((FrtPhrasePosition *)p2)->terms[0]);
27
+ }
28
+
29
+
30
+ /***************************************************************************
31
+ *
32
+ * PhraseScorer
33
+ *
34
+ ***************************************************************************/
35
+
36
+ /***************************************************************************
37
+ * PhPos
38
+ ***************************************************************************/
39
+
40
+ #define PP(p) ((PhPos *)(p))
41
+ typedef struct PhPos
42
+ {
43
+ FrtTermDocEnum *tpe;
44
+ int offset;
45
+ int count;
46
+ int doc;
47
+ int position;
48
+ } PhPos;
49
+
50
+ static bool pp_next(PhPos *self)
51
+ {
52
+ FrtTermDocEnum *tpe = self->tpe;
53
+ assert(tpe);
54
+
55
+ if (!tpe->next(tpe)) {
56
+ tpe->close(tpe); /* close stream */
57
+ self->tpe = NULL;
58
+ self->doc = INT_MAX; /* sentinel value */
59
+ return false;
60
+ }
61
+ self->doc = tpe->doc_num(tpe);
62
+ self->position = 0;
63
+ return true;
64
+ }
65
+
66
+ static bool pp_skip_to(PhPos *self, int doc_num)
67
+ {
68
+ FrtTermDocEnum *tpe = self->tpe;
69
+ assert(tpe);
70
+
71
+ if (!tpe->skip_to(tpe, doc_num)) {
72
+ tpe->close(tpe); /* close stream */
73
+ self->tpe = NULL;
74
+ self->doc = INT_MAX; /* sentinel value */
75
+ return false;
76
+ }
77
+ self->doc = tpe->doc_num(tpe);
78
+ self->position = 0;
79
+ return true;
80
+ }
81
+
82
+ static bool pp_next_position(PhPos *self)
83
+ {
84
+ FrtTermDocEnum *tpe = self->tpe;
85
+ self->count--;
86
+ if (self->count >= 0) { /* read subsequent pos's */
87
+ self->position = tpe->next_position(tpe) - self->offset;
88
+ return true;
89
+ }
90
+ else {
91
+ return false;
92
+ }
93
+ }
94
+
95
+ static bool pp_first_position(PhPos *self)
96
+ {
97
+ FrtTermDocEnum *tpe = self->tpe;
98
+ self->count = tpe->freq(tpe); /* read first pos */
99
+ return pp_next_position(self);
100
+ }
101
+
102
+ #define PP_pp(p) (*(PhPos **)p)
103
+ static int pp_cmp(const void *const p1, const void *const p2)
104
+ {
105
+ int cmp = PP_pp(p1)->doc - PP_pp(p2)->doc;
106
+ if (cmp == 0) {
107
+ return PP_pp(p1)->position - PP_pp(p2)->position;
108
+ }
109
+ else {
110
+ return cmp;
111
+ }
112
+ }
113
+
114
+ static int pp_pos_cmp(const void *const p1, const void *const p2)
115
+ {
116
+ return PP_pp(p1)->position - PP_pp(p2)->position;
117
+ }
118
+
119
+ static bool pp_less_than(const PhPos *pp1, const PhPos *pp2)
120
+ {
121
+ if (pp1->position == pp2->position) {
122
+ return pp1->offset < pp2->offset;
123
+ }
124
+ else {
125
+ return pp1->position < pp2->position;
126
+ }
127
+ }
128
+
129
+ static void pp_destroy(PhPos *pp)
130
+ {
131
+ if (pp->tpe) {
132
+ pp->tpe->close(pp->tpe);
133
+ }
134
+ free(pp);
135
+ }
136
+
137
+ static PhPos *pp_new(FrtTermDocEnum *tpe, int offset)
138
+ {
139
+ PhPos *self = FRT_ALLOC(PhPos);
140
+
141
+ self->tpe = tpe;
142
+ self->count = self->doc = self->position = -1;
143
+ self->offset = offset;
144
+
145
+ return self;
146
+ }
147
+
148
+ /***************************************************************************
149
+ * PhraseScorer
150
+ ***************************************************************************/
151
+
152
+ #define PhSc(scorer) ((PhraseScorer *)(scorer))
153
+
154
+ typedef struct PhraseScorer
155
+ {
156
+ FrtScorer super;
157
+ float (*phrase_freq)(FrtScorer *self);
158
+ float freq;
159
+ frt_uchar *norms;
160
+ float value;
161
+ FrtWeight *weight;
162
+ PhPos **phrase_pos;
163
+ int pp_first_idx;
164
+ int pp_cnt;
165
+ int slop;
166
+ bool first_time : 1;
167
+ bool more : 1;
168
+ bool check_repeats : 1;
169
+ } PhraseScorer;
170
+
171
+ static void phsc_init(PhraseScorer *phsc)
172
+ {
173
+ int i;
174
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
175
+ if (!(phsc->more = pp_next(phsc->phrase_pos[i]))) break;
176
+ }
177
+
178
+ if (phsc->more) {
179
+ qsort(phsc->phrase_pos, phsc->pp_cnt,
180
+ sizeof(PhPos *), &pp_cmp);
181
+ phsc->pp_first_idx = 0;
182
+ }
183
+ }
184
+
185
+ static bool phsc_do_next(FrtScorer *self)
186
+ {
187
+ PhraseScorer *phsc = PhSc(self);
188
+ const int pp_cnt = phsc->pp_cnt;
189
+ int pp_first_idx = phsc->pp_first_idx;
190
+ PhPos **phrase_positions = phsc->phrase_pos;
191
+
192
+ PhPos *first = phrase_positions[pp_first_idx];
193
+ PhPos *last = phrase_positions[FRT_PREV_NUM(pp_first_idx, pp_cnt)];
194
+ while (phsc->more) {
195
+ /* find doc with all the terms */
196
+ while (phsc->more && first->doc < last->doc) {
197
+ /* skip first upto last */
198
+ phsc->more = pp_skip_to(first, last->doc);
199
+ last = first;
200
+ pp_first_idx = FRT_NEXT_NUM(pp_first_idx, pp_cnt);
201
+ first = phrase_positions[pp_first_idx];
202
+ }
203
+
204
+ if (phsc->more) {
205
+ /* pp_first_idx will be used by phrase_freq */
206
+ phsc->pp_first_idx = pp_first_idx;
207
+ /* found a doc with all of the terms */
208
+ phsc->freq = phsc->phrase_freq(self);
209
+
210
+ if (phsc->freq == 0.0) { /* no match */
211
+ /* continuing search so re-set first and last */
212
+ pp_first_idx = phsc->pp_first_idx;
213
+ first = phrase_positions[pp_first_idx];
214
+ last = phrase_positions[FRT_PREV_NUM(pp_first_idx, pp_cnt)];
215
+ phsc->more = pp_next(last); /* trigger further scanning */
216
+ } else {
217
+ self->doc = first->doc;
218
+ return true; /* found a match */
219
+ }
220
+
221
+ }
222
+ }
223
+ return false;
224
+ }
225
+
226
+ static float phsc_score(FrtScorer *self)
227
+ {
228
+ PhraseScorer *phsc = PhSc(self);
229
+ float raw_score = frt_sim_tf(self->similarity, phsc->freq) * phsc->value;
230
+ /* normalize */
231
+ return raw_score * frt_sim_decode_norm(
232
+ self->similarity,
233
+ phsc->norms[self->doc]);
234
+ }
235
+
236
+ static bool phsc_next(FrtScorer *self)
237
+ {
238
+ PhraseScorer *phsc = PhSc(self);
239
+ if (phsc->first_time) {
240
+ phsc_init(phsc);
241
+ phsc->first_time = false;
242
+ } else if (phsc->more) {
243
+ /* trigger further scanning */
244
+ phsc->more = pp_next(phsc->phrase_pos[FRT_PREV_NUM(phsc->pp_first_idx, phsc->pp_cnt)]);
245
+ }
246
+ return phsc_do_next(self);
247
+ }
248
+
249
+ static bool phsc_skip_to(FrtScorer *self, int doc_num)
250
+ {
251
+ PhraseScorer *phsc = PhSc(self);
252
+ int i;
253
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
254
+ if (!(phsc->more = pp_skip_to(phsc->phrase_pos[i], doc_num))) {
255
+ break;
256
+ }
257
+ }
258
+
259
+ if (phsc->more) {
260
+ qsort(phsc->phrase_pos, phsc->pp_cnt,
261
+ sizeof(PhPos *), &pp_cmp);
262
+ phsc->pp_first_idx = 0;
263
+ }
264
+ return phsc_do_next(self);
265
+ }
266
+
267
+ static FrtExplanation *phsc_explain(FrtScorer *self, int doc_num)
268
+ {
269
+ PhraseScorer *phsc = PhSc(self);
270
+ float phrase_freq;
271
+
272
+ phsc_skip_to(self, doc_num);
273
+
274
+ phrase_freq = (self->doc == doc_num) ? phsc->freq : 0.0f;
275
+ return frt_expl_new(frt_sim_tf(self->similarity, phrase_freq),
276
+ "tf(phrase_freq=%f)", phrase_freq);
277
+ }
278
+
279
+ static void phsc_destroy(FrtScorer *self)
280
+ {
281
+ PhraseScorer *phsc = PhSc(self);
282
+ int i;
283
+ for (i = phsc->pp_cnt - 1; i >= 0; i--) {
284
+ pp_destroy(phsc->phrase_pos[i]);
285
+ }
286
+ free(phsc->phrase_pos);
287
+ frt_scorer_destroy_i(self);
288
+ }
289
+
290
+ static FrtScorer *phsc_new(FrtWeight *weight,
291
+ FrtTermDocEnum **term_pos_enum,
292
+ FrtPhrasePosition *positions, int pos_cnt,
293
+ FrtSimilarity *similarity,
294
+ frt_uchar *norms,
295
+ int slop)
296
+ {
297
+ int i;
298
+ FrtScorer *self = frt_scorer_new(PhraseScorer, similarity);
299
+ FrtHashSet *term_set = NULL;
300
+
301
+ PhSc(self)->weight = weight;
302
+ PhSc(self)->norms = norms;
303
+ PhSc(self)->value = weight->value;
304
+ PhSc(self)->phrase_pos = FRT_ALLOC_N(PhPos *, pos_cnt);
305
+ PhSc(self)->pp_first_idx = 0;
306
+ PhSc(self)->pp_cnt = pos_cnt;
307
+ PhSc(self)->slop = slop;
308
+ PhSc(self)->first_time = true;
309
+ PhSc(self)->more = true;
310
+ PhSc(self)->check_repeats = false;
311
+
312
+ if (slop) {
313
+ term_set = frt_hs_new_str((frt_free_ft)NULL);
314
+ }
315
+ for (i = 0; i < pos_cnt; i++) {
316
+ /* check for repeats */
317
+ if (slop && !PhSc(self)->check_repeats) {
318
+ char **terms = positions[i].terms;
319
+ const int t_cnt = frt_ary_size(terms);
320
+ int j;
321
+ for (j = 0; j < t_cnt; j++) {
322
+ if (frt_hs_add(term_set, terms[j])) {
323
+ PhSc(self)->check_repeats = true;
324
+ break;
325
+ }
326
+ }
327
+ }
328
+ PhSc(self)->phrase_pos[i] = pp_new(term_pos_enum[i], positions[i].pos);
329
+ }
330
+
331
+ if (slop) {
332
+ frt_hs_destroy(term_set);
333
+ }
334
+
335
+ self->score = &phsc_score;
336
+ self->next = &phsc_next;
337
+ self->skip_to = &phsc_skip_to;
338
+ self->explain = &phsc_explain;
339
+ self->destroy = &phsc_destroy;
340
+
341
+ return self;
342
+ }
343
+
344
+ /***************************************************************************
345
+ * ExactPhraseScorer
346
+ ***************************************************************************/
347
+
348
+ static float ephsc_phrase_freq(FrtScorer *self)
349
+ {
350
+ PhraseScorer *phsc = PhSc(self);
351
+ int i;
352
+ int pp_first_idx = 0;
353
+ const int pp_cnt = phsc->pp_cnt;
354
+ float freq = 0.0f;
355
+ PhPos **phrase_positions = phsc->phrase_pos;
356
+ PhPos *first;
357
+ PhPos *last;
358
+
359
+ for (i = 0; i < pp_cnt; i++) {
360
+ pp_first_position(phrase_positions[i]);
361
+ }
362
+ qsort(phrase_positions, pp_cnt, sizeof(PhPos *), &pp_pos_cmp);
363
+
364
+ first = phrase_positions[0];
365
+ last = phrase_positions[pp_cnt - 1];
366
+
367
+ /* scan to position with all terms */
368
+ do {
369
+ /* scan forward in first */
370
+ while (first->position < last->position) {
371
+ do {
372
+ if (! pp_next_position(first)) {
373
+ /* maintain first position */
374
+ phsc->pp_first_idx = pp_first_idx;
375
+ return freq;
376
+ }
377
+ } while (first->position < last->position);
378
+ last = first;
379
+ pp_first_idx = FRT_NEXT_NUM(pp_first_idx, pp_cnt);
380
+ first = phrase_positions[pp_first_idx];
381
+ }
382
+ freq += 1.0f; /* all equal: a match */
383
+ } while (pp_next_position(last));
384
+
385
+ /* maintain first position */
386
+ phsc->pp_first_idx = pp_first_idx;
387
+ return freq;
388
+ }
389
+
390
+ static FrtScorer *exact_phrase_scorer_new(FrtWeight *weight,
391
+ FrtTermDocEnum **term_pos_enum,
392
+ FrtPhrasePosition *positions, int pp_cnt,
393
+ FrtSimilarity *similarity, frt_uchar *norms)
394
+ {
395
+ FrtScorer *self = phsc_new(weight,
396
+ term_pos_enum,
397
+ positions,
398
+ pp_cnt,
399
+ similarity,
400
+ norms,
401
+ 0);
402
+
403
+ PhSc(self)->phrase_freq = &ephsc_phrase_freq;
404
+ return self;
405
+ }
406
+
407
+ /***************************************************************************
408
+ * SloppyPhraseScorer
409
+ ***************************************************************************/
410
+
411
+ static bool sphsc_check_repeats(PhPos *pp,
412
+ PhPos **positions,
413
+ const int p_cnt)
414
+ {
415
+ int j;
416
+ for (j = 0; j < p_cnt; j++) {
417
+ PhPos *ppj = positions[j];
418
+ /* If offsets are equal, either we are at the current PhPos +pp+ or
419
+ * +pp+ and +ppj+ are supposed to match in the same position in which
420
+ * case we don't need to check. */
421
+ if (ppj->offset == pp->offset) {
422
+ continue;
423
+ }
424
+ /* the two phrase positions are matching on the same term
425
+ * which we want to avoid */
426
+ if ((ppj->position + ppj->offset) == (pp->position + pp->offset)) {
427
+ if (!pp_next_position(pp)) {
428
+ /* We have no matches for this document */
429
+ return false;
430
+ }
431
+ /* we changed the position so we need to start check again */
432
+ j = -1;
433
+ }
434
+ }
435
+ return true;
436
+ }
437
+
438
+ static float sphsc_phrase_freq(FrtScorer *self)
439
+ {
440
+ PhraseScorer *phsc = PhSc(self);
441
+ PhPos *pp;
442
+ FrtPriorityQueue *pq = frt_pq_new(phsc->pp_cnt, (frt_lt_ft)&pp_less_than, NULL);
443
+ const int pp_cnt = phsc->pp_cnt;
444
+
445
+ int last_pos = 0, pos, next_pos, start, match_length, i;
446
+ bool done = false;
447
+ bool check_repeats = phsc->check_repeats;
448
+ float freq = 0.0f;
449
+
450
+ for (i = 0; i < pp_cnt; i++) {
451
+ bool res;
452
+ pp = phsc->phrase_pos[i];
453
+ /* we should always have at least one position or this functions
454
+ * shouldn't have been called. */
455
+ res = pp_first_position(pp);
456
+ assert(res);(void)res;
457
+ if (check_repeats && (i > 0)) {
458
+ if (!sphsc_check_repeats(pp, phsc->phrase_pos, i)) {
459
+ goto return_freq;
460
+ }
461
+ }
462
+ if (pp->position > last_pos) {
463
+ last_pos = pp->position;
464
+ }
465
+ frt_pq_push(pq, pp);
466
+ }
467
+ int pqsize = pq->size;
468
+ do {
469
+ pqsize--;
470
+ pp = (PhPos *)frt_pq_pop(pq);
471
+ pos = start = pp->position;
472
+ next_pos = PP(frt_pq_top(pq))->position;
473
+ while (pos <= next_pos) {
474
+ start = pos; /* advance pp to min window */
475
+ if (!pp_next_position(pp) || (check_repeats && !sphsc_check_repeats(pp, phsc->phrase_pos, pp_cnt))) {
476
+ done = true;
477
+ break;
478
+ }
479
+ pos = pp->position;
480
+ }
481
+ match_length = last_pos - start;
482
+ if (match_length <= phsc->slop) {
483
+ /* score match */
484
+ freq += frt_sim_sloppy_freq(self->similarity, match_length);
485
+ }
486
+ if (pp->position > last_pos) {
487
+ last_pos = pp->position;
488
+ }
489
+ frt_pq_push(pq, pp); /* restore pq */
490
+ if (pqsize == 0) { done = true; }
491
+ } while (!done);
492
+ return_freq:
493
+
494
+ frt_pq_destroy(pq);
495
+ return freq;
496
+ }
497
+
498
+ static FrtScorer *sloppy_phrase_scorer_new(FrtWeight *weight,
499
+ FrtTermDocEnum **term_pos_enum,
500
+ FrtPhrasePosition *positions,
501
+ int pp_cnt, FrtSimilarity *similarity,
502
+ int slop, frt_uchar *norms)
503
+ {
504
+ FrtScorer *self = phsc_new(weight,
505
+ term_pos_enum,
506
+ positions,
507
+ pp_cnt,
508
+ similarity,
509
+ norms,
510
+ slop);
511
+
512
+ PhSc(self)->phrase_freq = &sphsc_phrase_freq;
513
+ return self;
514
+ }
515
+
516
+ /***************************************************************************
517
+ *
518
+ * PhraseWeight
519
+ *
520
+ ***************************************************************************/
521
+
522
+ static char *phw_to_s(FrtWeight *self)
523
+ {
524
+ return frt_strfmt("PhraseWeight(%f)", self->value);
525
+ }
526
+
527
+ static FrtScorer *phw_scorer(FrtWeight *self, FrtIndexReader *ir)
528
+ {
529
+ int i;
530
+ FrtScorer *phsc = NULL;
531
+ FrtPhraseQuery *phq = PhQ(self->query);
532
+ FrtTermDocEnum **tps;
533
+ FrtPhrasePosition *positions = phq->positions;
534
+ const int pos_cnt = phq->pos_cnt;
535
+ const int field_num = frt_fis_get_field_num(ir->fis, phq->field);
536
+
537
+ if (pos_cnt == 0 || field_num < 0) {
538
+ return NULL;
539
+ }
540
+
541
+ tps = FRT_ALLOC_N(FrtTermDocEnum *, pos_cnt);
542
+
543
+ for (i = 0; i < pos_cnt; i++) {
544
+ char **terms = positions[i].terms;
545
+ const int t_cnt = frt_ary_size(terms);
546
+ if (t_cnt == 1) {
547
+ tps[i] = ir->term_positions(ir);
548
+ tps[i]->seek(tps[i], field_num, terms[0]);
549
+ } else {
550
+ tps[i] = frt_mtdpe_new(ir, field_num, terms, t_cnt);
551
+ }
552
+ /* neither frt_mtdpe_new nor ir->term_positions should return NULL */
553
+ assert(NULL != tps[i]);
554
+ }
555
+
556
+ if (phq->slop == 0) { /* optimize exact (common) case */
557
+ phsc = exact_phrase_scorer_new(self, tps, positions, pos_cnt,
558
+ self->similarity,
559
+ frt_ir_get_norms_i(ir, field_num));
560
+ } else {
561
+ phsc = sloppy_phrase_scorer_new(self, tps, positions, pos_cnt,
562
+ self->similarity, phq->slop,
563
+ frt_ir_get_norms_i(ir, field_num));
564
+ }
565
+ free(tps);
566
+ return phsc;
567
+ }
568
+
569
+ static FrtExplanation *phw_explain(FrtWeight *self, FrtIndexReader *ir, int doc_num)
570
+ {
571
+ FrtExplanation *expl;
572
+ FrtExplanation *idf_expl1;
573
+ FrtExplanation *idf_expl2;
574
+ FrtExplanation *query_expl;
575
+ FrtExplanation *qnorm_expl;
576
+ FrtExplanation *field_expl;
577
+ FrtExplanation *tf_expl;
578
+ FrtScorer *scorer;
579
+ frt_uchar *field_norms;
580
+ float field_norm;
581
+ FrtExplanation *field_norm_expl;
582
+ char *query_str;
583
+ FrtPhraseQuery *phq = PhQ(self->query);
584
+ const int pos_cnt = phq->pos_cnt;
585
+ FrtPhrasePosition *positions = phq->positions;
586
+ int i, j;
587
+ char *doc_freqs = NULL;
588
+ size_t len = 0, pos = 0;
589
+ const int field_num = frt_fis_get_field_num(ir->fis, phq->field);
590
+ const char *field_name = rb_id2name(phq->field);
591
+
592
+ if (field_num < 0) {
593
+ return frt_expl_new(0.0, "field \"%s\" does not exist in the index", field_name);
594
+ }
595
+
596
+ query_str = self->query->to_s(self->query, (FrtSymbol)NULL);
597
+
598
+ expl = frt_expl_new(0.0, "weight(%s in %d), product of:", query_str, doc_num);
599
+
600
+ /* ensure the phrase positions are in order for explanation */
601
+ qsort(positions, pos_cnt, sizeof(FrtPhrasePosition), &phrase_pos_cmp);
602
+
603
+ for (i = 0; i < phq->pos_cnt; i++) {
604
+ char **terms = phq->positions[i].terms;
605
+ for (j = frt_ary_size(terms) - 1; j >= 0; j--) {
606
+ len += strlen(terms[j]) + 30;
607
+ }
608
+ }
609
+ doc_freqs = FRT_ALLOC_N(char, len);
610
+ for (i = 0; i < phq->pos_cnt; i++) {
611
+ char **terms = phq->positions[i].terms;
612
+ const int t_cnt = frt_ary_size(terms);
613
+ for (j = 0; j < t_cnt; j++) {
614
+ char *term = terms[j];
615
+ pos += sprintf(doc_freqs + pos, "%s=%d, ",
616
+ term, ir->doc_freq(ir, field_num, term));
617
+ }
618
+ }
619
+ pos -= 2; /* remove ", " from the end */
620
+ doc_freqs[pos] = 0;
621
+
622
+ idf_expl1 = frt_expl_new(self->idf, "idf(%s:<%s>)", field_name, doc_freqs);
623
+ idf_expl2 = frt_expl_new(self->idf, "idf(%s:<%s>)", field_name, doc_freqs);
624
+ free(doc_freqs);
625
+
626
+ /* explain query weight */
627
+ query_expl = frt_expl_new(0.0, "query_weight(%s), product of:", query_str);
628
+
629
+ if (self->query->boost != 1.0) {
630
+ frt_expl_add_detail(query_expl, frt_expl_new(self->query->boost, "boost"));
631
+ }
632
+ frt_expl_add_detail(query_expl, idf_expl1);
633
+
634
+ qnorm_expl = frt_expl_new(self->qnorm, "query_norm");
635
+ frt_expl_add_detail(query_expl, qnorm_expl);
636
+
637
+ query_expl->value = self->query->boost * self->idf * self->qnorm;
638
+
639
+ frt_expl_add_detail(expl, query_expl);
640
+
641
+ /* explain field weight */
642
+ field_expl = frt_expl_new(0.0, "field_weight(%s in %d), product of:",
643
+ query_str, doc_num);
644
+ free(query_str);
645
+
646
+ scorer = self->scorer(self, ir);
647
+ tf_expl = scorer->explain(scorer, doc_num);
648
+ scorer->destroy(scorer);
649
+ frt_expl_add_detail(field_expl, tf_expl);
650
+ frt_expl_add_detail(field_expl, idf_expl2);
651
+
652
+ field_norms = ir->get_norms(ir, field_num);
653
+ field_norm = (field_norms != NULL)
654
+ ? frt_sim_decode_norm(self->similarity, field_norms[doc_num])
655
+ : (float)0.0;
656
+ field_norm_expl = frt_expl_new(field_norm, "field_norm(field=%s, doc=%d)",
657
+ field_name, doc_num);
658
+
659
+ frt_expl_add_detail(field_expl, field_norm_expl);
660
+
661
+ field_expl->value = tf_expl->value * self->idf * field_norm;
662
+
663
+ /* combine them */
664
+ if (query_expl->value == 1.0) {
665
+ frt_expl_destroy(expl);
666
+ return field_expl;
667
+ }
668
+ else {
669
+ expl->value = (query_expl->value * field_expl->value);
670
+ frt_expl_add_detail(expl, field_expl);
671
+ return expl;
672
+ }
673
+ }
674
+
675
+ static FrtWeight *phw_new(FrtQuery *query, FrtSearcher *searcher)
676
+ {
677
+ FrtWeight *self = w_new(FrtWeight, query);
678
+
679
+ self->scorer = &phw_scorer;
680
+ self->explain = &phw_explain;
681
+ self->to_s = &phw_to_s;
682
+
683
+ self->similarity = query->get_similarity(query, searcher);
684
+ self->value = query->boost;
685
+ self->idf = frt_sim_idf_phrase(self->similarity, PhQ(query)->field,
686
+ PhQ(query)->positions,
687
+ PhQ(query)->pos_cnt, searcher);
688
+ return self;
689
+ }
690
+
691
+ /***************************************************************************
692
+ *
693
+ * PhraseQuery
694
+ *
695
+ ***************************************************************************/
696
+
697
+ /* ** TVPosEnum ** */
698
/*
 * Enumerator over the positions of a term (or a merged set of terms) taken
 * from a term vector.
 *
 * Instances are allocated as sizeof(TVPosEnum) + size * sizeof(int) so that
 * the position list lives directly behind the header.
 */
typedef struct TVPosEnum
{
    int index;       /* index of the current entry in positions; -1 before
                        the first call that advances the enumerator */
    int size;        /* number of entries in positions */
    int offset;      /* value subtracted from a raw position to obtain pos */
    int pos;         /* current position minus offset; -1 when not positioned
                        or exhausted */
    int positions[]; /* raw positions; flexible array member, so indexing up
                        to size - 1 is well defined */
} TVPosEnum;
706
+
707
+ static bool tvpe_next(TVPosEnum *self)
708
+ {
709
+ if (++(self->index) < self->size) {
710
+ self->pos = self->positions[self->index] - self->offset;
711
+ return true;
712
+ }
713
+ else {
714
+ self->pos = -1;
715
+ return false;
716
+ }
717
+ }
718
+
719
+ static int tvpe_skip_to(TVPosEnum *self, int position)
720
+ {
721
+ int i;
722
+ int search_pos = position + self->offset;
723
+ for (i = self->index + 1; i < self->size; i++) {
724
+ if (self->positions[i] >= search_pos) {
725
+ self->pos = self->positions[i] - self->offset;
726
+ break;
727
+ }
728
+ }
729
+ self->index = i;
730
+ if (i == self->size) {
731
+ self->pos = -1;
732
+ return false;
733
+ }
734
+ return true;
735
+ }
736
+
737
+ static bool tvpe_lt(TVPosEnum *tvpe1, TVPosEnum *tvpe2)
738
+ {
739
+ return tvpe1->pos < tvpe2->pos;
740
+ }
741
+
742
+ static TVPosEnum *tvpe_new(int *positions, int size, int offset)
743
+ {
744
+ TVPosEnum *self = (TVPosEnum*)frt_emalloc(sizeof(TVPosEnum) + size*sizeof(int));
745
+ memcpy(self->positions, positions, size * sizeof(int));
746
+ self->size = size;
747
+ self->offset = offset;
748
+ self->index = -1;
749
+ self->pos = -1;
750
+ return self;
751
+ }
752
+
753
+ static TVPosEnum *tvpe_new_merge(char **terms, int t_cnt, FrtTermVector *tv, int offset)
754
+ {
755
+ int i, total_positions = 0;
756
+ FrtPriorityQueue *tvpe_pq = frt_pq_new(t_cnt, (frt_lt_ft)tvpe_lt, &free);
757
+ TVPosEnum *self = NULL;
758
+
759
+ for (i = 0; i < t_cnt; i++) {
760
+ FrtTVTerm *tv_term = frt_tv_get_tv_term(tv, terms[i]);
761
+ if (tv_term) {
762
+ TVPosEnum *tvpe = tvpe_new(tv_term->positions, tv_term->freq, 0);
763
+ /* got tv_term so tvpe_next should always return true once here */
764
+ bool res = tvpe_next(tvpe);
765
+ assert(res);(void)res;
766
+ frt_pq_push(tvpe_pq, tvpe);
767
+ total_positions += tv_term->freq;
768
+ }
769
+ }
770
+ if (tvpe_pq->size == 0) {
771
+ frt_pq_destroy(tvpe_pq);
772
+ } else {
773
+ int index = 0;
774
+ self = (TVPosEnum *)frt_emalloc(sizeof(TVPosEnum)
775
+ + total_positions * sizeof(int));
776
+ self->size = total_positions;
777
+ self->offset = offset;
778
+ self->index = -1;
779
+ self->pos = -1;
780
+ while (tvpe_pq->size > 0) {
781
+ TVPosEnum *top = (TVPosEnum *)frt_pq_top(tvpe_pq);
782
+ self->positions[index++] = top->pos;
783
+ if (!tvpe_next(top)) {
784
+ frt_pq_pop(tvpe_pq);
785
+ free(top);
786
+ } else {
787
+ frt_pq_down(tvpe_pq);
788
+ }
789
+ }
790
+ frt_pq_destroy(tvpe_pq);
791
+ }
792
+ return self;
793
+ }
794
+
795
+ static TVPosEnum *get_tvpe(FrtTermVector *tv, char **terms, int t_cnt, int offset)
796
+ {
797
+ TVPosEnum *tvpe = NULL;
798
+ if (t_cnt == 1) {
799
+ FrtTVTerm *tv_term = frt_tv_get_tv_term(tv, terms[0]);
800
+ if (tv_term) {
801
+ tvpe = tvpe_new(tv_term->positions, tv_term->freq, offset);
802
+ }
803
+ } else {
804
+ tvpe = tvpe_new_merge(terms, t_cnt, tv, offset);
805
+ }
806
+ return tvpe;
807
+ }
808
+
809
+ static FrtMatchVector *phq_get_matchv_i(FrtQuery *self, FrtMatchVector *mv, FrtTermVector *tv)
810
+ {
811
+ if (tv->field == PhQ(self)->field) {
812
+ const int pos_cnt = PhQ(self)->pos_cnt;
813
+ int i;
814
+ int slop = PhQ(self)->slop;
815
+ bool done = false;
816
+
817
+ if (slop > 0) {
818
+ FrtPriorityQueue *tvpe_pq = frt_pq_new(pos_cnt, (frt_lt_ft)tvpe_lt, &free);
819
+ int last_pos = 0;
820
+ for (i = 0; i < pos_cnt; i++) {
821
+ FrtPhrasePosition *pp = &(PhQ(self)->positions[i]);
822
+ const int t_cnt = frt_ary_size(pp->terms);
823
+ TVPosEnum *tvpe = get_tvpe(tv, pp->terms, t_cnt, pp->pos);
824
+ if (tvpe && tvpe_next(tvpe)) {
825
+ if (tvpe->pos > last_pos) {
826
+ last_pos = tvpe->pos;
827
+ }
828
+ frt_pq_push(tvpe_pq, tvpe);
829
+ } else {
830
+ done = true;
831
+ free(tvpe);
832
+ break;
833
+ }
834
+ }
835
+ while (!done) {
836
+ TVPosEnum *tvpe = (TVPosEnum *)frt_pq_pop(tvpe_pq);
837
+ int pos;
838
+ int start = pos = tvpe->pos;
839
+ int next_pos = ((TVPosEnum *)frt_pq_top(tvpe_pq))->pos;
840
+ while (pos <= next_pos) {
841
+ start = pos;
842
+ if (!tvpe_next(tvpe)) {
843
+ done = true;
844
+ break;
845
+ }
846
+ pos = tvpe->pos;
847
+ }
848
+
849
+ if ((last_pos - start) <= slop) {
850
+ int min, max = min = start + tvpe->offset;
851
+ for (i = tvpe_pq->size; i > 0; i--) {
852
+ TVPosEnum *t = (TVPosEnum *)tvpe_pq->heap[i];
853
+ int p = t->pos + t->offset;
854
+ max = p > max ? p : max;
855
+ min = p < min ? p : min;
856
+ }
857
+ frt_matchv_add(mv, min, max);
858
+ }
859
+ if (tvpe->pos > last_pos) {
860
+ last_pos = tvpe->pos;
861
+ }
862
+ frt_pq_push(tvpe_pq, tvpe);
863
+ }
864
+
865
+ frt_pq_destroy(tvpe_pq);
866
+ } else { /* exact match */
867
+ TVPosEnum **tvpe_a = FRT_ALLOC_AND_ZERO_N(TVPosEnum *, pos_cnt);
868
+ TVPosEnum *first, *last;
869
+ int first_index = 0;
870
+ done = false;
871
+ qsort(PhQ(self)->positions, pos_cnt, sizeof(FrtPhrasePosition),
872
+ &phrase_pos_cmp);
873
+ for (i = 0; i < pos_cnt; i++) {
874
+ FrtPhrasePosition *pp = &(PhQ(self)->positions[i]);
875
+ const int t_cnt = frt_ary_size(pp->terms);
876
+ TVPosEnum *tvpe = get_tvpe(tv, pp->terms, t_cnt, pp->pos);
877
+ if (tvpe && ((i == 0 && tvpe_next(tvpe))
878
+ || tvpe_skip_to(tvpe, tvpe_a[i-1]->pos))) {
879
+ tvpe_a[i] = tvpe;
880
+ }
881
+ else {
882
+ done = true;
883
+ free(tvpe);
884
+ break;
885
+ }
886
+ }
887
+
888
+ first = tvpe_a[0];
889
+ last = tvpe_a[pos_cnt - 1];
890
+
891
+ while (!done) {
892
+ while (first->pos < last->pos) {
893
+ if (tvpe_skip_to(first, last->pos)) {
894
+ last = first;
895
+ first_index = FRT_NEXT_NUM(first_index, pos_cnt);
896
+ first = tvpe_a[first_index];
897
+ }
898
+ else {
899
+ done = true;
900
+ break;
901
+ }
902
+ }
903
+ if (!done) {
904
+ frt_matchv_add(mv, tvpe_a[0]->pos + tvpe_a[0]->offset,
905
+ tvpe_a[pos_cnt-1]->pos + tvpe_a[pos_cnt-1]->offset);
906
+ }
907
+ if (!tvpe_next(last)) {
908
+ done = true;
909
+ }
910
+ }
911
+ for (i = 0; i < pos_cnt; i++) {
912
+ free(tvpe_a[i]);
913
+ }
914
+ free(tvpe_a);
915
+ }
916
+ }
917
+ return mv;
918
+ }
919
+
920
+
921
+ /* ** PhraseQuery besides highlighting stuff ** */
922
+
923
/* Number of phrase position slots a new phrase query starts with
 * (see frt_phq_new); frt_phq_add_term_abs doubles the capacity on demand. */
#define PhQ_INIT_CAPA 4
924
+
925
+ static void phq_extract_terms(FrtQuery *self, FrtHashSet *term_set)
926
+ {
927
+ FrtPhraseQuery *phq = PhQ(self);
928
+ int i, j;
929
+ for (i = 0; i < phq->pos_cnt; i++) {
930
+ char **terms = phq->positions[i].terms;
931
+ for (j = frt_ary_size(terms) - 1; j >= 0; j--) {
932
+ frt_hs_add(term_set, frt_term_new(phq->field, terms[j]));
933
+ }
934
+ }
935
+ }
936
+
937
+ static char *phq_to_s(FrtQuery *self, FrtSymbol default_field)
938
+ {
939
+ FrtPhraseQuery *phq = PhQ(self);
940
+ const int pos_cnt = phq->pos_cnt;
941
+ FrtPhrasePosition *positions = phq->positions;
942
+ const char *field_name = rb_id2name(phq->field);
943
+ int flen = 0;
944
+ if (field_name) {
945
+ flen = strlen(field_name);
946
+ } else {
947
+ field_name = "";
948
+ flen = 0;
949
+ }
950
+ int i, j, buf_index = 0, pos, last_pos;
951
+ size_t len = 0;
952
+ char *buffer;
953
+
954
+ if (phq->pos_cnt == 0) {
955
+ if (default_field != phq->field) {
956
+ return frt_strfmt("%s:\"\"", field_name);
957
+ }
958
+ else {
959
+ return frt_estrdup("\"\"");
960
+ }
961
+ }
962
+
963
+ /* sort the phrase positions by position */
964
+ qsort(positions, pos_cnt, sizeof(FrtPhrasePosition), &phrase_pos_cmp);
965
+
966
+ len = flen + 1;
967
+
968
+ for (i = 0; i < pos_cnt; i++) {
969
+ char **terms = phq->positions[i].terms;
970
+ for (j = frt_ary_size(terms) - 1; j >= 0; j--) {
971
+ len += strlen(terms[j]) + 5;
972
+ }
973
+ }
974
+
975
+ /* add space for extra <> characters and boost and slop */
976
+ len += 100 + 3
977
+ * (phq->positions[phq->pos_cnt - 1].pos - phq->positions[0].pos);
978
+
979
+ buffer = FRT_ALLOC_N(char, len);
980
+
981
+ if (default_field != phq->field) {
982
+ memcpy(buffer, field_name, flen);
983
+ buffer[flen] = ':';
984
+ buf_index += flen + 1;
985
+ }
986
+
987
+ buffer[buf_index++] = '"';
988
+
989
+ last_pos = positions[0].pos - 1;
990
+ for (i = 0; i < pos_cnt; i++) {
991
+ char **terms = positions[i].terms;
992
+ const int t_cnt = frt_ary_size(terms);
993
+
994
+ pos = positions[i].pos;
995
+ if (pos == last_pos) {
996
+ buffer[buf_index - 1] = '&';
997
+ }
998
+ else {
999
+ for (j = last_pos; j < pos - 1; j++) {
1000
+ memcpy(buffer + buf_index, "<> ", 3);
1001
+ buf_index += 3;
1002
+ }
1003
+ }
1004
+
1005
+ last_pos = pos;
1006
+ for (j = 0; j < t_cnt; j++) {
1007
+ char *term = terms[j];
1008
+ len = strlen(term);
1009
+ memcpy(buffer + buf_index, term, len);
1010
+ buf_index += len;
1011
+ buffer[buf_index++] = '|';
1012
+ }
1013
+ buffer[buf_index-1] = ' '; /* change last '|' to ' ' */
1014
+ }
1015
+
1016
+ if (buffer[buf_index-1] == ' ') {
1017
+ buf_index--;
1018
+ }
1019
+
1020
+ buffer[buf_index++] = '"';
1021
+ buffer[buf_index] = 0;
1022
+
1023
+ if (phq->slop != 0) {
1024
+ buf_index += sprintf(buffer + buf_index, "~%d", phq->slop);
1025
+ }
1026
+
1027
+ if (self->boost != 1.0) {
1028
+ buffer[buf_index++] = '^';
1029
+ frt_dbl_to_s(buffer + buf_index, self->boost);
1030
+ }
1031
+
1032
+ return buffer;
1033
+ }
1034
+
1035
+ static void phq_destroy(FrtQuery *self)
1036
+ {
1037
+ FrtPhraseQuery *phq = PhQ(self);
1038
+ int i;
1039
+ for (i = 0; i < phq->pos_cnt; i++) {
1040
+ frt_ary_destroy(phq->positions[i].terms, &free);
1041
+ }
1042
+ free(phq->positions);
1043
+ frt_q_destroy_i(self);
1044
+ }
1045
+
1046
+ static FrtQuery *phq_rewrite(FrtQuery *self, FrtIndexReader *ir)
1047
+ {
1048
+ FrtPhraseQuery *phq = PhQ(self);
1049
+ (void)ir;
1050
+ if (phq->pos_cnt == 1) {
1051
+ /* optimize one-position case */
1052
+ char **terms = phq->positions[0].terms;
1053
+ const int t_cnt = frt_ary_size(terms);
1054
+ if (t_cnt == 1) {
1055
+ FrtQuery *tq = frt_tq_new(phq->field, terms[0]);
1056
+ tq->boost = self->boost;
1057
+ return tq;
1058
+ }
1059
+ else {
1060
+ FrtQuery *q = frt_multi_tq_new(phq->field);
1061
+ int i;
1062
+ for (i = 0; i < t_cnt; i++) {
1063
+ frt_multi_tq_add_term(q, terms[i]);
1064
+ }
1065
+ q->boost = self->boost;
1066
+ return q;
1067
+ }
1068
+ } else {
1069
+ self->ref_cnt++;
1070
+ return self;
1071
+ }
1072
+ }
1073
+
1074
+ static unsigned long long phq_hash(FrtQuery *self)
1075
+ {
1076
+ int i, j;
1077
+ FrtPhraseQuery *phq = PhQ(self);
1078
+ unsigned long long hash = frt_str_hash(rb_id2name(phq->field));
1079
+ for (i = 0; i < phq->pos_cnt; i++) {
1080
+ char **terms = phq->positions[i].terms;
1081
+ for (j = frt_ary_size(terms) - 1; j >= 0; j--) {
1082
+ hash = (hash << 1) ^ (frt_str_hash(terms[j])
1083
+ ^ phq->positions[i].pos);
1084
+ }
1085
+ }
1086
+ return (hash ^ phq->slop);
1087
+ }
1088
+
1089
+ static int phq_eq(FrtQuery *self, FrtQuery *o)
1090
+ {
1091
+ int i, j;
1092
+ FrtPhraseQuery *phq1 = PhQ(self);
1093
+ FrtPhraseQuery *phq2 = PhQ(o);
1094
+ if (phq1->slop != phq2->slop
1095
+ || phq1->field != phq2->field
1096
+ || phq1->pos_cnt != phq2->pos_cnt) {
1097
+ return false;
1098
+ }
1099
+ for (i = 0; i < phq1->pos_cnt; i++) {
1100
+ char **terms1 = phq1->positions[i].terms;
1101
+ char **terms2 = phq2->positions[i].terms;
1102
+ const int t_cnt = frt_ary_size(terms1);
1103
+ if (t_cnt != frt_ary_size(terms2)
1104
+ || phq1->positions[i].pos != phq2->positions[i].pos) {
1105
+ return false;
1106
+ }
1107
+ for (j = 0; j < t_cnt; j++) {
1108
+ if (strcmp(terms1[j], terms2[j]) != 0) {
1109
+ return false;
1110
+ }
1111
+ }
1112
+ }
1113
+ return true;
1114
+ }
1115
+
1116
+ FrtQuery *frt_phq_new(FrtSymbol field)
1117
+ {
1118
+ FrtQuery *self = frt_q_new(FrtPhraseQuery);
1119
+
1120
+ PhQ(self)->field = field;
1121
+ PhQ(self)->pos_cnt = 0;
1122
+ PhQ(self)->pos_capa = PhQ_INIT_CAPA;
1123
+ PhQ(self)->positions = FRT_ALLOC_N(FrtPhrasePosition, PhQ_INIT_CAPA);
1124
+
1125
+ self->type = PHRASE_QUERY;
1126
+ self->rewrite = &phq_rewrite;
1127
+ self->extract_terms = &phq_extract_terms;
1128
+ self->to_s = &phq_to_s;
1129
+ self->hash = &phq_hash;
1130
+ self->eq = &phq_eq;
1131
+ self->destroy_i = &phq_destroy;
1132
+ self->create_weight_i = &phw_new;
1133
+ self->get_matchv_i = &phq_get_matchv_i;
1134
+ return self;
1135
+ }
1136
+
1137
+ void frt_phq_add_term_abs(FrtQuery *self, const char *term, int position)
1138
+ {
1139
+ FrtPhraseQuery *phq = PhQ(self);
1140
+ int index = phq->pos_cnt;
1141
+ FrtPhrasePosition *pp;
1142
+ if (index >= phq->pos_capa) {
1143
+ phq->pos_capa <<= 1;
1144
+ FRT_REALLOC_N(phq->positions, FrtPhrasePosition, phq->pos_capa);
1145
+ }
1146
+ pp = &(phq->positions[index]);
1147
+ pp->terms = frt_ary_new_type_capa(char *, 2);
1148
+ frt_ary_push(pp->terms, frt_estrdup(term));
1149
+ pp->pos = position;
1150
+ phq->pos_cnt++;
1151
+ }
1152
+
1153
+ void frt_phq_add_term(FrtQuery *self, const char *term, int pos_inc)
1154
+ {
1155
+ FrtPhraseQuery *phq = PhQ(self);
1156
+ int position;
1157
+ if (phq->pos_cnt == 0) {
1158
+ position = 0;
1159
+ }
1160
+ else {
1161
+ position = phq->positions[phq->pos_cnt - 1].pos + pos_inc;
1162
+ }
1163
+ frt_phq_add_term_abs(self, term, position);
1164
+ }
1165
+
1166
+ void frt_phq_append_multi_term(FrtQuery *self, const char *term)
1167
+ {
1168
+ FrtPhraseQuery *phq = PhQ(self);
1169
+ int index = phq->pos_cnt - 1;
1170
+
1171
+ if (index < 0) {
1172
+ frt_phq_add_term(self, term, 0);
1173
+ }
1174
+ else {
1175
+ frt_ary_push(phq->positions[index].terms, frt_estrdup(term));
1176
+ }
1177
+ }
1178
+
1179
+ void frt_phq_set_slop(FrtQuery *self, int slop)
1180
+ {
1181
+ PhQ(self)->slop = slop;
1182
+ }