isomorfeus-ferret 0.12.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (222) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +612 -0
  3. data/README.md +44 -0
  4. data/ext/isomorfeus_ferret_ext/benchmark.c +223 -0
  5. data/ext/isomorfeus_ferret_ext/benchmark.h +45 -0
  6. data/ext/isomorfeus_ferret_ext/benchmarks_all.h +25 -0
  7. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +123 -0
  8. data/ext/isomorfeus_ferret_ext/bm_hash.c +118 -0
  9. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +40 -0
  10. data/ext/isomorfeus_ferret_ext/bm_store.c +93 -0
  11. data/ext/isomorfeus_ferret_ext/email.rl +21 -0
  12. data/ext/isomorfeus_ferret_ext/extconf.rb +5 -0
  13. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -0
  14. data/ext/isomorfeus_ferret_ext/frb_analysis.c +2577 -0
  15. data/ext/isomorfeus_ferret_ext/frb_index.c +3457 -0
  16. data/ext/isomorfeus_ferret_ext/frb_lang.c +9 -0
  17. data/ext/isomorfeus_ferret_ext/frb_lang.h +17 -0
  18. data/ext/isomorfeus_ferret_ext/frb_qparser.c +629 -0
  19. data/ext/isomorfeus_ferret_ext/frb_search.c +4460 -0
  20. data/ext/isomorfeus_ferret_ext/frb_store.c +515 -0
  21. data/ext/isomorfeus_ferret_ext/frb_threading.h +30 -0
  22. data/ext/isomorfeus_ferret_ext/frb_utils.c +1127 -0
  23. data/ext/isomorfeus_ferret_ext/frt_analysis.c +1644 -0
  24. data/ext/isomorfeus_ferret_ext/frt_analysis.h +247 -0
  25. data/ext/isomorfeus_ferret_ext/frt_array.c +124 -0
  26. data/ext/isomorfeus_ferret_ext/frt_array.h +54 -0
  27. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +95 -0
  28. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +586 -0
  29. data/ext/isomorfeus_ferret_ext/frt_compound_io.c +374 -0
  30. data/ext/isomorfeus_ferret_ext/frt_config.h +44 -0
  31. data/ext/isomorfeus_ferret_ext/frt_document.c +134 -0
  32. data/ext/isomorfeus_ferret_ext/frt_document.h +52 -0
  33. data/ext/isomorfeus_ferret_ext/frt_except.c +95 -0
  34. data/ext/isomorfeus_ferret_ext/frt_except.h +188 -0
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +233 -0
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +42 -0
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +157 -0
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +502 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +427 -0
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +290 -0
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +518 -0
  42. data/ext/isomorfeus_ferret_ext/frt_hash.h +466 -0
  43. data/ext/isomorfeus_ferret_ext/frt_hashset.c +191 -0
  44. data/ext/isomorfeus_ferret_ext/frt_hashset.h +206 -0
  45. data/ext/isomorfeus_ferret_ext/frt_helper.c +62 -0
  46. data/ext/isomorfeus_ferret_ext/frt_helper.h +13 -0
  47. data/ext/isomorfeus_ferret_ext/frt_ind.c +353 -0
  48. data/ext/isomorfeus_ferret_ext/frt_ind.h +54 -0
  49. data/ext/isomorfeus_ferret_ext/frt_index.c +6377 -0
  50. data/ext/isomorfeus_ferret_ext/frt_index.h +880 -0
  51. data/ext/isomorfeus_ferret_ext/frt_lang.c +104 -0
  52. data/ext/isomorfeus_ferret_ext/frt_lang.h +44 -0
  53. data/ext/isomorfeus_ferret_ext/frt_mempool.c +87 -0
  54. data/ext/isomorfeus_ferret_ext/frt_mempool.h +33 -0
  55. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +349 -0
  56. data/ext/isomorfeus_ferret_ext/frt_multimapper.h +52 -0
  57. data/ext/isomorfeus_ferret_ext/frt_posh.c +1006 -0
  58. data/ext/isomorfeus_ferret_ext/frt_posh.h +973 -0
  59. data/ext/isomorfeus_ferret_ext/frt_priorityqueue.c +147 -0
  60. data/ext/isomorfeus_ferret_ext/frt_priorityqueue.h +147 -0
  61. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +1612 -0
  62. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +157 -0
  63. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +209 -0
  64. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +281 -0
  65. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +147 -0
  66. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +672 -0
  67. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +3084 -0
  68. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +1182 -0
  69. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +98 -0
  70. data/ext/isomorfeus_ferret_ext/frt_q_range.c +665 -0
  71. data/ext/isomorfeus_ferret_ext/frt_q_span.c +2386 -0
  72. data/ext/isomorfeus_ferret_ext/frt_q_term.c +311 -0
  73. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +166 -0
  74. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +460 -0
  75. data/ext/isomorfeus_ferret_ext/frt_scanner.c +899 -0
  76. data/ext/isomorfeus_ferret_ext/frt_scanner.h +28 -0
  77. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +6705 -0
  78. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +4419 -0
  79. data/ext/isomorfeus_ferret_ext/frt_search.c +1824 -0
  80. data/ext/isomorfeus_ferret_ext/frt_search.h +924 -0
  81. data/ext/isomorfeus_ferret_ext/frt_similarity.c +150 -0
  82. data/ext/isomorfeus_ferret_ext/frt_similarity.h +79 -0
  83. data/ext/isomorfeus_ferret_ext/frt_sort.c +796 -0
  84. data/ext/isomorfeus_ferret_ext/frt_stopwords.c +395 -0
  85. data/ext/isomorfeus_ferret_ext/frt_store.c +680 -0
  86. data/ext/isomorfeus_ferret_ext/frt_store.h +789 -0
  87. data/ext/isomorfeus_ferret_ext/frt_term_vectors.c +72 -0
  88. data/ext/isomorfeus_ferret_ext/frt_threading.h +23 -0
  89. data/ext/isomorfeus_ferret_ext/frt_win32.h +54 -0
  90. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +409 -0
  91. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +95 -0
  92. data/ext/isomorfeus_ferret_ext/libstemmer.c +93 -0
  93. data/ext/isomorfeus_ferret_ext/libstemmer.h +73 -0
  94. data/ext/isomorfeus_ferret_ext/q_parser.y +1366 -0
  95. data/ext/isomorfeus_ferret_ext/scanner.h +28 -0
  96. data/ext/isomorfeus_ferret_ext/scanner.in +43 -0
  97. data/ext/isomorfeus_ferret_ext/scanner.rl +84 -0
  98. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +200 -0
  99. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +85 -0
  100. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +324 -0
  101. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +7 -0
  102. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +610 -0
  103. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +6 -0
  104. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +1104 -0
  105. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +6 -0
  106. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +749 -0
  107. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +7 -0
  108. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +1233 -0
  109. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +6 -0
  110. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +490 -0
  111. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +6 -0
  112. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +1217 -0
  113. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +7 -0
  114. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +1052 -0
  115. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +6 -0
  116. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +283 -0
  117. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +6 -0
  118. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +735 -0
  119. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +6 -0
  120. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +1003 -0
  121. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +7 -0
  122. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +1079 -0
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +6 -0
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +293 -0
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +6 -0
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +984 -0
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +6 -0
  128. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +686 -0
  129. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +6 -0
  130. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.c +325 -0
  131. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.h +6 -0
  132. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.c +620 -0
  133. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.h +6 -0
  134. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.c +1111 -0
  135. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.h +6 -0
  136. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.c +754 -0
  137. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.h +6 -0
  138. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.c +1242 -0
  139. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.h +6 -0
  140. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.c +495 -0
  141. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.h +6 -0
  142. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.c +1220 -0
  143. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.h +6 -0
  144. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.c +1059 -0
  145. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.h +6 -0
  146. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.c +285 -0
  147. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.h +6 -0
  148. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.c +741 -0
  149. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.h +6 -0
  150. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.c +1009 -0
  151. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.h +6 -0
  152. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.c +990 -0
  153. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.h +6 -0
  154. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.c +680 -0
  155. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.h +6 -0
  156. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.c +1083 -0
  157. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.h +6 -0
  158. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.c +294 -0
  159. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.h +6 -0
  160. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.c +2191 -0
  161. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.h +6 -0
  162. data/ext/isomorfeus_ferret_ext/stem_api.c +66 -0
  163. data/ext/isomorfeus_ferret_ext/stem_api.h +26 -0
  164. data/ext/isomorfeus_ferret_ext/stem_header.h +57 -0
  165. data/ext/isomorfeus_ferret_ext/stem_modules.h +190 -0
  166. data/ext/isomorfeus_ferret_ext/stem_modules.txt +50 -0
  167. data/ext/isomorfeus_ferret_ext/stem_utilities.c +478 -0
  168. data/ext/isomorfeus_ferret_ext/test.c +850 -0
  169. data/ext/isomorfeus_ferret_ext/test.h +416 -0
  170. data/ext/isomorfeus_ferret_ext/test_1710.c +63 -0
  171. data/ext/isomorfeus_ferret_ext/test_analysis.c +1221 -0
  172. data/ext/isomorfeus_ferret_ext/test_array.c +272 -0
  173. data/ext/isomorfeus_ferret_ext/test_bitvector.c +600 -0
  174. data/ext/isomorfeus_ferret_ext/test_compound_io.c +170 -0
  175. data/ext/isomorfeus_ferret_ext/test_document.c +156 -0
  176. data/ext/isomorfeus_ferret_ext/test_except.c +244 -0
  177. data/ext/isomorfeus_ferret_ext/test_fields.c +522 -0
  178. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +185 -0
  179. data/ext/isomorfeus_ferret_ext/test_filter.c +331 -0
  180. data/ext/isomorfeus_ferret_ext/test_fs_store.c +25 -0
  181. data/ext/isomorfeus_ferret_ext/test_global.c +299 -0
  182. data/ext/isomorfeus_ferret_ext/test_hash.c +485 -0
  183. data/ext/isomorfeus_ferret_ext/test_hashset.c +288 -0
  184. data/ext/isomorfeus_ferret_ext/test_helper.c +47 -0
  185. data/ext/isomorfeus_ferret_ext/test_highlighter.c +548 -0
  186. data/ext/isomorfeus_ferret_ext/test_index.c +2323 -0
  187. data/ext/isomorfeus_ferret_ext/test_lang.c +74 -0
  188. data/ext/isomorfeus_ferret_ext/test_mempool.c +102 -0
  189. data/ext/isomorfeus_ferret_ext/test_multimapper.c +64 -0
  190. data/ext/isomorfeus_ferret_ext/test_priorityqueue.c +213 -0
  191. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +84 -0
  192. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +61 -0
  193. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +241 -0
  194. data/ext/isomorfeus_ferret_ext/test_q_parser.c +464 -0
  195. data/ext/isomorfeus_ferret_ext/test_q_span.c +575 -0
  196. data/ext/isomorfeus_ferret_ext/test_ram_store.c +77 -0
  197. data/ext/isomorfeus_ferret_ext/test_search.c +1874 -0
  198. data/ext/isomorfeus_ferret_ext/test_segments.c +167 -0
  199. data/ext/isomorfeus_ferret_ext/test_similarity.c +25 -0
  200. data/ext/isomorfeus_ferret_ext/test_sort.c +333 -0
  201. data/ext/isomorfeus_ferret_ext/test_store.c +591 -0
  202. data/ext/isomorfeus_ferret_ext/test_store.h +3 -0
  203. data/ext/isomorfeus_ferret_ext/test_term.c +351 -0
  204. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +373 -0
  205. data/ext/isomorfeus_ferret_ext/test_test.c +83 -0
  206. data/ext/isomorfeus_ferret_ext/test_threading.c +188 -0
  207. data/ext/isomorfeus_ferret_ext/testhelper.c +561 -0
  208. data/ext/isomorfeus_ferret_ext/testhelper.h +25 -0
  209. data/ext/isomorfeus_ferret_ext/tests_all.h +87 -0
  210. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +1854 -0
  211. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +1999 -0
  212. data/ext/isomorfeus_ferret_ext/url.rl +27 -0
  213. data/ext/isomorfeus_ferret_ext/word_list.h +15156 -0
  214. data/lib/isomorfeus/ferret/document.rb +132 -0
  215. data/lib/isomorfeus/ferret/field_symbol.rb +85 -0
  216. data/lib/isomorfeus/ferret/index/field_infos.rb +48 -0
  217. data/lib/isomorfeus/ferret/index/index.rb +970 -0
  218. data/lib/isomorfeus/ferret/monitor.rb +323 -0
  219. data/lib/isomorfeus/ferret/stdlib_patches.rb +151 -0
  220. data/lib/isomorfeus/ferret/version.rb +5 -0
  221. data/lib/isomorfeus-ferret.rb +8 -0
  222. metadata +307 -0
@@ -0,0 +1,672 @@
1
+ #include <string.h>
2
+ #include "frt_global.h"
3
+ #include "frt_search.h"
4
+ #include "frt_helper.h"
5
+
6
+ #define MTQ(query) ((FrtMultiTermQuery *)(query))
7
+
8
+ /***************************************************************************
9
+ *
10
+ * MultiTerm
11
+ *
12
+ ***************************************************************************/
13
+
14
+ /***************************************************************************
15
+ * BoostedTerm
16
+ ***************************************************************************/
17
+
18
/* A single query term together with the boost applied to its score. */
typedef struct BoostedTerm {
    char *term;  /* term text; released by boosted_term_destroy() */
    float boost; /* multiplier for this term's contribution */
} BoostedTerm;
23
+
24
+ static bool boosted_term_less_than(const BoostedTerm *bt1, const BoostedTerm *bt2)
25
+ {
26
+ // if (bt1->boost == bt2->boost) { return (strcmp(bt1->term, bt2->term) < 0); }
27
+ // return (bt1->boost < bt2->boost);
28
+ if (bt1->boost < bt2->boost) {
29
+ return true;
30
+ }
31
+ if (bt1->boost > bt2->boost) {
32
+ return false;
33
+ }
34
+ return (strcmp(bt1->term, bt2->term) < 0);
35
+ }
36
+
37
+ static void boosted_term_destroy(BoostedTerm *self)
38
+ {
39
+ free(self->term);
40
+ free(self);
41
+ }
42
+
43
+ static BoostedTerm *boosted_term_new(const char *term, float boost)
44
+ {
45
+ BoostedTerm *self = FRT_ALLOC(BoostedTerm);
46
+ self->term = frt_estrdup(term);
47
+ self->boost = boost;
48
+ return self;
49
+ }
50
+
51
+ /***************************************************************************
52
+ * TermDocEnumWrapper
53
+ ***************************************************************************/
54
+
55
+ #define TDE_READ_SIZE 16
56
+
57
+ typedef struct TermDocEnumWrapper
58
+ {
59
+ const char *term;
60
+ FrtTermDocEnum *tde;
61
+ float boost;
62
+ int doc;
63
+ int freq;
64
+ int docs[TDE_READ_SIZE];
65
+ int freqs[TDE_READ_SIZE];
66
+ int pointer;
67
+ int pointer_max;
68
+ } TermDocEnumWrapper;
69
+
70
+ static bool tdew_less_than(const TermDocEnumWrapper *tdew1,
71
+ const TermDocEnumWrapper *tdew2)
72
+ {
73
+ return (tdew1->doc < tdew2->doc);
74
+ }
75
+
76
+ static bool tdew_next(TermDocEnumWrapper *self)
77
+ {
78
+ self->pointer++;
79
+ if (self->pointer >= self->pointer_max) {
80
+ /* refill buffer */
81
+ self->pointer_max = self->tde->read(self->tde, self->docs, self->freqs, TDE_READ_SIZE);
82
+ if (self->pointer_max != 0) {
83
+ self->pointer = 0;
84
+ }
85
+ else {
86
+ return false;
87
+ }
88
+ }
89
+ self->doc = self->docs[self->pointer];
90
+ self->freq = self->freqs[self->pointer];
91
+ return true;
92
+ }
93
+
94
+ static bool tdew_skip_to(TermDocEnumWrapper *self, int doc_num)
95
+ {
96
+ FrtTermDocEnum *tde = self->tde;
97
+
98
+ while (++(self->pointer) < self->pointer_max) {
99
+ if (self->docs[self->pointer] >= doc_num) {
100
+ self->doc = self->docs[self->pointer];
101
+ self->freq = self->freqs[self->pointer];
102
+ return true;
103
+ }
104
+ }
105
+
106
+ /* not found in cache, seek underlying stream */
107
+ if (tde->skip_to(tde, doc_num)) {
108
+ self->pointer_max = 1;
109
+ self->pointer = 0;
110
+ self->docs[0] = self->doc = tde->doc_num(tde);
111
+ self->freqs[0] = self->freq = tde->freq(tde);
112
+ return true;
113
+ } else {
114
+ return false;
115
+ }
116
+ }
117
+
118
+ static void tdew_destroy(TermDocEnumWrapper *self)
119
+ {
120
+ self->tde->close(self->tde);
121
+ free(self);
122
+ }
123
+
124
+ static TermDocEnumWrapper *tdew_new(const char *term, FrtTermDocEnum *tde,
125
+ float boost)
126
+ {
127
+ TermDocEnumWrapper *self = FRT_ALLOC_AND_ZERO(TermDocEnumWrapper);
128
+ self->term = term;
129
+ self->tde = tde;
130
+ self->boost = boost;
131
+ self->doc = -1;
132
+ return self;
133
+ }
134
+
135
+ /***************************************************************************
136
+ * MultiTermScorer
137
+ ***************************************************************************/
138
+
139
+ #define SCORE_CACHE_SIZE 32
140
+ #define MTSc(scorer) ((MultiTermScorer *)(scorer))
141
+
142
+ typedef struct MultiTermScorer
143
+ {
144
+ FrtScorer super;
145
+ FrtSymbol field;
146
+ frt_uchar *norms;
147
+ FrtWeight *weight;
148
+ TermDocEnumWrapper **tdew_a;
149
+ int tdew_cnt;
150
+ FrtPriorityQueue *tdew_pq;
151
+ float weight_value;
152
+ float score_cache[SCORE_CACHE_SIZE];
153
+ float total_score;
154
+ } MultiTermScorer;
155
+
156
+ static float multi_tsc_score(FrtScorer *self)
157
+ {
158
+ return MTSc(self)->total_score * MTSc(self)->weight_value
159
+ * frt_sim_decode_norm(self->similarity, MTSc(self)->norms[self->doc]);
160
+ }
161
+
162
+ static bool multi_tsc_next(FrtScorer *self)
163
+ {
164
+ int curr_doc;
165
+ float total_score = 0.0f;
166
+ TermDocEnumWrapper *tdew;
167
+ MultiTermScorer *mtsc = MTSc(self);
168
+ FrtPriorityQueue *tdew_pq = mtsc->tdew_pq;
169
+ if (tdew_pq == NULL) {
170
+ TermDocEnumWrapper **tdew_a = mtsc->tdew_a;
171
+ int i;
172
+ tdew_pq = frt_pq_new(mtsc->tdew_cnt, (frt_lt_ft)tdew_less_than, (frt_free_ft)NULL);
173
+ for (i = mtsc->tdew_cnt - 1; i >= 0; i--) {
174
+ if (tdew_next(tdew_a[i])) {
175
+ frt_pq_push(tdew_pq, tdew_a[i]);
176
+ }
177
+ }
178
+ mtsc->tdew_pq = tdew_pq;
179
+ }
180
+
181
+ tdew = (TermDocEnumWrapper *)frt_pq_top(tdew_pq);
182
+ if (tdew == NULL) {
183
+ return false;
184
+ }
185
+
186
+ self->doc = curr_doc = tdew->doc;
187
+ do {
188
+ int freq = tdew->freq;
189
+ if (freq < SCORE_CACHE_SIZE) {
190
+ total_score += mtsc->score_cache[freq] * tdew->boost;
191
+ }
192
+ else {
193
+ total_score += frt_sim_tf(self->similarity, (float)freq) * tdew->boost;
194
+ }
195
+
196
+ if (tdew_next(tdew)) {
197
+ frt_pq_down(tdew_pq);
198
+ }
199
+ else {
200
+ frt_pq_pop(tdew_pq);
201
+ }
202
+
203
+ } while (((tdew = (TermDocEnumWrapper *)frt_pq_top(tdew_pq)) != NULL)
204
+ && tdew->doc == curr_doc);
205
+ mtsc->total_score = total_score;
206
+ return true;
207
+ }
208
+
209
+ static bool multi_tsc_advance_to(FrtScorer *self, int target_doc_num)
210
+ {
211
+ FrtPriorityQueue *tdew_pq = MTSc(self)->tdew_pq;
212
+ TermDocEnumWrapper *tdew;
213
+ if (tdew_pq == NULL) {
214
+ MultiTermScorer *mtsc = MTSc(self);
215
+ TermDocEnumWrapper **tdew_a = mtsc->tdew_a;
216
+ int i;
217
+ tdew_pq = frt_pq_new(mtsc->tdew_cnt, (frt_lt_ft)tdew_less_than, (frt_free_ft)NULL);
218
+ for (i = mtsc->tdew_cnt - 1; i >= 0; i--) {
219
+ if (tdew_skip_to(tdew_a[i], target_doc_num)) {
220
+ frt_pq_push(tdew_pq, tdew_a[i]);
221
+ }
222
+ }
223
+ MTSc(self)->tdew_pq = tdew_pq;
224
+ }
225
+ if (tdew_pq->size == 0) {
226
+ self->doc = -1;
227
+ return false;
228
+ }
229
+ while ((tdew = (TermDocEnumWrapper *)frt_pq_top(tdew_pq)) != NULL
230
+ && (target_doc_num > tdew->doc)) {
231
+ if (tdew_skip_to(tdew, target_doc_num)) {
232
+ frt_pq_down(tdew_pq);
233
+ }
234
+ else {
235
+ frt_pq_pop(tdew_pq);
236
+ }
237
+ }
238
+ tdew = (TermDocEnumWrapper *)frt_pq_top(tdew_pq);
239
+ return (frt_pq_top(tdew_pq) == NULL) ? false : true;
240
+ }
241
+
242
+ static bool multi_tsc_skip_to(FrtScorer *self, int target_doc_num)
243
+ {
244
+ return multi_tsc_advance_to(self, target_doc_num) && multi_tsc_next(self);
245
+ }
246
+
247
+ static FrtExplanation *multi_tsc_explain(FrtScorer *self, int doc_num)
248
+ {
249
+ MultiTermScorer *mtsc = MTSc(self);
250
+ TermDocEnumWrapper *tdew;
251
+
252
+ if (multi_tsc_advance_to(self, doc_num) &&
253
+ (tdew = (TermDocEnumWrapper *)frt_pq_top(mtsc->tdew_pq))->doc == doc_num) {
254
+
255
+ FrtPriorityQueue *tdew_pq = MTSc(self)->tdew_pq;
256
+ FrtExplanation *expl = frt_expl_new(0.0f, "The sum of:");
257
+ int curr_doc = self->doc = tdew->doc;
258
+ float total_score = 0.0f;
259
+
260
+ do {
261
+ int freq = tdew->freq;
262
+ frt_expl_add_detail(expl,
263
+ frt_expl_new(frt_sim_tf(self->similarity, (float)freq) * tdew->boost,
264
+ "tf(term_freq(%s:%s)=%d)^%f",
265
+ rb_id2name(mtsc->field), tdew->term, freq, tdew->boost));
266
+
267
+ total_score += frt_sim_tf(self->similarity, (float)freq) * tdew->boost;
268
+
269
+ /* maintain tdew queue, even though it probably won't get used
270
+ * again */
271
+ if (tdew_next(tdew)) {
272
+ frt_pq_down(tdew_pq);
273
+ }
274
+ else {
275
+ frt_pq_pop(tdew_pq);
276
+ }
277
+
278
+ } while (((tdew = (TermDocEnumWrapper *)frt_pq_top(tdew_pq)) != NULL)
279
+ && tdew->doc == curr_doc);
280
+ expl->value = total_score;
281
+ return expl;
282
+ }
283
+ else {
284
+ return frt_expl_new(0.0f, "None of the required terms exist in the index");
285
+ }
286
+ }
287
+
288
+ static void multi_tsc_destroy(FrtScorer *self)
289
+ {
290
+ int i;
291
+ TermDocEnumWrapper **tdew_a = MTSc(self)->tdew_a;
292
+ for (i = MTSc(self)->tdew_cnt - 1; i >= 0; i--) {
293
+ tdew_destroy(tdew_a[i]);
294
+ }
295
+ free(tdew_a);
296
+ if (MTSc(self)->tdew_pq) frt_pq_destroy(MTSc(self)->tdew_pq);
297
+ frt_scorer_destroy_i(self);
298
+ }
299
+
300
+ static FrtScorer *multi_tsc_new(FrtWeight *weight, FrtSymbol field,
301
+ TermDocEnumWrapper **tdew_a, int tdew_cnt,
302
+ frt_uchar *norms)
303
+ {
304
+ int i;
305
+ FrtScorer *self = frt_scorer_new(MultiTermScorer, weight->similarity);
306
+
307
+ MTSc(self)->weight = weight;
308
+ MTSc(self)->field = field;
309
+ MTSc(self)->weight_value = weight->value;
310
+ MTSc(self)->tdew_a = tdew_a;
311
+ MTSc(self)->tdew_cnt = tdew_cnt;
312
+ MTSc(self)->norms = norms;
313
+
314
+ for (i = 0; i < SCORE_CACHE_SIZE; i++) {
315
+ MTSc(self)->score_cache[i] = frt_sim_tf(self->similarity, (float)i);
316
+ }
317
+
318
+ self->score = &multi_tsc_score;
319
+ self->next = &multi_tsc_next;
320
+ self->skip_to = &multi_tsc_skip_to;
321
+ self->explain = &multi_tsc_explain;
322
+ self->destroy = &multi_tsc_destroy;
323
+
324
+ return self;
325
+ }
326
+
327
+ /***************************************************************************
328
+ * MultiTermWeight
329
+ ***************************************************************************/
330
+
331
+ static char *multi_tw_to_s(FrtWeight *self)
332
+ {
333
+ return frt_strfmt("MultiTermWeight(%f)", self->value);
334
+ }
335
+
336
+ static FrtScorer *multi_tw_scorer(FrtWeight *self, FrtIndexReader *ir)
337
+ {
338
+ FrtScorer *multi_tsc = NULL;
339
+ FrtPriorityQueue *boosted_terms = MTQ(self->query)->boosted_terms;
340
+ const int field_num = frt_fis_get_field_num(ir->fis, MTQ(self->query)->field);
341
+
342
+ if (boosted_terms->size > 0 && field_num >= 0) {
343
+ int i;
344
+ FrtTermDocEnum *tde;
345
+ FrtTermEnum *te = ir->terms(ir, field_num);
346
+ TermDocEnumWrapper **tdew_a = FRT_ALLOC_N(TermDocEnumWrapper *,
347
+ boosted_terms->size);
348
+ int tdew_cnt = 0;
349
+ /* Priority queues skip the first element */
350
+ for (i = boosted_terms->size; i > 0; i--) {
351
+ char *term;
352
+ BoostedTerm *bt = (BoostedTerm *)boosted_terms->heap[i];
353
+ if (((term = te->skip_to(te, bt->term)) != NULL) && (strcmp(term, bt->term) == 0)) {
354
+ tde = ir->term_docs(ir);
355
+ tde->seek_te(tde, te);
356
+ tdew_a[tdew_cnt++] = tdew_new(bt->term, tde, bt->boost);
357
+ }
358
+ }
359
+ te->close(te);
360
+ if (tdew_cnt) {
361
+ multi_tsc = multi_tsc_new(self, MTQ(self->query)->field, tdew_a,
362
+ tdew_cnt, frt_ir_get_norms_i(ir, field_num));
363
+ }
364
+ else {
365
+ free(tdew_a);
366
+ }
367
+ }
368
+ return multi_tsc;
369
+ }
370
+
371
+ static FrtExplanation *multi_tw_explain(FrtWeight *self, FrtIndexReader *ir, int doc_num)
372
+ {
373
+ FrtExplanation *expl;
374
+ FrtExplanation *idf_expl1;
375
+ FrtExplanation *idf_expl2;
376
+ FrtExplanation *query_expl;
377
+ FrtExplanation *qnorm_expl;
378
+ FrtExplanation *field_expl;
379
+ FrtExplanation *tf_expl;
380
+ FrtScorer *scorer;
381
+ frt_uchar *field_norms;
382
+ float field_norm;
383
+ FrtExplanation *field_norm_expl;
384
+
385
+ char *query_str;
386
+ FrtMultiTermQuery *mtq = MTQ(self->query);
387
+ const char *field_name = rb_id2name(mtq->field);
388
+ FrtPriorityQueue *bt_pq = mtq->boosted_terms;
389
+ int i;
390
+ int total_doc_freqs = 0;
391
+ char *doc_freqs = NULL;
392
+ size_t len = 0, pos = 0;
393
+ const int field_num = frt_fis_get_field_num(ir->fis, mtq->field);
394
+
395
+ if (field_num < 0) {
396
+ return frt_expl_new(0.0f, "field \"%s\" does not exist in the index",
397
+ field_name);
398
+ }
399
+
400
+ query_str = self->query->to_s(self->query, (FrtSymbol)NULL);
401
+
402
+ expl = frt_expl_new(0.0f, "weight(%s in %d), product of:", query_str, doc_num);
403
+
404
+ len = 30;
405
+ for (i = bt_pq->size; i > 0; i--) {
406
+ len += strlen(((BoostedTerm *)bt_pq->heap[i])->term) + 30;
407
+ }
408
+ doc_freqs = FRT_ALLOC_N(char, len);
409
+ for (i = bt_pq->size; i > 0; i--) {
410
+ char *term = ((BoostedTerm *)bt_pq->heap[i])->term;
411
+ int doc_freq = ir->doc_freq(ir, field_num, term);
412
+ pos += sprintf(doc_freqs + pos, "(%s=%d) + ", term, doc_freq);
413
+ total_doc_freqs += doc_freq;
414
+ }
415
+ pos -= 2; /* remove " + " from the end */
416
+ sprintf(doc_freqs + pos, "= %d", total_doc_freqs);
417
+
418
+ idf_expl1 = frt_expl_new(self->idf, "idf(%s:<%s>)", field_name, doc_freqs);
419
+ idf_expl2 = frt_expl_new(self->idf, "idf(%s:<%s>)", field_name, doc_freqs);
420
+ free(doc_freqs);
421
+
422
+ /* explain query weight */
423
+ query_expl = frt_expl_new(0.0f, "query_weight(%s), product of:", query_str);
424
+
425
+ if (self->query->boost != 1.0f) {
426
+ frt_expl_add_detail(query_expl, frt_expl_new(self->query->boost, "boost"));
427
+ }
428
+ frt_expl_add_detail(query_expl, idf_expl1);
429
+
430
+ qnorm_expl = frt_expl_new(self->qnorm, "query_norm");
431
+ frt_expl_add_detail(query_expl, qnorm_expl);
432
+
433
+ query_expl->value = self->query->boost * self->idf * self->qnorm;
434
+
435
+ frt_expl_add_detail(expl, query_expl);
436
+
437
+ /* explain field weight */
438
+ field_expl = frt_expl_new(0.0f, "field_weight(%s in %d), product of:",
439
+ query_str, doc_num);
440
+ free(query_str);
441
+
442
+ if ((scorer = self->scorer(self, ir)) != NULL) {
443
+ tf_expl = scorer->explain(scorer, doc_num);
444
+ scorer->destroy(scorer);
445
+ }
446
+ else {
447
+ tf_expl = frt_expl_new(0.0f, "no terms were found");
448
+ }
449
+ frt_expl_add_detail(field_expl, tf_expl);
450
+ frt_expl_add_detail(field_expl, idf_expl2);
451
+
452
+ field_norms = ir->get_norms(ir, field_num);
453
+ field_norm = (field_norms != NULL)
454
+ ? frt_sim_decode_norm(self->similarity, field_norms[doc_num])
455
+ : (float)0.0f;
456
+ field_norm_expl = frt_expl_new(field_norm, "field_norm(field=%s, doc=%d)",
457
+ field_name, doc_num);
458
+
459
+ frt_expl_add_detail(field_expl, field_norm_expl);
460
+
461
+ field_expl->value = tf_expl->value * self->idf * field_norm;
462
+
463
+ /* combine them */
464
+ if (query_expl->value == 1.0f) {
465
+ frt_expl_destroy(expl);
466
+ return field_expl;
467
+ }
468
+ else {
469
+ expl->value = (query_expl->value * field_expl->value);
470
+ frt_expl_add_detail(expl, field_expl);
471
+ return expl;
472
+ }
473
+ }
474
+
475
+ static FrtWeight *multi_tw_new(FrtQuery *query, FrtSearcher *searcher)
476
+ {
477
+ int i;
478
+ int doc_freq = 0;
479
+ FrtWeight *self = w_new(FrtWeight, query);
480
+ FrtPriorityQueue *bt_pq = MTQ(query)->boosted_terms;
481
+
482
+ self->scorer = &multi_tw_scorer;
483
+ self->explain = &multi_tw_explain;
484
+ self->to_s = &multi_tw_to_s;
485
+
486
+ self->similarity = query->get_similarity(query, searcher);
487
+ self->value = query->boost;
488
+ self->idf = 0.0f;
489
+
490
+ for (i = bt_pq->size; i > 0; i--) {
491
+ doc_freq += searcher->doc_freq(searcher, MTQ(query)->field,
492
+ ((BoostedTerm *)bt_pq->heap[i])->term);
493
+ }
494
+ self->idf += frt_sim_idf(self->similarity, doc_freq,
495
+ searcher->max_doc(searcher));
496
+
497
+ return self;
498
+ }
499
+
500
+
501
+ /***************************************************************************
502
+ * MultiTermQuery
503
+ ***************************************************************************/
504
+
505
+ static char *multi_tq_to_s(FrtQuery *self, FrtSymbol default_field)
506
+ {
507
+ int i;
508
+ FrtPriorityQueue *boosted_terms = MTQ(self)->boosted_terms, *bt_pq_clone;
509
+ BoostedTerm *bt;
510
+ char *buffer, *bptr;
511
+ const char *field_name = rb_id2name(MTQ(self)->field);
512
+ int flen = strlen(field_name);
513
+ int tlen = 0;
514
+
515
+ /* Priority queues skip the first element */
516
+ for (i = boosted_terms->size; i > 0; i--) {
517
+ tlen += strlen(((BoostedTerm *)boosted_terms->heap[i])->term) + 35;
518
+ }
519
+
520
+ bptr = buffer = FRT_ALLOC_N(char, tlen + flen + 35);
521
+
522
+ if (default_field != MTQ(self)->field) {
523
+ bptr += sprintf(bptr, "%s:", field_name);
524
+ }
525
+
526
+ *(bptr++) = '"';
527
+ bt_pq_clone = frt_pq_clone(boosted_terms);
528
+ while ((bt = (BoostedTerm *)frt_pq_pop(bt_pq_clone)) != NULL) {
529
+ bptr += sprintf(bptr, "%s", bt->term);
530
+
531
+ if (bt->boost != 1.0f) {
532
+ *bptr = '^';
533
+ frt_dbl_to_s(++bptr, bt->boost);
534
+ bptr += (int)strlen(bptr);
535
+ }
536
+
537
+ *(bptr++) = '|';
538
+ }
539
+ frt_pq_destroy(bt_pq_clone);
540
+
541
+ if (bptr[-1] == '"') {
542
+ bptr++; /* handle zero term case */
543
+ }
544
+ bptr[-1] = '"'; /* delete last '|' char */
545
+ bptr[ 0] = '\0';
546
+
547
+ if (self->boost != 1.0f) {
548
+ *bptr = '^';
549
+ frt_dbl_to_s(++bptr, self->boost);
550
+ }
551
+ return buffer;
552
+ }
553
+
554
+ static void multi_tq_destroy_i(FrtQuery *self)
555
+ {
556
+ frt_pq_destroy(MTQ(self)->boosted_terms);
557
+ frt_q_destroy_i(self);
558
+ }
559
+
560
+ static void multi_tq_extract_terms(FrtQuery *self, FrtHashSet *terms)
561
+ {
562
+ int i;
563
+ FrtPriorityQueue *boosted_terms = MTQ(self)->boosted_terms;
564
+ for (i = boosted_terms->size; i > 0; i--) {
565
+ BoostedTerm *bt = (BoostedTerm *)boosted_terms->heap[i];
566
+ frt_hs_add(terms, frt_term_new(MTQ(self)->field, bt->term));
567
+ }
568
+ }
569
+
570
+ static unsigned long long multi_tq_hash(FrtQuery *self)
571
+ {
572
+ int i;
573
+ unsigned long long hash = frt_str_hash(rb_id2name(MTQ(self)->field));
574
+ FrtPriorityQueue *boosted_terms = MTQ(self)->boosted_terms;
575
+ for (i = boosted_terms->size; i > 0; i--) {
576
+ BoostedTerm *bt = (BoostedTerm *)boosted_terms->heap[i];
577
+ hash ^= frt_str_hash(bt->term) ^ frt_float2int(bt->boost);
578
+ }
579
+ return hash;
580
+ }
581
+
582
+ static int multi_tq_eq(FrtQuery *self, FrtQuery *o)
583
+ {
584
+ int i;
585
+ FrtPriorityQueue *boosted_terms1 = MTQ(self)->boosted_terms;
586
+ FrtPriorityQueue *boosted_terms2 = MTQ(o)->boosted_terms;
587
+
588
+ if ((MTQ(self)->field != MTQ(o)->field)
589
+ || boosted_terms1->size != boosted_terms2->size) {
590
+ return false;
591
+ }
592
+ for (i = boosted_terms1->size; i > 0; i--) {
593
+ BoostedTerm *bt1 = (BoostedTerm *)boosted_terms1->heap[i];
594
+ BoostedTerm *bt2 = (BoostedTerm *)boosted_terms2->heap[i];
595
+ if ((strcmp(bt1->term, bt2->term) != 0) || (bt1->boost != bt2->boost)) {
596
+ return false;
597
+ }
598
+ }
599
+ return true;
600
+ }
601
+
602
+ static FrtMatchVector *multi_tq_get_matchv_i(FrtQuery *self, FrtMatchVector *mv,
603
+ FrtTermVector *tv)
604
+ {
605
+ if (tv->field == MTQ(self)->field) {
606
+ int i;
607
+ FrtPriorityQueue *boosted_terms = MTQ(self)->boosted_terms;
608
+ for (i = boosted_terms->size; i > 0; i--) {
609
+ int j;
610
+ BoostedTerm *bt = (BoostedTerm *)boosted_terms->heap[i];
611
+ FrtTVTerm *tv_term = frt_tv_get_tv_term(tv, bt->term);
612
+ if (tv_term) {
613
+ for (j = 0; j < tv_term->freq; j++) {
614
+ int pos = tv_term->positions[j];
615
+ frt_matchv_add(mv, pos, pos);
616
+ }
617
+ }
618
+ }
619
+ }
620
+ return mv;
621
+ }
622
+
623
+ FrtQuery *frt_multi_tq_new_conf(FrtSymbol field, int max_terms, float min_boost)
624
+ {
625
+ FrtQuery *self;
626
+
627
+ if (max_terms <= 0) {
628
+ FRT_RAISE(FRT_ARG_ERROR, ":max_terms must be greater than or equal to zero. "
629
+ "%d < 0. ", max_terms);
630
+ }
631
+
632
+ self = frt_q_new(FrtMultiTermQuery);
633
+
634
+ MTQ(self)->field = field;
635
+ MTQ(self)->boosted_terms = frt_pq_new(max_terms,
636
+ (frt_lt_ft)&boosted_term_less_than,
637
+ (frt_free_ft)&boosted_term_destroy);
638
+ MTQ(self)->min_boost = min_boost;
639
+
640
+ self->type = MULTI_TERM_QUERY;
641
+ self->to_s = &multi_tq_to_s;
642
+ self->extract_terms = &multi_tq_extract_terms;
643
+ self->hash = &multi_tq_hash;
644
+ self->eq = &multi_tq_eq;
645
+ self->destroy_i = &multi_tq_destroy_i;
646
+ self->create_weight_i = &multi_tw_new;
647
+ self->get_matchv_i = &multi_tq_get_matchv_i;
648
+
649
+ return self;
650
+ }
651
+
652
+ FrtQuery *frt_multi_tq_new(FrtSymbol field)
653
+ {
654
+ return frt_multi_tq_new_conf(field, MULTI_TERM_QUERY_MAX_TERMS, 0.0f);
655
+ }
656
+
657
+ void frt_multi_tq_add_term_boost(FrtQuery *self, const char *term, float boost)
658
+ {
659
+ if (boost > MTQ(self)->min_boost && term && term[0]) {
660
+ BoostedTerm *bt = boosted_term_new(term, boost);
661
+ FrtPriorityQueue *bt_pq = MTQ(self)->boosted_terms;
662
+ frt_pq_insert(bt_pq, bt);
663
+ if (frt_pq_full(bt_pq)) {
664
+ MTQ(self)->min_boost = ((BoostedTerm *)frt_pq_top(bt_pq))->boost;
665
+ }
666
+ }
667
+ }
668
+
669
+ void frt_multi_tq_add_term(FrtQuery *self, const char *term)
670
+ {
671
+ frt_multi_tq_add_term_boost(self, term, 1.0f);
672
+ }