jk-ferret 0.11.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (228) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +90 -0
  4. data/RELEASE_CHANGES +137 -0
  5. data/RELEASE_NOTES +60 -0
  6. data/Rakefile +443 -0
  7. data/TODO +109 -0
  8. data/TUTORIAL +231 -0
  9. data/bin/ferret-browser +79 -0
  10. data/ext/BZLIB_blocksort.c +1094 -0
  11. data/ext/BZLIB_bzlib.c +1578 -0
  12. data/ext/BZLIB_compress.c +672 -0
  13. data/ext/BZLIB_crctable.c +104 -0
  14. data/ext/BZLIB_decompress.c +626 -0
  15. data/ext/BZLIB_huffman.c +205 -0
  16. data/ext/BZLIB_randtable.c +84 -0
  17. data/ext/STEMMER_api.c +66 -0
  18. data/ext/STEMMER_libstemmer.c +93 -0
  19. data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
  20. data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
  21. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  22. data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
  25. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  26. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  27. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
  29. data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
  30. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  31. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  32. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  33. data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
  34. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  35. data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
  36. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  37. data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
  38. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  39. data/ext/STEMMER_stem_UTF_8_german.c +509 -0
  40. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  41. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  42. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  43. data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
  44. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  45. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  46. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  47. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  48. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  49. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  50. data/ext/STEMMER_utilities.c +478 -0
  51. data/ext/analysis.c +1710 -0
  52. data/ext/analysis.h +266 -0
  53. data/ext/api.h +26 -0
  54. data/ext/array.c +125 -0
  55. data/ext/array.h +62 -0
  56. data/ext/bitvector.c +96 -0
  57. data/ext/bitvector.h +594 -0
  58. data/ext/bzlib.h +282 -0
  59. data/ext/bzlib_private.h +503 -0
  60. data/ext/compound_io.c +384 -0
  61. data/ext/config.h +52 -0
  62. data/ext/document.c +159 -0
  63. data/ext/document.h +63 -0
  64. data/ext/except.c +102 -0
  65. data/ext/except.h +176 -0
  66. data/ext/extconf.rb +15 -0
  67. data/ext/ferret.c +416 -0
  68. data/ext/ferret.h +94 -0
  69. data/ext/field_index.c +262 -0
  70. data/ext/field_index.h +52 -0
  71. data/ext/filter.c +157 -0
  72. data/ext/fs_store.c +493 -0
  73. data/ext/global.c +458 -0
  74. data/ext/global.h +302 -0
  75. data/ext/hash.c +524 -0
  76. data/ext/hash.h +515 -0
  77. data/ext/hashset.c +192 -0
  78. data/ext/hashset.h +215 -0
  79. data/ext/header.h +58 -0
  80. data/ext/helper.c +63 -0
  81. data/ext/helper.h +21 -0
  82. data/ext/index.c +6804 -0
  83. data/ext/index.h +935 -0
  84. data/ext/internal.h +1019 -0
  85. data/ext/lang.c +10 -0
  86. data/ext/lang.h +68 -0
  87. data/ext/libstemmer.h +79 -0
  88. data/ext/mempool.c +88 -0
  89. data/ext/mempool.h +43 -0
  90. data/ext/modules.h +190 -0
  91. data/ext/multimapper.c +351 -0
  92. data/ext/multimapper.h +60 -0
  93. data/ext/posh.c +1006 -0
  94. data/ext/posh.h +973 -0
  95. data/ext/priorityqueue.c +149 -0
  96. data/ext/priorityqueue.h +155 -0
  97. data/ext/q_boolean.c +1621 -0
  98. data/ext/q_const_score.c +162 -0
  99. data/ext/q_filtered_query.c +212 -0
  100. data/ext/q_fuzzy.c +280 -0
  101. data/ext/q_match_all.c +149 -0
  102. data/ext/q_multi_term.c +673 -0
  103. data/ext/q_parser.c +3103 -0
  104. data/ext/q_phrase.c +1206 -0
  105. data/ext/q_prefix.c +98 -0
  106. data/ext/q_range.c +682 -0
  107. data/ext/q_span.c +2390 -0
  108. data/ext/q_term.c +337 -0
  109. data/ext/q_wildcard.c +167 -0
  110. data/ext/r_analysis.c +2626 -0
  111. data/ext/r_index.c +3468 -0
  112. data/ext/r_qparser.c +635 -0
  113. data/ext/r_search.c +4490 -0
  114. data/ext/r_store.c +513 -0
  115. data/ext/r_utils.c +1131 -0
  116. data/ext/ram_store.c +476 -0
  117. data/ext/scanner.c +895 -0
  118. data/ext/scanner.h +36 -0
  119. data/ext/scanner_mb.c +6701 -0
  120. data/ext/scanner_utf8.c +4415 -0
  121. data/ext/search.c +1864 -0
  122. data/ext/search.h +953 -0
  123. data/ext/similarity.c +151 -0
  124. data/ext/similarity.h +89 -0
  125. data/ext/sort.c +786 -0
  126. data/ext/stem_ISO_8859_1_danish.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  128. data/ext/stem_ISO_8859_1_english.h +16 -0
  129. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  130. data/ext/stem_ISO_8859_1_french.h +16 -0
  131. data/ext/stem_ISO_8859_1_german.h +16 -0
  132. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  133. data/ext/stem_ISO_8859_1_italian.h +16 -0
  134. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  135. data/ext/stem_ISO_8859_1_porter.h +16 -0
  136. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  137. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  138. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  139. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  140. data/ext/stem_KOI8_R_russian.h +16 -0
  141. data/ext/stem_UTF_8_danish.h +16 -0
  142. data/ext/stem_UTF_8_dutch.h +16 -0
  143. data/ext/stem_UTF_8_english.h +16 -0
  144. data/ext/stem_UTF_8_finnish.h +16 -0
  145. data/ext/stem_UTF_8_french.h +16 -0
  146. data/ext/stem_UTF_8_german.h +16 -0
  147. data/ext/stem_UTF_8_hungarian.h +16 -0
  148. data/ext/stem_UTF_8_italian.h +16 -0
  149. data/ext/stem_UTF_8_norwegian.h +16 -0
  150. data/ext/stem_UTF_8_porter.h +16 -0
  151. data/ext/stem_UTF_8_portuguese.h +16 -0
  152. data/ext/stem_UTF_8_romanian.h +16 -0
  153. data/ext/stem_UTF_8_russian.h +16 -0
  154. data/ext/stem_UTF_8_spanish.h +16 -0
  155. data/ext/stem_UTF_8_swedish.h +16 -0
  156. data/ext/stem_UTF_8_turkish.h +16 -0
  157. data/ext/stopwords.c +410 -0
  158. data/ext/store.c +698 -0
  159. data/ext/store.h +799 -0
  160. data/ext/symbol.c +10 -0
  161. data/ext/symbol.h +23 -0
  162. data/ext/term_vectors.c +73 -0
  163. data/ext/threading.h +31 -0
  164. data/ext/win32.h +62 -0
  165. data/lib/ferret.rb +30 -0
  166. data/lib/ferret/browser.rb +246 -0
  167. data/lib/ferret/browser/s/global.js +192 -0
  168. data/lib/ferret/browser/s/style.css +148 -0
  169. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  170. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  171. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  172. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  173. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  174. data/lib/ferret/browser/views/layout.rhtml +22 -0
  175. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  176. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  177. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  178. data/lib/ferret/browser/webrick.rb +14 -0
  179. data/lib/ferret/document.rb +130 -0
  180. data/lib/ferret/field_infos.rb +44 -0
  181. data/lib/ferret/field_symbol.rb +87 -0
  182. data/lib/ferret/index.rb +973 -0
  183. data/lib/ferret/number_tools.rb +157 -0
  184. data/lib/ferret/version.rb +3 -0
  185. data/setup.rb +1555 -0
  186. data/test/long_running/largefile/tc_largefile.rb +46 -0
  187. data/test/test_all.rb +5 -0
  188. data/test/test_helper.rb +29 -0
  189. data/test/test_installed.rb +1 -0
  190. data/test/threading/number_to_spoken.rb +132 -0
  191. data/test/threading/thread_safety_index_test.rb +88 -0
  192. data/test/threading/thread_safety_read_write_test.rb +73 -0
  193. data/test/threading/thread_safety_test.rb +133 -0
  194. data/test/unit/analysis/tc_analyzer.rb +550 -0
  195. data/test/unit/analysis/tc_token_stream.rb +653 -0
  196. data/test/unit/index/tc_index.rb +867 -0
  197. data/test/unit/index/tc_index_reader.rb +699 -0
  198. data/test/unit/index/tc_index_writer.rb +447 -0
  199. data/test/unit/index/th_doc.rb +332 -0
  200. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  201. data/test/unit/search/tc_filter.rb +156 -0
  202. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  203. data/test/unit/search/tc_index_searcher.rb +67 -0
  204. data/test/unit/search/tc_multi_searcher.rb +128 -0
  205. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  206. data/test/unit/search/tc_search_and_sort.rb +179 -0
  207. data/test/unit/search/tc_sort.rb +49 -0
  208. data/test/unit/search/tc_sort_field.rb +27 -0
  209. data/test/unit/search/tc_spans.rb +190 -0
  210. data/test/unit/search/tm_searcher.rb +436 -0
  211. data/test/unit/store/tc_fs_store.rb +115 -0
  212. data/test/unit/store/tc_ram_store.rb +35 -0
  213. data/test/unit/store/tm_store.rb +34 -0
  214. data/test/unit/store/tm_store_lock.rb +68 -0
  215. data/test/unit/tc_document.rb +81 -0
  216. data/test/unit/tc_field_symbol.rb +26 -0
  217. data/test/unit/ts_analysis.rb +2 -0
  218. data/test/unit/ts_index.rb +2 -0
  219. data/test/unit/ts_largefile.rb +4 -0
  220. data/test/unit/ts_query_parser.rb +2 -0
  221. data/test/unit/ts_search.rb +2 -0
  222. data/test/unit/ts_store.rb +2 -0
  223. data/test/unit/ts_utils.rb +2 -0
  224. data/test/unit/utils/tc_bit_vector.rb +295 -0
  225. data/test/unit/utils/tc_number_tools.rb +117 -0
  226. data/test/unit/utils/tc_priority_queue.rb +106 -0
  227. data/test/utils/content_generator.rb +226 -0
  228. metadata +319 -0
data/ext/q_match_all.c ADDED
@@ -0,0 +1,149 @@
1
+ #include "search.h"
2
+ #include <string.h>
3
+ #include "internal.h"
4
+
5
+ /***************************************************************************
6
+ *
7
+ * MatchAllScorer
8
+ *
9
+ ***************************************************************************/
10
+
11
+ #define MASc(scorer) ((MatchAllScorer *)(scorer))
12
+
13
+ typedef struct MatchAllScorer
14
+ {
15
+ Scorer super;
16
+ IndexReader *ir;
17
+ int max_doc;
18
+ float score;
19
+ } MatchAllScorer;
20
+
21
+ static float masc_score(Scorer *self)
22
+ {
23
+ return MASc(self)->score;
24
+ }
25
+
26
+ static bool masc_next(Scorer *self)
27
+ {
28
+ while (self->doc < (MASc(self)->max_doc - 1)) {
29
+ self->doc++;
30
+ if (!MASc(self)->ir->is_deleted(MASc(self)->ir, self->doc)) {
31
+ return true;
32
+ }
33
+ }
34
+ return false;
35
+ }
36
+
37
+ static bool masc_skip_to(Scorer *self, int doc_num)
38
+ {
39
+ self->doc = doc_num - 1;
40
+ return masc_next(self);
41
+ }
42
+
43
+ static Explanation *masc_explain(Scorer *self, int doc_num)
44
+ {
45
+ (void)self;
46
+ (void)doc_num;
47
+ return expl_new(1.0, "MatchAllScorer");
48
+ }
49
+
50
+ static Scorer *masc_new(Weight *weight, IndexReader *ir)
51
+ {
52
+ Scorer *self = scorer_new(MatchAllScorer, weight->similarity);
53
+
54
+ MASc(self)->ir = ir;
55
+ MASc(self)->max_doc = ir->max_doc(ir);
56
+ MASc(self)->score = weight->value;
57
+
58
+ self->doc = -1;
59
+ self->score = &masc_score;
60
+ self->next = &masc_next;
61
+ self->skip_to = &masc_skip_to;
62
+ self->explain = &masc_explain;
63
+ self->destroy = &scorer_destroy_i;
64
+
65
+ return self;
66
+ }
67
+
68
+ /***************************************************************************
69
+ *
70
+ * Weight
71
+ *
72
+ ***************************************************************************/
73
+
74
+ static char *maw_to_s(Weight *self)
75
+ {
76
+ return strfmt("MatchAllWeight(%f)", self->value);
77
+ }
78
+
79
+ static Explanation *maw_explain(Weight *self, IndexReader *ir, int doc_num)
80
+ {
81
+ Explanation *expl;
82
+ if (!ir->is_deleted(ir, doc_num)) {
83
+ expl = expl_new(self->value, "MatchAllQuery: product of:");
84
+ expl_add_detail(expl, expl_new(self->query->boost, "boost"));
85
+ expl_add_detail(expl, expl_new(self->qnorm, "query_norm"));
86
+ } else {
87
+ expl = expl_new(self->value,
88
+ "MatchAllQuery: doc %d was deleted", doc_num);
89
+ }
90
+
91
+ return expl;
92
+ }
93
+
94
+ static Weight *maw_new(Query *query, Searcher *searcher)
95
+ {
96
+ Weight *self = w_new(Weight, query);
97
+
98
+ self->scorer = &masc_new;
99
+ self->explain = &maw_explain;
100
+ self->to_s = &maw_to_s;
101
+
102
+ self->similarity = query->get_similarity(query, searcher);
103
+ self->idf = 1.0;
104
+
105
+ return self;
106
+ }
107
+
108
+ /***************************************************************************
109
+ *
110
+ * MatchAllQuery
111
+ *
112
+ ***************************************************************************/
113
+
114
+ static char *maq_to_s(Query *self, Symbol default_field)
115
+ {
116
+ (void)default_field;
117
+ if (self->boost == 1.0) {
118
+ return estrdup("*");
119
+ } else {
120
+ return strfmt("*^%f", self->boost);
121
+ }
122
+ }
123
+
124
+ static unsigned long maq_hash(Query *self)
125
+ {
126
+ (void)self;
127
+ return 0;
128
+ }
129
+
130
+ static int maq_eq(Query *self, Query *o)
131
+ {
132
+ (void)self; (void)o;
133
+ return true;
134
+ }
135
+
136
+ Query *maq_new()
137
+ {
138
+ Query *self = q_new(Query);
139
+
140
+ self->type = MATCH_ALL_QUERY;
141
+ self->to_s = &maq_to_s;
142
+ self->hash = &maq_hash;
143
+ self->eq = &maq_eq;
144
+ self->destroy_i = &q_destroy_i;
145
+ self->create_weight_i = &maw_new;
146
+
147
+ return self;
148
+ }
149
+
data/ext/q_multi_term.c ADDED
@@ -0,0 +1,673 @@
1
+ #include <string.h>
2
+ #include "search.h"
3
+ #include "helper.h"
4
+ #include "symbol.h"
5
+ #include "internal.h"
6
+
7
+ #define MTQ(query) ((MultiTermQuery *)(query))
8
+
9
+ /***************************************************************************
10
+ *
11
+ * MultiTerm
12
+ *
13
+ ***************************************************************************/
14
+
15
+ /***************************************************************************
16
+ * BoostedTerm
17
+ ***************************************************************************/
18
+
19
/* One term of a MultiTermQuery together with its individual boost. */
typedef struct BoostedTerm {
    char  *term;  /* term text; duplicated on creation, freed on destroy */
    float  boost; /* boost applied to this term */
} BoostedTerm;
24
+
25
+ static bool boosted_term_less_than(const BoostedTerm *bt1,
26
+ const BoostedTerm *bt2)
27
+ {
28
+ if (bt1->boost == bt2->boost) {
29
+ return (strcmp(bt1->term, bt2->term) < 0);
30
+ }
31
+
32
+ return (bt1->boost < bt2->boost);
33
+ }
34
+
35
+ static void boosted_term_destroy(BoostedTerm *self)
36
+ {
37
+ free(self->term);
38
+ free(self);
39
+ }
40
+
41
+ static BoostedTerm *boosted_term_new(const char *term, float boost)
42
+ {
43
+ BoostedTerm *self = ALLOC(BoostedTerm);
44
+ self->term = estrdup(term);
45
+ self->boost = boost;
46
+ return self;
47
+ }
48
+
49
+ /***************************************************************************
50
+ * TermDocEnumWrapper
51
+ ***************************************************************************/
52
+
53
+ #define TDE_READ_SIZE 16
54
+
55
+ typedef struct TermDocEnumWrapper
56
+ {
57
+ const char *term;
58
+ TermDocEnum *tde;
59
+ float boost;
60
+ int doc;
61
+ int freq;
62
+ int docs[TDE_READ_SIZE];
63
+ int freqs[TDE_READ_SIZE];
64
+ int pointer;
65
+ int pointer_max;
66
+ } TermDocEnumWrapper;
67
+
68
+ static bool tdew_less_than(const TermDocEnumWrapper *tdew1,
69
+ const TermDocEnumWrapper *tdew2)
70
+ {
71
+ return (tdew1->doc < tdew2->doc);
72
+ }
73
+
74
+ static bool tdew_next(TermDocEnumWrapper *self)
75
+ {
76
+ self->pointer++;
77
+ if (self->pointer >= self->pointer_max) {
78
+ /* refill buffer */
79
+ self->pointer_max = self->tde->read(self->tde, self->docs, self->freqs,
80
+ TDE_READ_SIZE);
81
+ if (self->pointer_max != 0) {
82
+ self->pointer = 0;
83
+ }
84
+ else {
85
+ return false;
86
+ }
87
+ }
88
+ self->doc = self->docs[self->pointer];
89
+ self->freq = self->freqs[self->pointer];
90
+ return true;
91
+ }
92
+
93
+ static bool tdew_skip_to(TermDocEnumWrapper *self, int doc_num)
94
+ {
95
+ TermDocEnum *tde = self->tde;
96
+
97
+ while (++(self->pointer) < self->pointer_max) {
98
+ if (self->docs[self->pointer] >= doc_num) {
99
+ self->doc = self->docs[self->pointer];
100
+ self->freq = self->freqs[self->pointer];
101
+ return true;
102
+ }
103
+ }
104
+
105
+ /* not found in cache, seek underlying stream */
106
+ if (tde->skip_to(tde, doc_num)) {
107
+ self->pointer_max = 1;
108
+ self->pointer = 0;
109
+ self->docs[0] = self->doc = tde->doc_num(tde);
110
+ self->freqs[0] = self->freq = tde->freq(tde);
111
+ return true;
112
+ }
113
+ else {
114
+ return false;
115
+ }
116
+ }
117
+
118
+ static void tdew_destroy(TermDocEnumWrapper *self)
119
+ {
120
+ self->tde->close(self->tde);
121
+ free(self);
122
+ }
123
+
124
+ static TermDocEnumWrapper *tdew_new(const char *term, TermDocEnum *tde,
125
+ float boost)
126
+ {
127
+ TermDocEnumWrapper *self = ALLOC_AND_ZERO(TermDocEnumWrapper);
128
+ self->term = term;
129
+ self->tde = tde;
130
+ self->boost = boost;
131
+ self->doc = -1;
132
+ return self;
133
+ }
134
+
135
+ /***************************************************************************
136
+ * MultiTermScorer
137
+ ***************************************************************************/
138
+
139
+ #define SCORE_CACHE_SIZE 32
140
+ #define MTSc(scorer) ((MultiTermScorer *)(scorer))
141
+
142
+ typedef struct MultiTermScorer
143
+ {
144
+ Scorer super;
145
+ Symbol field;
146
+ uchar *norms;
147
+ Weight *weight;
148
+ TermDocEnumWrapper **tdew_a;
149
+ int tdew_cnt;
150
+ PriorityQueue *tdew_pq;
151
+ float weight_value;
152
+ float score_cache[SCORE_CACHE_SIZE];
153
+ float total_score;
154
+ } MultiTermScorer;
155
+
156
+ static float multi_tsc_score(Scorer *self)
157
+ {
158
+ return MTSc(self)->total_score * MTSc(self)->weight_value
159
+ * sim_decode_norm(self->similarity, MTSc(self)->norms[self->doc]);
160
+ }
161
+
162
+ static bool multi_tsc_next(Scorer *self)
163
+ {
164
+ int curr_doc;
165
+ float total_score = 0.0;
166
+ TermDocEnumWrapper *tdew;
167
+ MultiTermScorer *mtsc = MTSc(self);
168
+ PriorityQueue *tdew_pq = mtsc->tdew_pq;
169
+ if (tdew_pq == NULL) {
170
+ TermDocEnumWrapper **tdew_a = mtsc->tdew_a;
171
+ int i;
172
+ tdew_pq = pq_new(mtsc->tdew_cnt, (lt_ft)tdew_less_than, (free_ft)NULL);
173
+ for (i = mtsc->tdew_cnt - 1; i >= 0; i--) {
174
+ if (tdew_next(tdew_a[i])) {
175
+ pq_push(tdew_pq, tdew_a[i]);
176
+ }
177
+ }
178
+ mtsc->tdew_pq = tdew_pq;
179
+ }
180
+
181
+ tdew = (TermDocEnumWrapper *)pq_top(tdew_pq);
182
+ if (tdew == NULL) {
183
+ return false;
184
+ }
185
+
186
+ self->doc = curr_doc = tdew->doc;
187
+ do {
188
+ int freq = tdew->freq;
189
+ if (freq < SCORE_CACHE_SIZE) {
190
+ total_score += mtsc->score_cache[freq] * tdew->boost;
191
+ }
192
+ else {
193
+ total_score += sim_tf(self->similarity, (float)freq) * tdew->boost;
194
+ }
195
+
196
+ if (tdew_next(tdew)) {
197
+ pq_down(tdew_pq);
198
+ }
199
+ else {
200
+ pq_pop(tdew_pq);
201
+ }
202
+
203
+ } while (((tdew = (TermDocEnumWrapper *)pq_top(tdew_pq)) != NULL)
204
+ && tdew->doc == curr_doc);
205
+ mtsc->total_score = total_score;
206
+ return true;
207
+ }
208
+
209
+ static bool multi_tsc_advance_to(Scorer *self, int target_doc_num)
210
+ {
211
+ PriorityQueue *tdew_pq = MTSc(self)->tdew_pq;
212
+ TermDocEnumWrapper *tdew;
213
+ if (tdew_pq == NULL) {
214
+ MultiTermScorer *mtsc = MTSc(self);
215
+ TermDocEnumWrapper **tdew_a = mtsc->tdew_a;
216
+ int i;
217
+ tdew_pq = pq_new(mtsc->tdew_cnt, (lt_ft)tdew_less_than, (free_ft)NULL);
218
+ for (i = mtsc->tdew_cnt - 1; i >= 0; i--) {
219
+ tdew_skip_to(tdew_a[i], target_doc_num);
220
+ pq_push(tdew_pq, tdew_a[i]);
221
+ }
222
+ MTSc(self)->tdew_pq = tdew_pq;
223
+ }
224
+ if (tdew_pq->size == 0) {
225
+ self->doc = -1;
226
+ return false;
227
+ }
228
+ while ((tdew = (TermDocEnumWrapper *)pq_top(tdew_pq)) != NULL
229
+ && (target_doc_num > tdew->doc)) {
230
+ if (tdew_skip_to(tdew, target_doc_num)) {
231
+ pq_down(tdew_pq);
232
+ }
233
+ else {
234
+ pq_pop(tdew_pq);
235
+ }
236
+ }
237
+ return (pq_top(tdew_pq) == NULL) ? false : true;
238
+ }
239
+
240
+ static INLINE bool multi_tsc_skip_to(Scorer *self, int target_doc_num)
241
+ {
242
+ return multi_tsc_advance_to(self, target_doc_num) && multi_tsc_next(self);
243
+ }
244
+
245
+ static Explanation *multi_tsc_explain(Scorer *self, int doc_num)
246
+ {
247
+ MultiTermScorer *mtsc = MTSc(self);
248
+ TermDocEnumWrapper *tdew;
249
+
250
+ if (multi_tsc_advance_to(self, doc_num) &&
251
+ (tdew = (TermDocEnumWrapper *)pq_top(mtsc->tdew_pq))->doc == doc_num) {
252
+
253
+ PriorityQueue *tdew_pq = MTSc(self)->tdew_pq;
254
+ Explanation *expl = expl_new(0.0, "The sum of:");
255
+ int curr_doc = self->doc = tdew->doc;
256
+ float total_score = 0.0;
257
+
258
+ do {
259
+ int freq = tdew->freq;
260
+ expl_add_detail(expl,
261
+ expl_new(sim_tf(self->similarity, (float)freq) * tdew->boost,
262
+ "tf(term_freq(%s:%s)=%d)^%f",
263
+ S(mtsc->field), tdew->term, freq, tdew->boost));
264
+
265
+ total_score += sim_tf(self->similarity, (float)freq) * tdew->boost;
266
+
267
+ /* maintain tdew queue, even though it probably won't get used
268
+ * again */
269
+ if (tdew_next(tdew)) {
270
+ pq_down(tdew_pq);
271
+ }
272
+ else {
273
+ pq_pop(tdew_pq);
274
+ }
275
+
276
+ } while (((tdew = (TermDocEnumWrapper *)pq_top(tdew_pq)) != NULL)
277
+ && tdew->doc == curr_doc);
278
+ expl->value = total_score;
279
+ return expl;
280
+ }
281
+ else {
282
+ return expl_new(0.0, "None of the required terms exist in the index");
283
+ }
284
+ }
285
+
286
+ static void multi_tsc_destroy(Scorer *self)
287
+ {
288
+ int i;
289
+ TermDocEnumWrapper **tdew_a = MTSc(self)->tdew_a;
290
+ for (i = MTSc(self)->tdew_cnt - 1; i >= 0; i--) {
291
+ tdew_destroy(tdew_a[i]);
292
+ }
293
+ free(tdew_a);
294
+ if (MTSc(self)->tdew_pq) pq_destroy(MTSc(self)->tdew_pq);
295
+ scorer_destroy_i(self);
296
+ }
297
+
298
+ static Scorer *multi_tsc_new(Weight *weight, Symbol field,
299
+ TermDocEnumWrapper **tdew_a, int tdew_cnt,
300
+ uchar *norms)
301
+ {
302
+ int i;
303
+ Scorer *self = scorer_new(MultiTermScorer, weight->similarity);
304
+
305
+ MTSc(self)->weight = weight;
306
+ MTSc(self)->field = field;
307
+ MTSc(self)->weight_value = weight->value;
308
+ MTSc(self)->tdew_a = tdew_a;
309
+ MTSc(self)->tdew_cnt = tdew_cnt;
310
+ MTSc(self)->norms = norms;
311
+
312
+ for (i = 0; i < SCORE_CACHE_SIZE; i++) {
313
+ MTSc(self)->score_cache[i] = sim_tf(self->similarity, (float)i);
314
+ }
315
+
316
+ self->score = &multi_tsc_score;
317
+ self->next = &multi_tsc_next;
318
+ self->skip_to = &multi_tsc_skip_to;
319
+ self->explain = &multi_tsc_explain;
320
+ self->destroy = &multi_tsc_destroy;
321
+
322
+ return self;
323
+ }
324
+
325
+ /***************************************************************************
326
+ * MultiTermWeight
327
+ ***************************************************************************/
328
+
329
+ static char *multi_tw_to_s(Weight *self)
330
+ {
331
+ return strfmt("MultiTermWeight(%f)", self->value);
332
+ }
333
+
334
+ static Scorer *multi_tw_scorer(Weight *self, IndexReader *ir)
335
+ {
336
+ Scorer *multi_tsc = NULL;
337
+ PriorityQueue *boosted_terms = MTQ(self->query)->boosted_terms;
338
+ const int field_num = fis_get_field_num(ir->fis, MTQ(self->query)->field);
339
+
340
+ if (boosted_terms->size > 0 && field_num >= 0) {
341
+ int i;
342
+ TermDocEnum *tde;
343
+ TermEnum *te = ir->terms(ir, field_num);
344
+ TermDocEnumWrapper **tdew_a = ALLOC_N(TermDocEnumWrapper *,
345
+ boosted_terms->size);
346
+ int tdew_cnt = 0;
347
+ /* Priority queues skip the first element */
348
+ for (i = boosted_terms->size; i > 0; i--) {
349
+ char *term;
350
+ BoostedTerm *bt = (BoostedTerm *)boosted_terms->heap[i];
351
+ if ((term = te->skip_to(te, bt->term)) != NULL
352
+ && strcmp(term, bt->term) == 0) {
353
+ tde = ir->term_docs(ir);
354
+ tde->seek_te(tde, te);
355
+ tdew_a[tdew_cnt++] = tdew_new(bt->term, tde, bt->boost);
356
+ }
357
+ }
358
+ te->close(te);
359
+ if (tdew_cnt) {
360
+ multi_tsc = multi_tsc_new(self, MTQ(self->query)->field, tdew_a,
361
+ tdew_cnt, ir_get_norms_i(ir, field_num));
362
+ }
363
+ else {
364
+ free(tdew_a);
365
+ }
366
+ }
367
+
368
+ return multi_tsc;
369
+ }
370
+
371
+ static Explanation *multi_tw_explain(Weight *self, IndexReader *ir, int doc_num)
372
+ {
373
+ Explanation *expl;
374
+ Explanation *idf_expl1;
375
+ Explanation *idf_expl2;
376
+ Explanation *query_expl;
377
+ Explanation *qnorm_expl;
378
+ Explanation *field_expl;
379
+ Explanation *tf_expl;
380
+ Scorer *scorer;
381
+ uchar *field_norms;
382
+ float field_norm;
383
+ Explanation *field_norm_expl;
384
+
385
+ char *query_str;
386
+ MultiTermQuery *mtq = MTQ(self->query);
387
+ const char *field = S(mtq->field);
388
+ PriorityQueue *bt_pq = mtq->boosted_terms;
389
+ int i;
390
+ int total_doc_freqs = 0;
391
+ char *doc_freqs = NULL;
392
+ size_t len = 0, pos = 0;
393
+ const int field_num = fis_get_field_num(ir->fis, mtq->field);
394
+
395
+ if (field_num < 0) {
396
+ return expl_new(0.0, "field \"%s\" does not exist in the index",
397
+ field);
398
+ }
399
+
400
+ query_str = self->query->to_s(self->query, NULL);
401
+
402
+ expl = expl_new(0.0, "weight(%s in %d), product of:", query_str, doc_num);
403
+
404
+ len = 30;
405
+ for (i = bt_pq->size; i > 0; i--) {
406
+ len += strlen(((BoostedTerm *)bt_pq->heap[i])->term) + 30;
407
+ }
408
+ doc_freqs = ALLOC_N(char, len);
409
+ for (i = bt_pq->size; i > 0; i--) {
410
+ char *term = ((BoostedTerm *)bt_pq->heap[i])->term;
411
+ int doc_freq = ir->doc_freq(ir, field_num, term);
412
+ pos += sprintf(doc_freqs + pos, "(%s=%d) + ", term, doc_freq);
413
+ total_doc_freqs += doc_freq;
414
+ }
415
+ pos -= 2; /* remove " + " from the end */
416
+ sprintf(doc_freqs + pos, "= %d", total_doc_freqs);
417
+
418
+ idf_expl1 = expl_new(self->idf, "idf(%s:<%s>)", field, doc_freqs);
419
+ idf_expl2 = expl_new(self->idf, "idf(%s:<%s>)", field, doc_freqs);
420
+ free(doc_freqs);
421
+
422
+ /* explain query weight */
423
+ query_expl = expl_new(0.0, "query_weight(%s), product of:", query_str);
424
+
425
+ if (self->query->boost != 1.0) {
426
+ expl_add_detail(query_expl, expl_new(self->query->boost, "boost"));
427
+ }
428
+ expl_add_detail(query_expl, idf_expl1);
429
+
430
+ qnorm_expl = expl_new(self->qnorm, "query_norm");
431
+ expl_add_detail(query_expl, qnorm_expl);
432
+
433
+ query_expl->value = self->query->boost * self->idf * self->qnorm;
434
+
435
+ expl_add_detail(expl, query_expl);
436
+
437
+ /* explain field weight */
438
+ field_expl = expl_new(0.0, "field_weight(%s in %d), product of:",
439
+ query_str, doc_num);
440
+ free(query_str);
441
+
442
+ if ((scorer = self->scorer(self, ir)) != NULL) {
443
+ tf_expl = scorer->explain(scorer, doc_num);
444
+ scorer->destroy(scorer);
445
+ }
446
+ else {
447
+ tf_expl = expl_new(0.0, "no terms were found");
448
+ }
449
+ expl_add_detail(field_expl, tf_expl);
450
+ expl_add_detail(field_expl, idf_expl2);
451
+
452
+ field_norms = ir->get_norms(ir, field_num);
453
+ field_norm = (field_norms != NULL)
454
+ ? sim_decode_norm(self->similarity, field_norms[doc_num])
455
+ : (float)0.0;
456
+ field_norm_expl = expl_new(field_norm, "field_norm(field=%s, doc=%d)",
457
+ field, doc_num);
458
+
459
+ expl_add_detail(field_expl, field_norm_expl);
460
+
461
+ field_expl->value = tf_expl->value * self->idf * field_norm;
462
+
463
+ /* combine them */
464
+ if (query_expl->value == 1.0) {
465
+ expl_destroy(expl);
466
+ return field_expl;
467
+ }
468
+ else {
469
+ expl->value = (query_expl->value * field_expl->value);
470
+ expl_add_detail(expl, field_expl);
471
+ return expl;
472
+ }
473
+ }
474
+
475
+ static Weight *multi_tw_new(Query *query, Searcher *searcher)
476
+ {
477
+ int i;
478
+ int doc_freq = 0;
479
+ Weight *self = w_new(Weight, query);
480
+ PriorityQueue *bt_pq = MTQ(query)->boosted_terms;
481
+
482
+ self->scorer = &multi_tw_scorer;
483
+ self->explain = &multi_tw_explain;
484
+ self->to_s = &multi_tw_to_s;
485
+
486
+ self->similarity = query->get_similarity(query, searcher);
487
+ self->value = query->boost;
488
+ self->idf = 0.0;
489
+
490
+ for (i = bt_pq->size; i > 0; i--) {
491
+ doc_freq += searcher->doc_freq(searcher, MTQ(query)->field,
492
+ ((BoostedTerm *)bt_pq->heap[i])->term);
493
+ }
494
+ self->idf += sim_idf(self->similarity, doc_freq,
495
+ searcher->max_doc(searcher));
496
+
497
+ return self;
498
+ }
499
+
500
+
501
+ /***************************************************************************
502
+ * MultiTermQuery
503
+ ***************************************************************************/
504
+
505
+ static char *multi_tq_to_s(Query *self, Symbol default_field)
506
+ {
507
+ int i;
508
+ PriorityQueue *boosted_terms = MTQ(self)->boosted_terms, *bt_pq_clone;
509
+ BoostedTerm *bt;
510
+ char *buffer, *bptr;
511
+ const char *field = S(MTQ(self)->field);
512
+ int flen = (int)strlen(field);
513
+ int tlen = 0;
514
+
515
+ /* Priority queues skip the first element */
516
+ for (i = boosted_terms->size; i > 0; i--) {
517
+ tlen += (int)strlen(((BoostedTerm *)boosted_terms->heap[i])->term) + 35;
518
+ }
519
+
520
+ bptr = buffer = ALLOC_N(char, tlen + flen + 35);
521
+
522
+ if (default_field != MTQ(self)->field) {
523
+ bptr += sprintf(bptr, "%s:", field);
524
+ }
525
+
526
+ *(bptr++) = '"';
527
+ bt_pq_clone = pq_clone(boosted_terms);
528
+ while ((bt = (BoostedTerm *)pq_pop(bt_pq_clone)) != NULL) {
529
+ bptr += sprintf(bptr, "%s", bt->term);
530
+
531
+ if (bt->boost != 1.0) {
532
+ *bptr = '^';
533
+ dbl_to_s(++bptr, bt->boost);
534
+ bptr += (int)strlen(bptr);
535
+ }
536
+
537
+ *(bptr++) = '|';
538
+ }
539
+ pq_destroy(bt_pq_clone);
540
+
541
+ if (bptr[-1] == '"') {
542
+ bptr++; /* handle zero term case */
543
+ }
544
+ bptr[-1] = '"'; /* delete last '|' char */
545
+ bptr[ 0] = '\0';
546
+
547
+ if (self->boost != 1.0) {
548
+ *bptr = '^';
549
+ dbl_to_s(++bptr, self->boost);
550
+ }
551
+
552
+ return buffer;
553
+ }
554
+
555
+ static void multi_tq_destroy_i(Query *self)
556
+ {
557
+ pq_destroy(MTQ(self)->boosted_terms);
558
+ q_destroy_i(self);
559
+ }
560
+
561
+ static void multi_tq_extract_terms(Query *self, HashSet *terms)
562
+ {
563
+ int i;
564
+ PriorityQueue *boosted_terms = MTQ(self)->boosted_terms;
565
+ for (i = boosted_terms->size; i > 0; i--) {
566
+ BoostedTerm *bt = (BoostedTerm *)boosted_terms->heap[i];
567
+ hs_add(terms, term_new(MTQ(self)->field, bt->term));
568
+ }
569
+ }
570
+
571
+ static unsigned long multi_tq_hash(Query *self)
572
+ {
573
+ int i;
574
+ unsigned long hash = sym_hash(MTQ(self)->field);
575
+ PriorityQueue *boosted_terms = MTQ(self)->boosted_terms;
576
+ for (i = boosted_terms->size; i > 0; i--) {
577
+ BoostedTerm *bt = (BoostedTerm *)boosted_terms->heap[i];
578
+ hash ^= str_hash(bt->term) ^ float2int(bt->boost);
579
+ }
580
+ return hash;
581
+ }
582
+
583
+ static int multi_tq_eq(Query *self, Query *o)
584
+ {
585
+ int i;
586
+ PriorityQueue *boosted_terms1 = MTQ(self)->boosted_terms;
587
+ PriorityQueue *boosted_terms2 = MTQ(o)->boosted_terms;
588
+
589
+ if ((MTQ(self)->field != MTQ(o)->field)
590
+ || boosted_terms1->size != boosted_terms2->size) {
591
+ return false;
592
+ }
593
+ for (i = boosted_terms1->size; i > 0; i--) {
594
+ BoostedTerm *bt1 = (BoostedTerm *)boosted_terms1->heap[i];
595
+ BoostedTerm *bt2 = (BoostedTerm *)boosted_terms2->heap[i];
596
+ if ((strcmp(bt1->term, bt2->term) != 0) || (bt1->boost != bt2->boost)) {
597
+ return false;
598
+ }
599
+ }
600
+ return true;
601
+ }
602
+
603
+ static MatchVector *multi_tq_get_matchv_i(Query *self, MatchVector *mv,
604
+ TermVector *tv)
605
+ {
606
+ if (tv->field == MTQ(self)->field) {
607
+ int i;
608
+ PriorityQueue *boosted_terms = MTQ(self)->boosted_terms;
609
+ for (i = boosted_terms->size; i > 0; i--) {
610
+ int j;
611
+ BoostedTerm *bt = (BoostedTerm *)boosted_terms->heap[i];
612
+ TVTerm *tv_term = tv_get_tv_term(tv, bt->term);
613
+ if (tv_term) {
614
+ for (j = 0; j < tv_term->freq; j++) {
615
+ int pos = tv_term->positions[j];
616
+ matchv_add(mv, pos, pos);
617
+ }
618
+ }
619
+ }
620
+ }
621
+ return mv;
622
+ }
623
+
624
+ Query *multi_tq_new_conf(Symbol field, int max_terms, float min_boost)
625
+ {
626
+ Query *self;
627
+
628
+ if (max_terms <= 0) {
629
+ RAISE(ARG_ERROR, ":max_terms must be greater than or equal to zero. "
630
+ "%d < 0. ", max_terms);
631
+ }
632
+
633
+ self = q_new(MultiTermQuery);
634
+
635
+ MTQ(self)->field = field;
636
+ MTQ(self)->boosted_terms = pq_new(max_terms,
637
+ (lt_ft)&boosted_term_less_than,
638
+ (free_ft)&boosted_term_destroy);
639
+ MTQ(self)->min_boost = min_boost;
640
+
641
+ self->type = MULTI_TERM_QUERY;
642
+ self->to_s = &multi_tq_to_s;
643
+ self->extract_terms = &multi_tq_extract_terms;
644
+ self->hash = &multi_tq_hash;
645
+ self->eq = &multi_tq_eq;
646
+ self->destroy_i = &multi_tq_destroy_i;
647
+ self->create_weight_i = &multi_tw_new;
648
+ self->get_matchv_i = &multi_tq_get_matchv_i;
649
+
650
+ return self;
651
+ }
652
+
653
+ Query *multi_tq_new(Symbol field)
654
+ {
655
+ return multi_tq_new_conf(field, MULTI_TERM_QUERY_MAX_TERMS, 0.0);
656
+ }
657
+
658
+ void multi_tq_add_term_boost(Query *self, const char *term, float boost)
659
+ {
660
+ if (boost > MTQ(self)->min_boost && term && term[0]) {
661
+ BoostedTerm *bt = boosted_term_new(term, boost);
662
+ PriorityQueue *bt_pq = MTQ(self)->boosted_terms;
663
+ pq_insert(bt_pq, bt);
664
+ if (pq_full(bt_pq)) {
665
+ MTQ(self)->min_boost = ((BoostedTerm *)pq_top(bt_pq))->boost;
666
+ }
667
+ }
668
+ }
669
+
670
+ void multi_tq_add_term(Query *self, const char *term)
671
+ {
672
+ multi_tq_add_term_boost(self, term, 1.0);
673
+ }