sdsykes-ferret 0.11.6.19

Sign up to get free protection for your applications and to get access to all the features.
Files changed (195) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +102 -0
  4. data/Rakefile +338 -0
  5. data/TODO +17 -0
  6. data/TUTORIAL +231 -0
  7. data/bin/ferret-browser +79 -0
  8. data/ext/analysis.c +1555 -0
  9. data/ext/analysis.h +219 -0
  10. data/ext/api.c +69 -0
  11. data/ext/api.h +27 -0
  12. data/ext/array.c +123 -0
  13. data/ext/array.h +53 -0
  14. data/ext/bitvector.c +540 -0
  15. data/ext/bitvector.h +272 -0
  16. data/ext/compound_io.c +383 -0
  17. data/ext/config.h +42 -0
  18. data/ext/document.c +156 -0
  19. data/ext/document.h +53 -0
  20. data/ext/except.c +120 -0
  21. data/ext/except.h +168 -0
  22. data/ext/extconf.rb +14 -0
  23. data/ext/ferret.c +402 -0
  24. data/ext/ferret.h +91 -0
  25. data/ext/filter.c +156 -0
  26. data/ext/fs_store.c +483 -0
  27. data/ext/global.c +418 -0
  28. data/ext/global.h +117 -0
  29. data/ext/hash.c +567 -0
  30. data/ext/hash.h +473 -0
  31. data/ext/hashset.c +170 -0
  32. data/ext/hashset.h +187 -0
  33. data/ext/header.h +58 -0
  34. data/ext/helper.c +62 -0
  35. data/ext/helper.h +13 -0
  36. data/ext/inc/lang.h +48 -0
  37. data/ext/inc/threading.h +31 -0
  38. data/ext/index.c +6425 -0
  39. data/ext/index.h +961 -0
  40. data/ext/lang.h +66 -0
  41. data/ext/libstemmer.c +92 -0
  42. data/ext/libstemmer.h +79 -0
  43. data/ext/mempool.c +87 -0
  44. data/ext/mempool.h +35 -0
  45. data/ext/modules.h +162 -0
  46. data/ext/multimapper.c +310 -0
  47. data/ext/multimapper.h +51 -0
  48. data/ext/posh.c +1006 -0
  49. data/ext/posh.h +1007 -0
  50. data/ext/priorityqueue.c +151 -0
  51. data/ext/priorityqueue.h +143 -0
  52. data/ext/q_boolean.c +1608 -0
  53. data/ext/q_const_score.c +161 -0
  54. data/ext/q_filtered_query.c +209 -0
  55. data/ext/q_fuzzy.c +268 -0
  56. data/ext/q_match_all.c +148 -0
  57. data/ext/q_multi_term.c +677 -0
  58. data/ext/q_parser.c +2825 -0
  59. data/ext/q_phrase.c +1126 -0
  60. data/ext/q_prefix.c +100 -0
  61. data/ext/q_range.c +350 -0
  62. data/ext/q_span.c +2402 -0
  63. data/ext/q_term.c +337 -0
  64. data/ext/q_wildcard.c +171 -0
  65. data/ext/r_analysis.c +2575 -0
  66. data/ext/r_index.c +3472 -0
  67. data/ext/r_qparser.c +585 -0
  68. data/ext/r_search.c +4105 -0
  69. data/ext/r_store.c +513 -0
  70. data/ext/r_utils.c +963 -0
  71. data/ext/ram_store.c +471 -0
  72. data/ext/search.c +1741 -0
  73. data/ext/search.h +885 -0
  74. data/ext/similarity.c +150 -0
  75. data/ext/similarity.h +82 -0
  76. data/ext/sort.c +983 -0
  77. data/ext/stem_ISO_8859_1_danish.c +338 -0
  78. data/ext/stem_ISO_8859_1_danish.h +16 -0
  79. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  80. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  81. data/ext/stem_ISO_8859_1_english.c +1156 -0
  82. data/ext/stem_ISO_8859_1_english.h +16 -0
  83. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  84. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  85. data/ext/stem_ISO_8859_1_french.c +1276 -0
  86. data/ext/stem_ISO_8859_1_french.h +16 -0
  87. data/ext/stem_ISO_8859_1_german.c +512 -0
  88. data/ext/stem_ISO_8859_1_german.h +16 -0
  89. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  90. data/ext/stem_ISO_8859_1_italian.h +16 -0
  91. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  92. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  93. data/ext/stem_ISO_8859_1_porter.c +776 -0
  94. data/ext/stem_ISO_8859_1_porter.h +16 -0
  95. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  96. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  97. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  98. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  99. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  100. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  101. data/ext/stem_KOI8_R_russian.c +701 -0
  102. data/ext/stem_KOI8_R_russian.h +16 -0
  103. data/ext/stem_UTF_8_danish.c +344 -0
  104. data/ext/stem_UTF_8_danish.h +16 -0
  105. data/ext/stem_UTF_8_dutch.c +653 -0
  106. data/ext/stem_UTF_8_dutch.h +16 -0
  107. data/ext/stem_UTF_8_english.c +1176 -0
  108. data/ext/stem_UTF_8_english.h +16 -0
  109. data/ext/stem_UTF_8_finnish.c +808 -0
  110. data/ext/stem_UTF_8_finnish.h +16 -0
  111. data/ext/stem_UTF_8_french.c +1296 -0
  112. data/ext/stem_UTF_8_french.h +16 -0
  113. data/ext/stem_UTF_8_german.c +526 -0
  114. data/ext/stem_UTF_8_german.h +16 -0
  115. data/ext/stem_UTF_8_italian.c +1113 -0
  116. data/ext/stem_UTF_8_italian.h +16 -0
  117. data/ext/stem_UTF_8_norwegian.c +302 -0
  118. data/ext/stem_UTF_8_norwegian.h +16 -0
  119. data/ext/stem_UTF_8_porter.c +794 -0
  120. data/ext/stem_UTF_8_porter.h +16 -0
  121. data/ext/stem_UTF_8_portuguese.c +1055 -0
  122. data/ext/stem_UTF_8_portuguese.h +16 -0
  123. data/ext/stem_UTF_8_russian.c +709 -0
  124. data/ext/stem_UTF_8_russian.h +16 -0
  125. data/ext/stem_UTF_8_spanish.c +1137 -0
  126. data/ext/stem_UTF_8_spanish.h +16 -0
  127. data/ext/stem_UTF_8_swedish.c +313 -0
  128. data/ext/stem_UTF_8_swedish.h +16 -0
  129. data/ext/stopwords.c +401 -0
  130. data/ext/store.c +692 -0
  131. data/ext/store.h +777 -0
  132. data/ext/term_vectors.c +352 -0
  133. data/ext/threading.h +31 -0
  134. data/ext/utilities.c +446 -0
  135. data/ext/win32.h +54 -0
  136. data/lib/ferret.rb +29 -0
  137. data/lib/ferret/browser.rb +246 -0
  138. data/lib/ferret/browser/s/global.js +192 -0
  139. data/lib/ferret/browser/s/style.css +148 -0
  140. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  141. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  142. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  143. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  144. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  145. data/lib/ferret/browser/views/layout.rhtml +22 -0
  146. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  147. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  148. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  149. data/lib/ferret/browser/webrick.rb +14 -0
  150. data/lib/ferret/document.rb +130 -0
  151. data/lib/ferret/field_infos.rb +44 -0
  152. data/lib/ferret/index.rb +786 -0
  153. data/lib/ferret/number_tools.rb +157 -0
  154. data/lib/ferret_version.rb +3 -0
  155. data/setup.rb +1555 -0
  156. data/test/test_all.rb +5 -0
  157. data/test/test_helper.rb +24 -0
  158. data/test/threading/number_to_spoken.rb +132 -0
  159. data/test/threading/thread_safety_index_test.rb +79 -0
  160. data/test/threading/thread_safety_read_write_test.rb +76 -0
  161. data/test/threading/thread_safety_test.rb +133 -0
  162. data/test/unit/analysis/tc_analyzer.rb +548 -0
  163. data/test/unit/analysis/tc_token_stream.rb +646 -0
  164. data/test/unit/index/tc_index.rb +762 -0
  165. data/test/unit/index/tc_index_reader.rb +699 -0
  166. data/test/unit/index/tc_index_writer.rb +437 -0
  167. data/test/unit/index/th_doc.rb +315 -0
  168. data/test/unit/largefile/tc_largefile.rb +46 -0
  169. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  170. data/test/unit/search/tc_filter.rb +135 -0
  171. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  172. data/test/unit/search/tc_index_searcher.rb +61 -0
  173. data/test/unit/search/tc_multi_searcher.rb +128 -0
  174. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  175. data/test/unit/search/tc_search_and_sort.rb +179 -0
  176. data/test/unit/search/tc_sort.rb +49 -0
  177. data/test/unit/search/tc_sort_field.rb +27 -0
  178. data/test/unit/search/tc_spans.rb +190 -0
  179. data/test/unit/search/tm_searcher.rb +384 -0
  180. data/test/unit/store/tc_fs_store.rb +77 -0
  181. data/test/unit/store/tc_ram_store.rb +35 -0
  182. data/test/unit/store/tm_store.rb +34 -0
  183. data/test/unit/store/tm_store_lock.rb +68 -0
  184. data/test/unit/tc_document.rb +81 -0
  185. data/test/unit/ts_analysis.rb +2 -0
  186. data/test/unit/ts_index.rb +2 -0
  187. data/test/unit/ts_largefile.rb +4 -0
  188. data/test/unit/ts_query_parser.rb +2 -0
  189. data/test/unit/ts_search.rb +2 -0
  190. data/test/unit/ts_store.rb +2 -0
  191. data/test/unit/ts_utils.rb +2 -0
  192. data/test/unit/utils/tc_bit_vector.rb +295 -0
  193. data/test/unit/utils/tc_number_tools.rb +117 -0
  194. data/test/unit/utils/tc_priority_queue.rb +106 -0
  195. metadata +285 -0
@@ -0,0 +1,148 @@
1
+ #include "search.h"
2
+ #include <string.h>
3
+
4
+ /***************************************************************************
5
+ *
6
+ * MatchAllScorer
7
+ *
8
+ ***************************************************************************/
9
+
10
+ #define MASc(scorer) ((MatchAllScorer *)(scorer))
11
+
12
+ typedef struct MatchAllScorer
13
+ {
14
+ Scorer super;
15
+ IndexReader *ir;
16
+ int max_doc;
17
+ float score;
18
+ } MatchAllScorer;
19
+
20
+ static float masc_score(Scorer *self)
21
+ {
22
+ return MASc(self)->score;
23
+ }
24
+
25
+ static bool masc_next(Scorer *self)
26
+ {
27
+ while (self->doc < (MASc(self)->max_doc - 1)) {
28
+ self->doc++;
29
+ if (!MASc(self)->ir->is_deleted(MASc(self)->ir, self->doc)) {
30
+ return true;
31
+ }
32
+ }
33
+ return false;
34
+ }
35
+
36
+ static bool masc_skip_to(Scorer *self, int doc_num)
37
+ {
38
+ self->doc = doc_num - 1;
39
+ return masc_next(self);
40
+ }
41
+
42
+ static Explanation *masc_explain(Scorer *self, int doc_num)
43
+ {
44
+ (void)self;
45
+ (void)doc_num;
46
+ return expl_new(1.0, "MatchAllScorer");
47
+ }
48
+
49
+ static Scorer *masc_new(Weight *weight, IndexReader *ir)
50
+ {
51
+ Scorer *self = scorer_new(MatchAllScorer, weight->similarity);
52
+
53
+ MASc(self)->ir = ir;
54
+ MASc(self)->max_doc = ir->max_doc(ir);
55
+ MASc(self)->score = weight->value;
56
+
57
+ self->doc = -1;
58
+ self->score = &masc_score;
59
+ self->next = &masc_next;
60
+ self->skip_to = &masc_skip_to;
61
+ self->explain = &masc_explain;
62
+ self->destroy = &scorer_destroy_i;
63
+
64
+ return self;
65
+ }
66
+
67
+ /***************************************************************************
68
+ *
69
+ * Weight
70
+ *
71
+ ***************************************************************************/
72
+
73
+ static char *maw_to_s(Weight *self)
74
+ {
75
+ return strfmt("MatchAllWeight(%f)", self->value);
76
+ }
77
+
78
+ static Explanation *maw_explain(Weight *self, IndexReader *ir, int doc_num)
79
+ {
80
+ Explanation *expl;
81
+ if (!ir->is_deleted(ir, doc_num)) {
82
+ expl = expl_new(self->value, "MatchAllQuery: product of:");
83
+ expl_add_detail(expl, expl_new(self->query->boost, "boost"));
84
+ expl_add_detail(expl, expl_new(self->qnorm, "query_norm"));
85
+ } else {
86
+ expl = expl_new(self->value,
87
+ "MatchAllQuery: doc %d was deleted", doc_num);
88
+ }
89
+
90
+ return expl;
91
+ }
92
+
93
+ static Weight *maw_new(Query *query, Searcher *searcher)
94
+ {
95
+ Weight *self = w_new(Weight, query);
96
+
97
+ self->scorer = &masc_new;
98
+ self->explain = &maw_explain;
99
+ self->to_s = &maw_to_s;
100
+
101
+ self->similarity = query->get_similarity(query, searcher);
102
+ self->idf = 1.0;
103
+
104
+ return self;
105
+ }
106
+
107
+ /***************************************************************************
108
+ *
109
+ * MatchAllQuery
110
+ *
111
+ ***************************************************************************/
112
+
113
+ char *maq_to_s(Query *self, const char *field)
114
+ {
115
+ (void)field;
116
+ if (self->boost == 1.0) {
117
+ return estrdup("*");
118
+ } else {
119
+ return strfmt("*^%f", self->boost);
120
+ }
121
+ }
122
+
123
+ static unsigned long maq_hash(Query *self)
124
+ {
125
+ (void)self;
126
+ return 0;
127
+ }
128
+
129
+ static int maq_eq(Query *self, Query *o)
130
+ {
131
+ (void)self; (void)o;
132
+ return true;
133
+ }
134
+
135
+ Query *maq_new()
136
+ {
137
+ Query *self = q_new(Query);
138
+
139
+ self->type = MATCH_ALL_QUERY;
140
+ self->to_s = &maq_to_s;
141
+ self->hash = &maq_hash;
142
+ self->eq = &maq_eq;
143
+ self->destroy_i = &q_destroy_i;
144
+ self->create_weight_i = &maw_new;
145
+
146
+ return self;
147
+ }
148
+
@@ -0,0 +1,677 @@
1
+ #include <string.h>
2
+ #include "search.h"
3
+ #include "priorityqueue.h"
4
+ #include "helper.h"
5
+
6
/* View a generic Query pointer as a MultiTermQuery. */
#define MTQ(q) ((MultiTermQuery *)(q))
7
+
8
+ /***************************************************************************
9
+ *
10
+ * MultiTerm
11
+ *
12
+ ***************************************************************************/
13
+
14
+ /***************************************************************************
15
+ * BoostedTerm
16
+ ***************************************************************************/
17
+
18
/* A term string paired with the boost applied to it. */
typedef struct BoostedTerm
{
    char  *term;  /* owned copy: estrdup'd in boosted_term_new(), freed in
                   * boosted_term_destroy() */
    float  boost; /* boost for this term */
} BoostedTerm;
23
+
24
+ static bool boosted_term_less_than(const BoostedTerm *bt1,
25
+ const BoostedTerm *bt2)
26
+ {
27
+ if (bt1->boost == bt2->boost) {
28
+ return (strcmp(bt1->term, bt2->term) < 0);
29
+ }
30
+
31
+ return (bt1->boost < bt2->boost);
32
+ }
33
+
34
+ static void boosted_term_destroy(BoostedTerm *self)
35
+ {
36
+ free(self->term);
37
+ free(self);
38
+ }
39
+
40
+ static BoostedTerm *boosted_term_new(const char *term, float boost)
41
+ {
42
+ BoostedTerm *self = ALLOC(BoostedTerm);
43
+ self->term = estrdup(term);
44
+ self->boost = boost;
45
+ return self;
46
+ }
47
+
48
+ /***************************************************************************
49
+ * TermDocEnumWrapper
50
+ ***************************************************************************/
51
+
52
+ #define TDE_READ_SIZE 16
53
+
54
+ typedef struct TermDocEnumWrapper
55
+ {
56
+ const char *term;
57
+ TermDocEnum *tde;
58
+ float boost;
59
+ int doc;
60
+ int freq;
61
+ int docs[TDE_READ_SIZE];
62
+ int freqs[TDE_READ_SIZE];
63
+ int pointer;
64
+ int pointer_max;
65
+ } TermDocEnumWrapper;
66
+
67
+ static bool tdew_less_than(const TermDocEnumWrapper *tdew1,
68
+ const TermDocEnumWrapper *tdew2)
69
+ {
70
+ return (tdew1->doc < tdew2->doc);
71
+ }
72
+
73
+ static bool tdew_next(TermDocEnumWrapper *self)
74
+ {
75
+ self->pointer++;
76
+ if (self->pointer >= self->pointer_max) {
77
+ /* refill buffer */
78
+ self->pointer_max = self->tde->read(self->tde, self->docs, self->freqs,
79
+ TDE_READ_SIZE);
80
+ if (self->pointer_max != 0) {
81
+ self->pointer = 0;
82
+ }
83
+ else {
84
+ return false;
85
+ }
86
+ }
87
+ self->doc = self->docs[self->pointer];
88
+ self->freq = self->freqs[self->pointer];
89
+ return true;
90
+ }
91
+
92
+ static bool tdew_skip_to(TermDocEnumWrapper *self, int doc_num)
93
+ {
94
+ TermDocEnum *tde = self->tde;
95
+
96
+ while (++(self->pointer) < self->pointer_max) {
97
+ if (self->docs[self->pointer] >= doc_num) {
98
+ self->doc = self->docs[self->pointer];
99
+ self->freq = self->freqs[self->pointer];
100
+ return true;
101
+ }
102
+ }
103
+
104
+ /* not found in cache, seek underlying stream */
105
+ if (tde->skip_to(tde, doc_num)) {
106
+ self->pointer_max = 1;
107
+ self->pointer = 0;
108
+ self->docs[0] = self->doc = tde->doc_num(tde);
109
+ self->freqs[0] = self->freq = tde->freq(tde);
110
+ return true;
111
+ }
112
+ else {
113
+ return false;
114
+ }
115
+ }
116
+
117
+ static void tdew_destroy(TermDocEnumWrapper *self)
118
+ {
119
+ self->tde->close(self->tde);
120
+ free(self);
121
+ }
122
+
123
+ static TermDocEnumWrapper *tdew_new(const char *term, TermDocEnum *tde,
124
+ float boost)
125
+ {
126
+ TermDocEnumWrapper *self = ALLOC_AND_ZERO(TermDocEnumWrapper);
127
+ self->term = term;
128
+ self->tde = tde;
129
+ self->boost = boost;
130
+ self->doc = -1;
131
+ return self;
132
+ }
133
+
134
+ /***************************************************************************
135
+ * MultiTermScorer
136
+ ***************************************************************************/
137
+
138
+ #define SCORE_CACHE_SIZE 32
139
+ #define MTSc(scorer) ((MultiTermScorer *)(scorer))
140
+
141
+ typedef struct MultiTermScorer
142
+ {
143
+ Scorer super;
144
+ const char *field;
145
+ uchar *norms;
146
+ Weight *weight;
147
+ TermDocEnumWrapper **tdew_a;
148
+ int tdew_cnt;
149
+ PriorityQueue *tdew_pq;
150
+ float weight_value;
151
+ float score_cache[SCORE_CACHE_SIZE];
152
+ float total_score;
153
+ } MultiTermScorer;
154
+
155
+ static float multi_tsc_score(Scorer *self)
156
+ {
157
+ return MTSc(self)->total_score * MTSc(self)->weight_value
158
+ * sim_decode_norm(self->similarity, MTSc(self)->norms[self->doc]);
159
+ }
160
+
161
+ static bool multi_tsc_next(Scorer *self)
162
+ {
163
+ int curr_doc;
164
+ float total_score = 0.0;
165
+ TermDocEnumWrapper *tdew;
166
+ MultiTermScorer *mtsc = MTSc(self);
167
+ PriorityQueue *tdew_pq = mtsc->tdew_pq;
168
+ if (tdew_pq == NULL) {
169
+ TermDocEnumWrapper **tdew_a = mtsc->tdew_a;
170
+ int i;
171
+ tdew_pq = pq_new(mtsc->tdew_cnt, (lt_ft)tdew_less_than, (free_ft)NULL);
172
+ for (i = mtsc->tdew_cnt - 1; i >= 0; i--) {
173
+ if (tdew_next(tdew_a[i])) {
174
+ pq_push(tdew_pq, tdew_a[i]);
175
+ }
176
+ }
177
+ mtsc->tdew_pq = tdew_pq;
178
+ }
179
+
180
+ tdew = (TermDocEnumWrapper *)pq_top(tdew_pq);
181
+ if (tdew == NULL) {
182
+ return false;
183
+ }
184
+
185
+ self->doc = curr_doc = tdew->doc;
186
+ do {
187
+ int freq = tdew->freq;
188
+ if (freq < SCORE_CACHE_SIZE) {
189
+ total_score += mtsc->score_cache[freq] * tdew->boost;
190
+ }
191
+ else {
192
+ total_score += sim_tf(self->similarity, (float)freq) * tdew->boost;
193
+ }
194
+
195
+ if (tdew_next(tdew)) {
196
+ pq_down(tdew_pq);
197
+ }
198
+ else {
199
+ pq_pop(tdew_pq);
200
+ }
201
+
202
+ } while (((tdew = (TermDocEnumWrapper *)pq_top(tdew_pq)) != NULL)
203
+ && tdew->doc == curr_doc);
204
+ mtsc->total_score = total_score;
205
+ return true;
206
+ }
207
+
208
+ static bool multi_tsc_advance_to(Scorer *self, int target_doc_num)
209
+ {
210
+ PriorityQueue *tdew_pq = MTSc(self)->tdew_pq;
211
+ TermDocEnumWrapper *tdew;
212
+ if (tdew_pq == NULL) {
213
+ MultiTermScorer *mtsc = MTSc(self);
214
+ TermDocEnumWrapper **tdew_a = mtsc->tdew_a;
215
+ int i;
216
+ tdew_pq = pq_new(mtsc->tdew_cnt, (lt_ft)tdew_less_than, (free_ft)NULL);
217
+ for (i = mtsc->tdew_cnt - 1; i >= 0; i--) {
218
+ tdew_skip_to(tdew_a[i], target_doc_num);
219
+ pq_push(tdew_pq, tdew_a[i]);
220
+ }
221
+ MTSc(self)->tdew_pq = tdew_pq;
222
+ }
223
+ if (tdew_pq->size == 0) {
224
+ self->doc = -1;
225
+ return false;
226
+ }
227
+ while ((tdew = (TermDocEnumWrapper *)pq_top(tdew_pq)) != NULL
228
+ && (target_doc_num > tdew->doc)) {
229
+ if (tdew_skip_to(tdew, target_doc_num)) {
230
+ pq_down(tdew_pq);
231
+ }
232
+ else {
233
+ pq_pop(tdew_pq);
234
+ }
235
+ }
236
+ return (pq_top(tdew_pq) == NULL) ? false : true;
237
+ }
238
+
239
+ static INLINE bool multi_tsc_skip_to(Scorer *self, int target_doc_num)
240
+ {
241
+ return multi_tsc_advance_to(self, target_doc_num) && multi_tsc_next(self);
242
+ }
243
+
244
+ static Explanation *multi_tsc_explain(Scorer *self, int doc_num)
245
+ {
246
+ MultiTermScorer *mtsc = MTSc(self);
247
+ TermDocEnumWrapper *tdew;
248
+
249
+ if (multi_tsc_advance_to(self, doc_num) &&
250
+ (tdew = (TermDocEnumWrapper *)pq_top(mtsc->tdew_pq))->doc == doc_num) {
251
+
252
+ PriorityQueue *tdew_pq = MTSc(self)->tdew_pq;
253
+ Explanation *expl = expl_new(0.0, "The sum of:");
254
+ int curr_doc = self->doc = tdew->doc;
255
+ float total_score = 0.0;
256
+
257
+ do {
258
+ int freq = tdew->freq;
259
+ expl_add_detail(expl,
260
+ expl_new(sim_tf(self->similarity, (float)freq) * tdew->boost,
261
+ "tf(term_freq(%s:%s)=%d)^%f",
262
+ mtsc->field, tdew->term, freq, tdew->boost));
263
+
264
+ total_score += sim_tf(self->similarity, (float)freq) * tdew->boost;
265
+
266
+ /* maintain tdew queue, even though it probably won't get used
267
+ * again */
268
+ if (tdew_next(tdew)) {
269
+ pq_down(tdew_pq);
270
+ }
271
+ else {
272
+ pq_pop(tdew_pq);
273
+ }
274
+
275
+ } while (((tdew = (TermDocEnumWrapper *)pq_top(tdew_pq)) != NULL)
276
+ && tdew->doc == curr_doc);
277
+ expl->value = total_score;
278
+ return expl;
279
+ }
280
+ else {
281
+ return expl_new(0.0, "None of the required terms exist in the index");
282
+ }
283
+ }
284
+
285
+ static void multi_tsc_destroy(Scorer *self)
286
+ {
287
+ int i;
288
+ TermDocEnumWrapper **tdew_a = MTSc(self)->tdew_a;
289
+ for (i = MTSc(self)->tdew_cnt - 1; i >= 0; i--) {
290
+ tdew_destroy(tdew_a[i]);
291
+ }
292
+ free(tdew_a);
293
+ if (MTSc(self)->tdew_pq) pq_destroy(MTSc(self)->tdew_pq);
294
+ scorer_destroy_i(self);
295
+ }
296
+
297
+ static Scorer *multi_tsc_new(Weight *weight, const char *field,
298
+ TermDocEnumWrapper **tdew_a, int tdew_cnt,
299
+ uchar *norms)
300
+ {
301
+ int i;
302
+ Scorer *self = scorer_new(MultiTermScorer, weight->similarity);
303
+
304
+ MTSc(self)->weight = weight;
305
+ MTSc(self)->field = field;
306
+ MTSc(self)->weight_value = weight->value;
307
+ MTSc(self)->tdew_a = tdew_a;
308
+ MTSc(self)->tdew_cnt = tdew_cnt;
309
+ MTSc(self)->norms = norms;
310
+
311
+ for (i = 0; i < SCORE_CACHE_SIZE; i++) {
312
+ MTSc(self)->score_cache[i] = sim_tf(self->similarity, (float)i);
313
+ }
314
+
315
+ self->score = &multi_tsc_score;
316
+ self->next = &multi_tsc_next;
317
+ self->skip_to = &multi_tsc_skip_to;
318
+ self->explain = &multi_tsc_explain;
319
+ self->destroy = &multi_tsc_destroy;
320
+
321
+ return self;
322
+ }
323
+
324
+ /***************************************************************************
325
+ * MultiTermWeight
326
+ ***************************************************************************/
327
+
328
+ static char *multi_tw_to_s(Weight *self)
329
+ {
330
+ return strfmt("MultiTermWeight(%f)", self->value);
331
+ }
332
+
333
+ static Scorer *multi_tw_scorer(Weight *self, IndexReader *ir)
334
+ {
335
+ Scorer *multi_tsc = NULL;
336
+ PriorityQueue *boosted_terms = MTQ(self->query)->boosted_terms;
337
+ const int field_num = fis_get_field_num(ir->fis, MTQ(self->query)->field);
338
+
339
+ if (boosted_terms->size > 0 && field_num >= 0) {
340
+ int i;
341
+ TermDocEnum *tde;
342
+ TermEnum *te = ir->terms(ir, field_num);
343
+ TermDocEnumWrapper **tdew_a = ALLOC_N(TermDocEnumWrapper *,
344
+ boosted_terms->size);
345
+ int tdew_cnt = 0;
346
+ /* Priority queues skip the first element */
347
+ for (i = boosted_terms->size; i > 0; i--) {
348
+ char *term;
349
+ BoostedTerm *bt = (BoostedTerm *)boosted_terms->heap[i];
350
+ if ((term = te->skip_to(te, bt->term)) != NULL
351
+ && strcmp(term, bt->term) == 0) {
352
+ tde = ir->term_docs(ir);
353
+ tde->seek_te(tde, te);
354
+ tdew_a[tdew_cnt++] = tdew_new(bt->term, tde, bt->boost);
355
+ }
356
+ }
357
+ te->close(te);
358
+ if (tdew_cnt) {
359
+ multi_tsc = multi_tsc_new(self, MTQ(self->query)->field, tdew_a,
360
+ tdew_cnt, ir_get_norms_i(ir, field_num));
361
+ }
362
+ else {
363
+ free(tdew_a);
364
+ }
365
+ }
366
+
367
+ return multi_tsc;
368
+ }
369
+
370
+ Explanation *multi_tw_explain(Weight *self, IndexReader *ir, int doc_num)
371
+ {
372
+ Explanation *expl;
373
+ Explanation *idf_expl1;
374
+ Explanation *idf_expl2;
375
+ Explanation *query_expl;
376
+ Explanation *qnorm_expl;
377
+ Explanation *field_expl;
378
+ Explanation *tf_expl;
379
+ Scorer *scorer;
380
+ uchar *field_norms;
381
+ float field_norm;
382
+ Explanation *field_norm_expl;
383
+
384
+ char *query_str;
385
+ MultiTermQuery *mtq = MTQ(self->query);
386
+ const char *field = mtq->field;
387
+ PriorityQueue *bt_pq = mtq->boosted_terms;
388
+ int i;
389
+ int total_doc_freqs = 0;
390
+ char *doc_freqs = NULL;
391
+ size_t len = 0, pos = 0;
392
+ const int field_num = fis_get_field_num(ir->fis, field);
393
+
394
+ if (field_num < 0) {
395
+ return expl_new(0.0, "field \"%s\" does not exist in the index", field);
396
+ }
397
+
398
+ query_str = self->query->to_s(self->query, "");
399
+
400
+ expl = expl_new(0.0, "weight(%s in %d), product of:", query_str, doc_num);
401
+
402
+ len = 30;
403
+ for (i = bt_pq->size; i > 0; i--) {
404
+ len += strlen(((BoostedTerm *)bt_pq->heap[i])->term) + 30;
405
+ }
406
+ doc_freqs = ALLOC_N(char, len);
407
+ for (i = bt_pq->size; i > 0; i--) {
408
+ char *term = ((BoostedTerm *)bt_pq->heap[i])->term;
409
+ int doc_freq = ir->doc_freq(ir, field_num, term);
410
+ sprintf(doc_freqs + pos, "(%s=%d) + ", term, doc_freq);
411
+ pos += strlen(doc_freqs + pos);
412
+ total_doc_freqs += doc_freq;
413
+ }
414
+ pos -= 2; /* remove " + " from the end */
415
+ sprintf(doc_freqs + pos, "= %d", total_doc_freqs);
416
+
417
+ idf_expl1 = expl_new(self->idf, "idf(%s:<%s>)", field, doc_freqs);
418
+ idf_expl2 = expl_new(self->idf, "idf(%s:<%s>)", field, doc_freqs);
419
+ free(doc_freqs);
420
+
421
+ /* explain query weight */
422
+ query_expl = expl_new(0.0, "query_weight(%s), product of:", query_str);
423
+
424
+ if (self->query->boost != 1.0) {
425
+ expl_add_detail(query_expl, expl_new(self->query->boost, "boost"));
426
+ }
427
+ expl_add_detail(query_expl, idf_expl1);
428
+
429
+ qnorm_expl = expl_new(self->qnorm, "query_norm");
430
+ expl_add_detail(query_expl, qnorm_expl);
431
+
432
+ query_expl->value = self->query->boost * self->idf * self->qnorm;
433
+
434
+ expl_add_detail(expl, query_expl);
435
+
436
+ /* explain field weight */
437
+ field_expl = expl_new(0.0, "field_weight(%s in %d), product of:",
438
+ query_str, doc_num);
439
+ free(query_str);
440
+
441
+ if ((scorer = self->scorer(self, ir)) != NULL) {
442
+ tf_expl = scorer->explain(scorer, doc_num);
443
+ scorer->destroy(scorer);
444
+ }
445
+ else {
446
+ tf_expl = expl_new(0.0, "no terms were found");
447
+ }
448
+ expl_add_detail(field_expl, tf_expl);
449
+ expl_add_detail(field_expl, idf_expl2);
450
+
451
+ field_norms = ir->get_norms(ir, field_num);
452
+ field_norm = (field_norms != NULL)
453
+ ? sim_decode_norm(self->similarity, field_norms[doc_num])
454
+ : (float)0.0;
455
+ field_norm_expl = expl_new(field_norm, "field_norm(field=%s, doc=%d)",
456
+ field, doc_num);
457
+
458
+ expl_add_detail(field_expl, field_norm_expl);
459
+
460
+ field_expl->value = tf_expl->value * self->idf * field_norm;
461
+
462
+ /* combine them */
463
+ if (query_expl->value == 1.0) {
464
+ expl_destroy(expl);
465
+ return field_expl;
466
+ }
467
+ else {
468
+ expl->value = (query_expl->value * field_expl->value);
469
+ expl_add_detail(expl, field_expl);
470
+ return expl;
471
+ }
472
+ }
473
+
474
+ static Weight *multi_tw_new(Query *query, Searcher *searcher)
475
+ {
476
+ int i;
477
+ int doc_freq = 0;
478
+ Weight *self = w_new(Weight, query);
479
+ const char *field = MTQ(query)->field;
480
+ PriorityQueue *bt_pq = MTQ(query)->boosted_terms;
481
+
482
+ self->scorer = &multi_tw_scorer;
483
+ self->explain = &multi_tw_explain;
484
+ self->to_s = &multi_tw_to_s;
485
+
486
+ self->similarity = query->get_similarity(query, searcher);
487
+ self->value = query->boost;
488
+ self->idf = 0.0;
489
+
490
+ for (i = bt_pq->size; i > 0; i--) {
491
+ doc_freq += searcher->doc_freq(searcher, field,
492
+ ((BoostedTerm *)bt_pq->heap[i])->term);
493
+ }
494
+ self->idf += sim_idf(self->similarity, doc_freq,
495
+ searcher->max_doc(searcher));
496
+
497
+ return self;
498
+ }
499
+
500
+
501
+ /***************************************************************************
502
+ * MultiTermQuery
503
+ ***************************************************************************/
504
+
505
+ static char *multi_tq_to_s(Query *self, const char *curr_field)
506
+ {
507
+ int i;
508
+ PriorityQueue *boosted_terms = MTQ(self)->boosted_terms, *bt_pq_clone;
509
+ BoostedTerm *bt;
510
+ char *buffer, *bptr;
511
+ char *field = MTQ(self)->field;
512
+ int flen = (int)strlen(field);
513
+ int tlen = 0;
514
+
515
+ /* Priority queues skip the first element */
516
+ for (i = boosted_terms->size; i > 0; i--) {
517
+ tlen += (int)strlen(((BoostedTerm *)boosted_terms->heap[i])->term) + 35;
518
+ }
519
+
520
+ bptr = buffer = ALLOC_N(char, tlen + flen + 35);
521
+
522
+ if (strcmp(curr_field, field) != 0) {
523
+ sprintf(bptr, "%s:", field);
524
+ bptr += flen + 1;
525
+ }
526
+
527
+ *(bptr++) = '"';
528
+ bt_pq_clone = pq_clone(boosted_terms);
529
+ while ((bt = (BoostedTerm *)pq_pop(bt_pq_clone)) != NULL) {
530
+ sprintf(bptr, "%s", bt->term);
531
+ bptr += (int)strlen(bptr);
532
+
533
+ if (bt->boost != 1.0) {
534
+ *bptr = '^';
535
+ dbl_to_s(++bptr, bt->boost);
536
+ bptr += (int)strlen(bptr);
537
+ }
538
+
539
+ *(bptr++) = '|';
540
+ }
541
+ pq_destroy(bt_pq_clone);
542
+
543
+ if (bptr[-1] == '"') {
544
+ bptr++; /* handle zero term case */
545
+ }
546
+ bptr[-1] = '"'; /* delete last '|' char */
547
+ bptr[ 0] = '\0';
548
+
549
+ if (self->boost != 1.0) {
550
+ *bptr = '^';
551
+ dbl_to_s(++bptr, self->boost);
552
+ }
553
+
554
+ return buffer;
555
+ }
556
+
557
+ static void multi_tq_destroy_i(Query *self)
558
+ {
559
+ free(MTQ(self)->field);
560
+ pq_destroy(MTQ(self)->boosted_terms);
561
+ q_destroy_i(self);
562
+ }
563
+
564
+ static void multi_tq_extract_terms(Query *self, HashSet *terms)
565
+ {
566
+ int i;
567
+ char *field = MTQ(self)->field;
568
+ PriorityQueue *boosted_terms = MTQ(self)->boosted_terms;
569
+ for (i = boosted_terms->size; i > 0; i--) {
570
+ BoostedTerm *bt = (BoostedTerm *)boosted_terms->heap[i];
571
+ hs_add(terms, term_new(field, bt->term));
572
+ }
573
+ }
574
+
575
+ static unsigned long multi_tq_hash(Query *self)
576
+ {
577
+ int i;
578
+ unsigned long hash = str_hash(MTQ(self)->field);
579
+ PriorityQueue *boosted_terms = MTQ(self)->boosted_terms;
580
+ for (i = boosted_terms->size; i > 0; i--) {
581
+ BoostedTerm *bt = (BoostedTerm *)boosted_terms->heap[i];
582
+ hash ^= str_hash(bt->term) ^ float2int(bt->boost);
583
+ }
584
+ return hash;
585
+ }
586
+
587
+ static int multi_tq_eq(Query *self, Query *o)
588
+ {
589
+ int i;
590
+ PriorityQueue *boosted_terms1 = MTQ(self)->boosted_terms;
591
+ PriorityQueue *boosted_terms2 = MTQ(o)->boosted_terms;
592
+
593
+ if (strcmp(MTQ(self)->field, MTQ(o)->field) != 0
594
+ || boosted_terms1->size != boosted_terms2->size) {
595
+ return false;
596
+ }
597
+ for (i = boosted_terms1->size; i > 0; i--) {
598
+ BoostedTerm *bt1 = (BoostedTerm *)boosted_terms1->heap[i];
599
+ BoostedTerm *bt2 = (BoostedTerm *)boosted_terms2->heap[i];
600
+ if ((strcmp(bt1->term, bt2->term) != 0) || (bt1->boost != bt2->boost)) {
601
+ return false;
602
+ }
603
+ }
604
+ return true;
605
+ }
606
+
607
+ static MatchVector *multi_tq_get_matchv_i(Query *self, MatchVector *mv,
608
+ TermVector *tv)
609
+ {
610
+ if (strcmp(tv->field, MTQ(self)->field) == 0) {
611
+ int i;
612
+ PriorityQueue *boosted_terms = MTQ(self)->boosted_terms;
613
+ for (i = boosted_terms->size; i > 0; i--) {
614
+ int j;
615
+ BoostedTerm *bt = (BoostedTerm *)boosted_terms->heap[i];
616
+ TVTerm *tv_term = tv_get_tv_term(tv, bt->term);
617
+ if (tv_term) {
618
+ for (j = 0; j < tv_term->freq; j++) {
619
+ int pos = tv_term->positions[j];
620
+ matchv_add(mv, pos, pos);
621
+ }
622
+ }
623
+ }
624
+ }
625
+ return mv;
626
+ }
627
+
628
+ Query *multi_tq_new_conf(const char *field, int max_terms, float min_boost)
629
+ {
630
+ Query *self;
631
+
632
+ if (max_terms <= 0) {
633
+ RAISE(ARG_ERROR, ":max_terms must be greater than or equal to zero. "
634
+ "%d < 0. ", max_terms);
635
+ }
636
+
637
+ self = q_new(MultiTermQuery);
638
+
639
+ MTQ(self)->field = estrdup(field);
640
+ MTQ(self)->boosted_terms = pq_new(max_terms,
641
+ (lt_ft)&boosted_term_less_than,
642
+ (free_ft)&boosted_term_destroy);
643
+ MTQ(self)->min_boost = min_boost;
644
+
645
+ self->type = MULTI_TERM_QUERY;
646
+ self->to_s = &multi_tq_to_s;
647
+ self->extract_terms = &multi_tq_extract_terms;
648
+ self->hash = &multi_tq_hash;
649
+ self->eq = &multi_tq_eq;
650
+ self->destroy_i = &multi_tq_destroy_i;
651
+ self->create_weight_i = &multi_tw_new;
652
+ self->get_matchv_i = &multi_tq_get_matchv_i;
653
+
654
+ return self;
655
+ }
656
+
657
+ Query *multi_tq_new(const char *field)
658
+ {
659
+ return multi_tq_new_conf(field, MULTI_TERM_QUERY_MAX_TERMS, 0.0);
660
+ }
661
+
662
+ void multi_tq_add_term_boost(Query *self, const char *term, float boost)
663
+ {
664
+ if (boost > MTQ(self)->min_boost && term && term[0]) {
665
+ BoostedTerm *bt = boosted_term_new(term, boost);
666
+ PriorityQueue *bt_pq = MTQ(self)->boosted_terms;
667
+ pq_insert(bt_pq, bt);
668
+ if (pq_full(bt_pq)) {
669
+ MTQ(self)->min_boost = ((BoostedTerm *)pq_top(bt_pq))->boost;
670
+ }
671
+ }
672
+ }
673
+
674
+ void multi_tq_add_term(Query *self, const char *term)
675
+ {
676
+ multi_tq_add_term_boost(self, term, 1.0);
677
+ }