ferret 0.9.6 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/search.c CHANGED
@@ -1,8 +1,6 @@
1
1
  #include <string.h>
2
2
  #include "search.h"
3
-
4
- static char * const NUM_DOCS_ARG_ERROR_MSG = "num_docs must be > 0 to run a search";
5
- static char * const FIRST_DOC_ARG_ERROR_MSG = "first_doc must be >= 0 to run a search";
3
+ #include "array.h"
6
4
 
7
5
  /***************************************************************************
8
6
  *
@@ -10,67 +8,65 @@ static char * const FIRST_DOC_ARG_ERROR_MSG = "first_doc must be >= 0 to run a s
10
8
  *
11
9
  ***************************************************************************/
12
10
 
13
- Explanation *expl_create(float value, char *description)
11
+ Explanation *expl_new(float value, const char *description, ...)
14
12
  {
15
- Explanation *self = ALLOC(Explanation);
16
- self->value = value;
17
- self->description = description;
18
- self->dcnt = 0;
19
- self->dcapa = EXPLANATION_DETAILS_START_SIZE;
20
- self->details = ALLOC_N(Explanation *, EXPLANATION_DETAILS_START_SIZE);
21
- return self;
13
+ Explanation *expl = ALLOC(Explanation);
14
+
15
+ va_list args;
16
+ va_start(args, description);
17
+ expl->description = vstrfmt(description, args);
18
+ va_end(args);
19
+
20
+ expl->value = value;
21
+ expl->details = ary_new_type_capa(Explanation *,
22
+ EXPLANATION_DETAILS_START_SIZE);
23
+ return expl;
22
24
  }
23
25
 
24
- void expl_destoy(void *p)
26
+ void expl_destroy(Explanation *expl)
25
27
  {
26
- Explanation *expl = (Explanation *)p;
27
- int i;
28
- for (i = 0; i < expl->dcnt; i++) {
29
- expl_destoy(expl->details[i]);
30
- }
31
- free(expl->details);
32
- free(expl->description);
33
- free(expl);
28
+ ary_destroy((void **)expl->details, (free_ft)expl_destroy);
29
+ free(expl->description);
30
+ free(expl);
34
31
  }
35
32
 
36
- Explanation *expl_add_detail(Explanation *self, Explanation *detail)
33
+ Explanation *expl_add_detail(Explanation *expl, Explanation *detail)
37
34
  {
38
- if (self->dcnt >= self->dcapa) {
39
- self->dcapa *= 2;
40
- REALLOC_N(self->details, Explanation *, self->dcapa);
41
- }
42
- self->details[self->dcnt] = detail;
43
- self->dcnt++;
44
- return self;
35
+ ary_push(expl->details, detail);
36
+ return expl;
45
37
  }
46
38
 
47
- char *expl_to_s(Explanation *self, int depth)
39
+ char *expl_to_s_depth(Explanation *expl, int depth)
48
40
  {
49
- int i;
50
- char *buffer = ALLOC_N(char, depth * 2 + 1);
51
- memset(buffer, ' ', sizeof(char) * depth * 2);
52
- buffer[depth*2] = 0;
41
+ int i;
42
+ char *buffer = ALLOC_N(char, depth * 2 + 1);
43
+ const int num_details = ary_size(expl->details);
44
+
45
+ memset(buffer, ' ', sizeof(char) * depth * 2);
46
+ buffer[depth*2] = 0;
53
47
 
54
- buffer = estrcat(buffer, strfmt("%f = %s\n", self->value, self->description));
55
- for (i = 0; i < self->dcnt; i++) {
56
- buffer = estrcat(buffer, expl_to_s(self->details[i], depth + 1));
57
- }
48
+ buffer = estrcat(buffer, strfmt("%f = %s\n", expl->value, expl->description));
49
+ for (i = 0; i < num_details; i++) {
50
+ buffer = estrcat(buffer, expl_to_s_depth(expl->details[i], depth + 1));
51
+ }
58
52
 
59
- return buffer;
53
+ return buffer;
60
54
  }
61
55
 
62
- char *expl_to_html(Explanation *self)
56
+ char *expl_to_html(Explanation *expl)
63
57
  {
64
- int i;
65
- char *buffer;
66
- buffer = strfmt("<ul>\n<li>%f = %s</li>\n", self->value, self->description);
58
+ int i;
59
+ char *buffer;
60
+ const int num_details = ary_size(expl->details);
61
+
62
+ buffer = strfmt("<ul>\n<li>%f = %s</li>\n", expl->value, expl->description);
67
63
 
68
- for (i = 0; i < self->dcnt; i++) {
69
- estrcat(buffer, expl_to_html(self->details[i]));
70
- }
64
+ for (i = 0; i < num_details; i++) {
65
+ estrcat(buffer, expl_to_html(expl->details[i]));
66
+ }
71
67
 
72
- REALLOC_N(buffer, char, strlen(buffer) + 10);
73
- return strcat(buffer, "</ul>\n");
68
+ REALLOC_N(buffer, char, strlen(buffer) + 10);
69
+ return strcat(buffer, "</ul>\n");
74
70
  }
75
71
 
76
72
  /***************************************************************************
@@ -79,88 +75,104 @@ char *expl_to_html(Explanation *self)
79
75
  *
80
76
  ***************************************************************************/
81
77
 
82
- bool hit_less_than(void *hit1, void *hit2)
78
+ static bool hit_less_than(const Hit *hit1, const Hit *hit2)
83
79
  {
84
- if (((Hit *)hit1)->score == ((Hit *)hit2)->score) {
85
- return ((Hit *)hit1)->doc > ((Hit *)hit2)->doc;
86
- } else {
87
- return ((Hit *)hit1)->score < ((Hit *)hit2)->score;
88
- }
80
+ if (hit1->score == hit2->score) {
81
+ return hit1->doc > hit2->doc;
82
+ }
83
+ else {
84
+ return hit1->score < hit2->score;
85
+ }
89
86
  }
90
87
 
91
- inline bool hit_lt(Hit *hit1, Hit *hit2)
88
+ static bool hit_lt(Hit *hit1, Hit *hit2)
92
89
  {
93
- if (hit1->score == hit2->score) {
94
- return hit1->doc > hit2->doc;
95
- } else {
96
- return hit1->score < hit2->score;
97
- }
90
+ if (hit1->score == hit2->score) {
91
+ return hit1->doc > hit2->doc;
92
+ }
93
+ else {
94
+ return hit1->score < hit2->score;
95
+ }
98
96
  }
99
97
 
100
- void hit_pq_down(PriorityQueue *pq)
98
+ static void hit_pq_down(PriorityQueue *pq)
101
99
  {
102
- register int i = 1;
103
- register int j = 2; //i << 1;
104
- register int k = 3; //j + 1;
105
- Hit **heap = (Hit **)pq->heap;
106
- Hit *node = heap[i]; // save top node
100
+ register int i = 1;
101
+ register int j = 2; /* i << 1; */
102
+ register int k = 3; /* j + 1; */
103
+ Hit **heap = (Hit **)pq->heap;
104
+ Hit *node = heap[i]; /* save top node */
107
105
 
108
- if ((k <= pq->count) && hit_lt(heap[k], heap[j]))
109
- j = k;
106
+ if ((k <= pq->size) && hit_lt(heap[k], heap[j])) {
107
+ j = k;
108
+ }
110
109
 
111
- while ((j <= pq->count) && hit_lt(heap[j], node)) {
112
- heap[i] = heap[j]; // shift up child
113
- i = j;
114
- j = i << 1;
115
- k = j + 1;
116
- if ((k <= pq->count) && hit_lt(heap[k], heap[j]))
117
- j = k;
118
- }
119
- heap[i] = node;
110
+ while ((j <= pq->size) && hit_lt(heap[j], node)) {
111
+ heap[i] = heap[j]; /* shift up child */
112
+ i = j;
113
+ j = i << 1;
114
+ k = j + 1;
115
+ if ((k <= pq->size) && hit_lt(heap[k], heap[j])) {
116
+ j = k;
117
+ }
118
+ }
119
+ heap[i] = node;
120
120
  }
121
121
 
122
- Hit *hit_pq_pop(PriorityQueue *pq)
122
+ static Hit *hit_pq_pop(PriorityQueue *pq)
123
123
  {
124
- if (pq->count > 0) {
125
- Hit *result = (Hit *)pq->heap[1]; // save first value
126
- pq->heap[1] = pq->heap[pq->count]; // move last to first
127
- pq->heap[pq->count] = NULL;
128
- pq->count--;
129
- hit_pq_down(pq); // adjust heap
130
- return result;
131
- } else {
132
- return NULL;
133
- }
124
+ if (pq->size > 0) {
125
+ Hit *result = (Hit *)pq->heap[1]; /* save first value */
126
+ pq->heap[1] = pq->heap[pq->size]; /* move last to first */
127
+ pq->heap[pq->size] = NULL;
128
+ pq->size--;
129
+ hit_pq_down(pq); /* adjust heap */
130
+ return result;
131
+ }
132
+ else {
133
+ return NULL;
134
+ }
134
135
  }
135
136
 
136
- inline void hit_pq_up(PriorityQueue *pq)
137
+ static void hit_pq_up(PriorityQueue *pq)
137
138
  {
138
- Hit **heap = (Hit **)pq->heap;
139
- Hit *node;
140
- int i = pq->count;
141
- int j = i >> 1;
142
- node = heap[i];
139
+ Hit **heap = (Hit **)pq->heap;
140
+ Hit *node;
141
+ int i = pq->size;
142
+ int j = i >> 1;
143
+ node = heap[i];
144
+
145
+ while ((j > 0) && hit_lt(node, heap[j])) {
146
+ heap[i] = heap[j];
147
+ i = j;
148
+ j = j >> 1;
149
+ }
150
+ heap[i] = node;
151
+ }
143
152
 
144
- while ((j > 0) && hit_lt(node, heap[j])) {
145
- heap[i] = heap[j];
146
- i = j;
147
- j = j >> 1;
148
- }
149
- heap[i] = node;
153
+ static void hit_pq_insert(PriorityQueue *pq, Hit *hit)
154
+ {
155
+ if (pq->size < pq->capa) {
156
+ Hit *new_hit = ALLOC(Hit);
157
+ memcpy(new_hit, hit, sizeof(Hit));
158
+ pq->size++;
159
+ if (pq->size >= pq->mem_capa) {
160
+ pq->mem_capa <<= 1;
161
+ REALLOC_N(pq->heap, void *, pq->mem_capa);
162
+ }
163
+ pq->heap[pq->size] = new_hit;
164
+ hit_pq_up(pq);
165
+ }
166
+ else if (pq->size > 0 && hit_lt((Hit *)pq->heap[1], hit)) {
167
+ memcpy(pq->heap[1], hit, sizeof(Hit));
168
+ hit_pq_down(pq);
169
+ }
150
170
  }
151
171
 
152
- void hit_pq_insert(PriorityQueue *pq, Hit *hit)
172
+ static void hit_pq_multi_insert(PriorityQueue *pq, Hit *hit)
153
173
  {
154
- if (pq->count < pq->size) {
155
- Hit *new_hit = ALLOC(Hit);
156
- memcpy(new_hit, hit, sizeof(Hit));
157
- pq->count++;
158
- pq->heap[pq->count] = new_hit;
159
- hit_pq_up(pq);
160
- } else if (pq->count > 0 && hit_lt((Hit *)pq->heap[1], hit)) {
161
- memcpy(pq->heap[1], hit, sizeof(Hit));
162
- hit_pq_down(pq);
163
- }
174
+ hit_pq_insert(pq, hit);
175
+ free(hit);
164
176
  }
165
177
 
166
178
  /***************************************************************************
@@ -169,35 +181,38 @@ void hit_pq_insert(PriorityQueue *pq, Hit *hit)
169
181
  *
170
182
  ***************************************************************************/
171
183
 
172
- TopDocs *td_create(int total_hits, int size, Hit **hits)
184
+ TopDocs *td_new(int total_hits, int size, Hit **hits, float max_score)
173
185
  {
174
- TopDocs *td = ALLOC(TopDocs);
175
- td->total_hits = total_hits;
176
- td->size = size;
177
- td->hits = hits;
178
- return td;
186
+ TopDocs *td = ALLOC(TopDocs);
187
+ td->total_hits = total_hits;
188
+ td->size = size;
189
+ td->hits = hits;
190
+ td->max_score = max_score;
191
+ return td;
179
192
  }
180
193
 
181
194
  void td_destroy(TopDocs *td)
182
195
  {
183
- int i;
184
- for (i = 0; i < td->size; i++) {
185
- free(td->hits[i]);
186
- }
187
- free(td->hits);
188
- free(td);
196
+ int i;
197
+
198
+ for (i = 0; i < td->size; i++) {
199
+ free(td->hits[i]);
200
+ }
201
+ free(td->hits);
202
+ free(td);
189
203
  }
190
204
 
191
205
  char *td_to_s(TopDocs *td)
192
206
  {
193
- int i;
194
- Hit *hit;
195
- char *buffer = strfmt("%d hits sorted by <score, doc_num>\n", td->total_hits);
196
- for (i = 0; i < td->size; i++) {
197
- hit = td->hits[i];
198
- estrcat(buffer, strfmt("\t%d:%f\n", hit->doc, hit->score));
199
- }
200
- return buffer;
207
+ int i;
208
+ Hit *hit;
209
+ char *buffer = strfmt("%d hits sorted by <score, doc_num>\n",
210
+ td->total_hits);
211
+ for (i = 0; i < td->size; i++) {
212
+ hit = td->hits[i];
213
+ estrcat(buffer, strfmt("\t%d:%f\n", hit->doc, hit->score));
214
+ }
215
+ return buffer;
201
216
  }
202
217
 
203
218
  /***************************************************************************
@@ -208,44 +223,50 @@ char *td_to_s(TopDocs *td)
208
223
 
209
224
  Query *w_get_query(Weight *self)
210
225
  {
211
- return self->query;
226
+ return self->query;
212
227
  }
213
228
 
214
229
  float w_get_value(Weight *self)
215
230
  {
216
- return self->value;
231
+ return self->value;
217
232
  }
218
233
 
219
234
  float w_sum_of_squared_weights(Weight *self)
220
235
  {
221
- self->qweight = self->idf * self->query->boost;
222
- return self->qweight * self->qweight; // square it
236
+ self->qweight = self->idf * self->query->boost;
237
+ return self->qweight * self->qweight; /* square it */
223
238
  }
224
239
 
225
240
  void w_normalize(Weight *self, float normalization_factor)
226
241
  {
227
- self->qnorm = normalization_factor;
228
- self->qweight *= normalization_factor; // normalize query weight
229
- self->value = self->qweight * self->idf; // idf for document
242
+ self->qnorm = normalization_factor;
243
+ self->qweight *= normalization_factor; /* normalize query weight */
244
+ self->value = self->qweight * self->idf;/* idf for document */
230
245
  }
231
246
 
232
247
  void w_destroy(Weight *self)
233
248
  {
234
- q_deref(self->query);
235
- free(self);
249
+ q_deref(self->query);
250
+ free(self);
236
251
  }
237
252
 
238
- Weight *w_create(Query *query)
253
+ Weight *w_create(size_t size, Query *query)
239
254
  {
240
- Weight *self = ALLOC_AND_ZERO_N(Weight, 1);
241
- ref(query);
242
- self->query = query;
243
-
244
- self->get_query = &w_get_query;
245
- self->get_value = &w_get_value;
246
- self->normalize = &w_normalize;
247
- self->destroy = &w_destroy;
248
- return self;
255
+ Weight *self = (Weight *)ecalloc(size);
256
+ #ifdef DEBUG
257
+ if (size < sizeof(Weight)) {
258
+ RAISE(ERROR, "size of weight <%d> should be at least <%d>",
259
+ (int)size, (int)sizeof(Weight));
260
+ }
261
+ #endif
262
+ REF(query);
263
+ self->query = query;
264
+ self->get_query = &w_get_query;
265
+ self->get_value = &w_get_value;
266
+ self->normalize = &w_normalize;
267
+ self->destroy = &w_destroy;
268
+ self->sum_of_squared_weights = &w_sum_of_squared_weights;
269
+ return self;
249
270
  }
250
271
 
251
272
  /***************************************************************************
@@ -254,128 +275,181 @@ Weight *w_create(Query *query)
254
275
  *
255
276
  ***************************************************************************/
256
277
 
257
- Similarity *q_get_similarity_i(Query *self, Searcher *searcher)
258
- {
259
- return searcher->get_similarity(searcher);
260
- }
278
+ static const char *QUERY_NAMES[] = {
279
+ "TermQuery",
280
+ "MultiTermQuery",
281
+ "BooleanQuery",
282
+ "PhraseQuery",
283
+ "MultiPhraseQuery",
284
+ "ConstantScoreQuery",
285
+ "FilteredQuery",
286
+ "MatchAllQuery",
287
+ "RangeQuery",
288
+ "WildCardQuery",
289
+ "FuzzyQuery",
290
+ "PrefixQuery",
291
+ "SpanTermQuery",
292
+ "SpanFirstQuery",
293
+ "SpanOrQuery",
294
+ "SpanNotQuery",
295
+ "SpanNearQuery"
296
+ };
261
297
 
262
- Query *q_rewrite(Query *self, IndexReader *ir)
263
- {
264
- self->ref_cnt++;
265
- return self;
298
+ static const char *UNKNOWN_QUERY_NAME = "UnkownQuery";
299
+
300
+ const char *q_get_query_name(enum QUERY_TYPE type) {
301
+ if (type >= NELEMS(QUERY_NAMES)) {
302
+ return UNKNOWN_QUERY_NAME;
303
+ }
304
+ else {
305
+ return QUERY_NAMES[type];
306
+ }
266
307
  }
267
308
 
268
- Weight *q_weight(Query *self, Searcher *searcher)
309
+ static Query *q_rewrite(Query *self, IndexReader *ir)
269
310
  {
270
- Query *query = searcher->rewrite(searcher, self);
271
- Weight *weight = query->create_weight_i(query, searcher);
272
- float sum = weight->sum_of_squared_weights(weight);
273
- Similarity *sim = query->get_similarity(query, searcher);
274
- float norm = sim_query_norm(sim, sum);
275
- q_deref(query);
276
-
277
- weight->normalize(weight, norm);
278
- return self->weight = weight;
311
+ (void)ir;
312
+ self->ref_cnt++;
313
+ return self;
279
314
  }
280
315
 
281
- Weight *q_create_weight_unsup(Query *self, Searcher *searcher)
316
+ static void q_extract_terms(Query *self, HashSet *terms)
282
317
  {
283
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
284
- return NULL;
318
+ /* do nothing by default */
319
+ (void)self;
320
+ (void)terms;
285
321
  }
286
322
 
287
- void q_destroy_i(Query *self)
323
+ Similarity *q_get_similarity_i(Query *self, Searcher *searcher)
288
324
  {
289
- free(self);
325
+ (void)self;
326
+ return searcher->get_similarity(searcher);
290
327
  }
291
328
 
292
- void q_extract_terms(Query *self, HashSet *terms)
329
+ void q_destroy_i(Query *self)
293
330
  {
294
- /* do nothing by default */
331
+ free(self);
295
332
  }
296
333
 
297
334
  void q_deref(Query *self)
298
335
  {
299
- if (--self->ref_cnt == 0) {
300
- self->destroy_i(self);
301
- }
336
+ if (--(self->ref_cnt) == 0) {
337
+ self->destroy_i(self);
338
+ }
302
339
  }
303
340
 
304
- Query *q_create()
341
+ Weight *q_create_weight_unsup(Query *self, Searcher *searcher)
305
342
  {
306
- Query *self = ALLOC(Query);
307
- ZEROSET(self, Query, 1);
308
- self->destroy_all = true;
309
- self->boost = 1.0;
310
- self->rewrite = &q_rewrite;
311
- self->get_similarity = &q_get_similarity_i;
312
- self->extract_terms = &q_extract_terms;
313
- self->weight = NULL;
314
- self->ref_cnt = 1;
315
- return self;
343
+ (void)self;
344
+ (void)searcher;
345
+ RAISE(UNSUPPORTED_ERROR,
346
+ "Create weight is unsupported for this type of query");
347
+ return NULL;
316
348
  }
317
349
 
318
- uint q_hash(Query *self)
350
+ Weight *q_weight(Query *self, Searcher *searcher)
319
351
  {
320
- return (self->hash(self) << 4) | self->type;
321
- }
352
+ Query *query = searcher->rewrite(searcher, self);
353
+ Weight *weight = query->create_weight_i(query, searcher);
354
+ float sum = weight->sum_of_squared_weights(weight);
355
+ Similarity *sim = query->get_similarity(query, searcher);
356
+ float norm = sim_query_norm(sim, sum);
357
+ q_deref(query);
322
358
 
323
- int q_eq(Query *self, Query *o)
324
- {
325
- return (self == o) || ((self->type == o->type) &&
326
- (self->boost == o->boost) &&
327
- self->eq(self, o));
359
+ weight->normalize(weight, norm);
360
+ return self->weight = weight;
328
361
  }
329
362
 
363
+ #define BQ(query) ((BooleanQuery *)(query))
330
364
  Query *q_combine(Query **queries, int q_cnt)
331
365
  {
332
- int i;
333
- Query *q, *ret_q;
334
- HashSet *uniques =
335
- hs_create((hash_ft)&q_hash, (eq_ft)&q_eq, NULL);
336
-
337
- for (i = 0; i < q_cnt; i++) {
338
- q = queries[i];
339
- if (q->type == BOOLEAN_QUERY) {
340
- int j;
341
- bool splittable = true;
342
- BooleanQuery *bq = (BooleanQuery *)q->data;
343
- if (bq->coord_disabled == false) {
344
- splittable = false;
345
- } else {
346
- for (j = 0; j < bq->clause_cnt; j++) {
347
- if (bq->clauses[j]->occur != BC_SHOULD) {
348
- splittable = false;
349
- break;
350
- }
366
+ int i;
367
+ Query *q, *ret_q;
368
+ HashSet *uniques = hs_new((hash_ft)&q_hash, (eq_ft)&q_eq, NULL);
369
+
370
+ for (i = 0; i < q_cnt; i++) {
371
+ q = queries[i];
372
+ if (q->type == BOOLEAN_QUERY) {
373
+ int j;
374
+ bool splittable = true;
375
+ if (BQ(q)->coord_disabled == false) {
376
+ splittable = false;
377
+ }
378
+ else {
379
+ for (j = 0; j < BQ(q)->clause_cnt; j++) {
380
+ if (BQ(q)->clauses[j]->occur != BC_SHOULD) {
381
+ splittable = false;
382
+ break;
383
+ }
384
+ }
385
+ }
386
+ if (splittable) {
387
+ for (j = 0; j < BQ(q)->clause_cnt; j++) {
388
+ Query *sub_q = BQ(q)->clauses[j]->query;
389
+ hs_add(uniques, sub_q);
390
+ }
391
+ }
392
+ else {
393
+ hs_add(uniques, q);
394
+ }
351
395
  }
352
- }
353
- if (splittable) {
354
- for (j = 0; j < bq->clause_cnt; j++) {
355
- q = bq->clauses[j]->query;
356
- hs_add(uniques, q);
396
+ else {
397
+ hs_add(uniques, q);
357
398
  }
358
- } else {
359
- hs_add(uniques, q);
360
- }
361
- } else {
362
- hs_add(uniques, q);
363
399
  }
364
- }
365
- if (uniques->size == 1) {
366
- ret_q = (Query *)uniques->elems[0];
367
- ref(ret_q);
368
- } else {
369
- ret_q = bq_create(true);
370
- for (i = 0; i < uniques->size; i++) {
371
- q = (Query *)uniques->elems[i];
372
- ref(q);
373
- bq_add_query(ret_q, q, BC_SHOULD);
400
+ if (uniques->size == 1) {
401
+ ret_q = (Query *)uniques->elems[0];
402
+ REF(ret_q);
403
+ }
404
+ else {
405
+ ret_q = bq_new(true);
406
+ for (i = 0; i < uniques->size; i++) {
407
+ q = (Query *)uniques->elems[i];
408
+ bq_add_query(ret_q, q, BC_SHOULD);
409
+ }
374
410
  }
375
- }
376
- hs_destroy(uniques);
411
+ hs_destroy(uniques);
377
412
 
378
- return ret_q;
413
+ return ret_q;
414
+ }
415
+
416
+ ulong q_hash(Query *self)
417
+ {
418
+ return (self->hash(self) << 5) | self->type;
419
+ }
420
+
421
+ int q_eq(Query *self, Query *o)
422
+ {
423
+ return (self == o)
424
+ || ((self->type == o->type)
425
+ && (self->boost == o->boost)
426
+ && self->eq(self, o));
427
+ }
428
+
429
+ static MatchVector *q_get_matchv_i(Query *self, MatchVector *mv, TermVector *tv)
430
+ {
431
+ /* by default we don't add any matches */
432
+ (void)self; (void)tv;
433
+ return mv;
434
+ }
435
+
436
+ Query *q_create(size_t size)
437
+ {
438
+ Query *self = (Query *)ecalloc(size);
439
+ #ifdef DEBUG
440
+ if (size < sizeof(Query)) {
441
+ RAISE(ERROR, "Size of a query <%d> should never be smaller than the "
442
+ "size of a Query struct <%d>", (int)size, (int)sizeof(Query));
443
+ }
444
+ #endif
445
+ self->boost = 1.0;
446
+ self->rewrite = &q_rewrite;
447
+ self->get_similarity = &q_get_similarity_i;
448
+ self->extract_terms = &q_extract_terms;
449
+ self->get_matchv_i = &q_get_matchv_i;
450
+ self->weight = NULL;
451
+ self->ref_cnt = 1;
452
+ return self;
379
453
  }
380
454
 
381
455
  /***************************************************************************
@@ -384,36 +458,154 @@ Query *q_combine(Query **queries, int q_cnt)
384
458
  *
385
459
  ***************************************************************************/
386
460
 
387
- void scorer_destroy_i(Scorer *self)
461
+ void scorer_destroy_i(Scorer *scorer)
388
462
  {
389
- free(self->data);
390
- free(self);
463
+ free(scorer);
391
464
  }
392
465
 
393
- Scorer *scorer_create(Similarity *similarity)
466
+ Scorer *scorer_create(size_t size, Similarity *similarity)
394
467
  {
395
- Scorer *self = ALLOC(Scorer);
396
- self->destroy = &scorer_destroy_i;
397
- self->data = NULL;
398
- self->similarity = similarity;
399
- return self;
468
+ Scorer *self = (Scorer *)ecalloc(size);
469
+ #ifdef DEBUG
470
+ if (size < sizeof(Scorer)) {
471
+ RAISE(ERROR, "size of scorer <%d> should be at least <%d>",
472
+ (int)size, (int)sizeof(Scorer));
473
+ }
474
+ #endif
475
+ self->destroy = &scorer_destroy_i;
476
+ self->similarity = similarity;
477
+ return self;
400
478
  }
401
479
 
402
480
  bool scorer_less_than(void *p1, void *p2)
403
481
  {
404
- Scorer *s1 = (Scorer *)p1;
405
- Scorer *s2 = (Scorer *)p2;
406
- return s1->score(s1) < s2->score(s2);
482
+ Scorer *s1 = (Scorer *)p1;
483
+ Scorer *s2 = (Scorer *)p2;
484
+ return s1->score(s1) < s2->score(s2);
407
485
  }
408
486
 
409
- bool scorer_doc_less_than(void *p1, void *p2)
487
+ bool scorer_doc_less_than(const Scorer *s1, const Scorer *s2)
410
488
  {
411
- return ((Scorer *)p1)->doc < ((Scorer *)p2)->doc;
489
+ return s1->doc < s2->doc;
412
490
  }
413
491
 
414
492
  int scorer_doc_cmp(const void *p1, const void *p2)
415
493
  {
416
- return (*(Scorer **)p1)->doc - (*(Scorer **)p2)->doc;
494
+ return (*(Scorer **)p1)->doc - (*(Scorer **)p2)->doc;
495
+ }
496
+
497
+ /***************************************************************************
498
+ *
499
+ * Highlighter
500
+ *
501
+ ***************************************************************************/
502
+
503
+ /* ** MatchRange ** */
504
+ static int match_range_cmp(const void *p1, const void *p2)
505
+ {
506
+ int diff = ((MatchRange *)p1)->start - ((MatchRange *)p2)->start;
507
+ if (diff != 0) {
508
+ return diff;
509
+ }
510
+ else {
511
+ return ((MatchRange *)p2)->end - ((MatchRange *)p1)->end;
512
+ }
513
+ }
514
+
515
+
516
+
517
+ /* ** MatchVector ** */
518
+ MatchVector *matchv_new()
519
+ {
520
+ MatchVector *matchv = ALLOC(MatchVector);
521
+
522
+ matchv->size = 0;
523
+ matchv->capa = MATCH_VECTOR_INIT_CAPA;
524
+ matchv->matches = ALLOC_N(MatchRange, MATCH_VECTOR_INIT_CAPA);
525
+
526
+ return matchv;
527
+ }
528
+
529
+ MatchVector *matchv_add(MatchVector *self, int start, int end)
530
+ {
531
+ if (self->size >= self->capa) {
532
+ self->capa <<= 1;
533
+ REALLOC_N(self->matches, MatchRange, self->capa);
534
+ }
535
+ self->matches[self->size].start = start;
536
+ self->matches[self->size].end = end;
537
+ self->matches[self->size++].score = 1.0;
538
+ return self;
539
+ }
540
+
541
+ MatchVector *matchv_sort(MatchVector *self)
542
+ {
543
+ qsort(self->matches, self->size, sizeof(MatchRange), &match_range_cmp);
544
+ return self;
545
+ }
546
+
547
+ MatchVector *matchv_compact(MatchVector *self)
548
+ {
549
+ int left, right;
550
+ matchv_sort(self);
551
+ for (right = left = 0; right < self->size; right++) {
552
+ /* Note the end + 1. This compacts a range 3:5 and 6:8 inleft 3:8 */
553
+ if (self->matches[right].start > self->matches[left].end + 1) {
554
+ left++;
555
+ self->matches[left].start = self->matches[right].start;
556
+ self->matches[left].end = self->matches[right].end;
557
+ self->matches[left].score = self->matches[right].score;
558
+ }
559
+ else if (self->matches[right].end > self->matches[left].end) {
560
+ self->matches[left].end = self->matches[right].end;
561
+ }
562
+ else {
563
+ self->matches[left].score += self->matches[right].score;
564
+ }
565
+ }
566
+ self->size = left + 1;
567
+ return self;
568
+ }
569
+
570
+ MatchVector *matchv_compact_with_breaks(MatchVector *self)
571
+ {
572
+ int left, right;
573
+ matchv_sort(self);
574
+ for (right = left = 0; right < self->size; right++) {
575
+ /* Note: no end + 1. Unlike above won't compact ranges 3:5 and 6:8 */
576
+ if (self->matches[right].start > self->matches[left].end) {
577
+ left++;
578
+ self->matches[left].start = self->matches[right].start;
579
+ self->matches[left].end = self->matches[right].end;
580
+ self->matches[left].score = self->matches[right].score;
581
+ }
582
+ else if (self->matches[right].end > self->matches[left].end) {
583
+ self->matches[left].end = self->matches[right].end;
584
+ self->matches[left].score += self->matches[right].score;
585
+ }
586
+ else if (right > left) {
587
+ self->matches[left].score += self->matches[right].score;
588
+ }
589
+ }
590
+ self->size = left + 1;
591
+ return self;
592
+ }
593
+
594
+
595
+ static MatchVector *matchv_set_offsets(MatchVector *mv, Offset *offsets)
596
+ {
597
+ int i;
598
+ for (i = 0; i < mv->size; i++) {
599
+ mv->matches[i].start_offset = offsets[mv->matches[i].start].start;
600
+ mv->matches[i].end_offset = offsets[mv->matches[i].end].end;
601
+ }
602
+ return mv;
603
+ }
604
+
605
+ void matchv_destroy(MatchVector *self)
606
+ {
607
+ free(self->matches);
608
+ free(self);
417
609
  }
418
610
 
419
611
  /***************************************************************************
@@ -422,211 +614,541 @@ int scorer_doc_cmp(const void *p1, const void *p2)
422
614
  *
423
615
  ***************************************************************************/
424
616
 
425
- static int s_doc_freq(Searcher *self, Term *term)
617
+ MatchVector *searcher_get_match_vector(Searcher *self,
618
+ Query *query,
619
+ const int doc_num,
620
+ const char *field)
621
+ {
622
+ MatchVector *mv = matchv_new();
623
+ Query *rewritten_query = self->rewrite(self, query);
624
+ TermVector *tv = self->get_term_vector(self, doc_num, field);
625
+ if (tv && tv->term_cnt > 0 && tv->terms[0].positions != NULL) {
626
+ mv = rewritten_query->get_matchv_i(rewritten_query, mv, tv);
627
+ tv_destroy(tv);
628
+ }
629
+ q_deref(rewritten_query);
630
+ return mv;
631
+ }
632
+
633
+ typedef struct Excerpt
426
634
  {
427
- return self->ir->doc_freq(self->ir, term);
635
+ int start;
636
+ int end;
637
+ int start_pos;
638
+ int end_pos;
639
+ int start_offset;
640
+ int end_offset;
641
+ double score;
642
+ } Excerpt;
643
+
644
+ /*
645
+ static int excerpt_cmp(const void *p1, const void *p2)
646
+ {
647
+ double score1 = (*((Excerpt **)p1))->score;
648
+ double score2 = (*((Excerpt **)p2))->score;
649
+ if (score1 > score2) return 1;
650
+ if (score1 < score2) return -1;
651
+ return 0;
428
652
  }
653
+ */
429
654
 
430
- static int *s_doc_freqs(Searcher *self, Term **terms, int tcnt)
655
+ static int excerpt_start_cmp(const void *p1, const void *p2)
431
656
  {
432
- int i;
433
- int *freqs = ALLOC_N(int, tcnt);
657
+ return (*((Excerpt **)p1))->start - (*((Excerpt **)p2))->start;
658
+ }
434
659
 
435
- for (i = 0; i < tcnt; i++) {
436
- freqs[i] = self->ir->doc_freq(self->ir, terms[i]);
437
- }
438
- return freqs;
660
+ static int excerpt_lt(Excerpt *e1, Excerpt *e2)
661
+ {
662
+ return e1->score > e2->score; /* want the highest score at top */
439
663
  }
440
664
 
441
- static int *ss_doc_freqs(Searcher *self, Term **terms, int tcnt)
665
+ static Excerpt *excerpt_new(int start, int end, double score)
442
666
  {
443
- int i;
444
- int *freqs = ALLOC_N(int, tcnt);
667
+ Excerpt *excerpt = ALLOC_AND_ZERO(Excerpt);
668
+ excerpt->start = start;
669
+ excerpt->end = end;
670
+ excerpt->score = score;
671
+ return excerpt;
672
+ }
445
673
 
446
- for (i = 0; i < tcnt; i++) {
447
- freqs[i] = self->doc_freq(self, terms[i]);
448
- }
674
+ static Excerpt *excerpt_recalc_score(Excerpt *e, MatchVector *mv)
675
+ {
676
+ int i;
677
+ double score = 0.0;
678
+ for (i = e->start; i <= e->end; i++) {
679
+ score += mv->matches[i].score;
680
+ }
681
+ e->score = score;
682
+ return e;
683
+ }
449
684
 
450
- return freqs;
685
+ /* expand an excerpt to it's largest possible size */
686
+ static Excerpt *excerpt_expand(Excerpt *e, const int len, TermVector *tv)
687
+ {
688
+ Offset *offsets = tv->offsets;
689
+ int offset_cnt = tv->offset_cnt;
690
+ bool did_expansion = true;
691
+ int i;
692
+ /* fill in skipped offsets */
693
+ for (i = 1; i < offset_cnt; i++) {
694
+ if (offsets[i].start == 0) {
695
+ offsets[i].start = offsets[i-1].start;
696
+ }
697
+ if (offsets[i].end == 0) {
698
+ offsets[i].end = offsets[i-1].end;
699
+ }
700
+ }
701
+
702
+ while (did_expansion) {
703
+ did_expansion = false;
704
+ if (e->start_pos > 0
705
+ && (e->end_offset - offsets[e->start_pos - 1].start) < len) {
706
+ e->start_pos--;
707
+ e->start_offset = offsets[e->start_pos].start;
708
+ did_expansion = true;
709
+ }
710
+ if (e->end_pos < (offset_cnt - 1)
711
+ && (offsets[e->end_pos + 1].end - e->start_offset) < len) {
712
+ e->end_pos++;
713
+ e->end_offset = offsets[e->end_pos].end;
714
+ did_expansion = true;
715
+ }
716
+ }
717
+ return e;
718
+ }
719
+
720
+ static char *excerpt_get_str(Excerpt *e, MatchVector *mv,
721
+ LazyDocField *lazy_df,
722
+ const char *pre_tag,
723
+ const char *post_tag,
724
+ const char *ellipsis)
725
+ {
726
+ int i, len;
727
+ int last_offset = e->start_offset;
728
+ const int num_matches = e->end - e->start + 1;
729
+ const int pre_tag_len = (int)strlen(pre_tag);
730
+ const int post_tag_len = (int)strlen(post_tag);
731
+ const int ellipsis_len = (int)strlen(ellipsis);
732
+ char *excerpt_str = ALLOC_N(char,
733
+ 10 + e->end_offset - e->start_offset
734
+ + (num_matches * (pre_tag_len + post_tag_len))
735
+ + (2 * ellipsis_len));
736
+ char *e_ptr = excerpt_str;
737
+ if (e->start_offset > 0) {
738
+ memcpy(e_ptr, ellipsis, ellipsis_len);
739
+ e_ptr += ellipsis_len;
740
+ }
741
+ for (i = e->start; i <= e->end; i++) {
742
+ MatchRange *mr = mv->matches + i;
743
+ len = mr->start_offset - last_offset;
744
+ lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
745
+ e_ptr += len;
746
+ memcpy(e_ptr, pre_tag, pre_tag_len);
747
+ e_ptr += pre_tag_len;
748
+ len = mr->end_offset - mr->start_offset;
749
+ lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
750
+ e_ptr += len;
751
+ memcpy(e_ptr, post_tag, post_tag_len);
752
+ e_ptr += post_tag_len;
753
+ last_offset = mr->end_offset;
754
+ }
755
+ len = e->end_offset - last_offset;
756
+ lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
757
+ e_ptr += len;
758
+ if (e->end_offset < lazy_df->len) {
759
+ memcpy(e_ptr, ellipsis, ellipsis_len);
760
+ e_ptr += ellipsis_len;
761
+ }
762
+ *e_ptr = '\0';
763
+ return excerpt_str;
764
+ }
765
+
766
+ char **searcher_highlight(Searcher *self,
767
+ Query *query,
768
+ const int doc_num,
769
+ const char *field,
770
+ const int excerpt_len,
771
+ const int num_excerpts,
772
+ const char *pre_tag,
773
+ const char *post_tag,
774
+ const char *ellipsis)
775
+ {
776
+ char **excerpt_strs = NULL;
777
+ TermVector *tv = self->get_term_vector(self, doc_num, field);
778
+ LazyDoc *lazy_doc = self->get_lazy_doc(self, doc_num);
779
+ LazyDocField *lazy_df = NULL;
780
+ if (lazy_doc) {
781
+ lazy_df = h_get(lazy_doc->field_dict, field);
782
+ }
783
+ if (tv && lazy_df && tv->term_cnt > 0 && tv->terms[0].positions != NULL
784
+ && tv->offsets != NULL) {
785
+ MatchVector *mv;
786
+ query = self->rewrite(self, query);
787
+ mv = query->get_matchv_i(query, matchv_new(), tv);
788
+ if (mv->size > 0) {
789
+ Excerpt **excerpts = ALLOC_AND_ZERO_N(Excerpt *, num_excerpts);
790
+ int e_start, e_end, i, j;
791
+ MatchRange *matches = mv->matches;
792
+ double running_score = 0.0;
793
+ Offset *offsets = tv->offsets;
794
+ PriorityQueue *excerpt_pq;
795
+
796
+ matchv_compact_with_breaks(mv);
797
+ matchv_set_offsets(mv, offsets);
798
+ excerpt_pq = pq_new(mv->size, (lt_ft)&excerpt_lt, &free);
799
+ /* add all possible excerpts to the priority queue */
800
+
801
+ for (e_start = 0, e_end = 1; e_start < mv->size; e_start++) {
802
+ const int start_offset = matches[e_start].start_offset;
803
+ if (e_start >= e_end) {
804
+ e_end = e_start + 1;
805
+ }
806
+ running_score += matches[e_start].score;
807
+ while (e_end < mv->size && (matches[e_end].end_offset
808
+ <= start_offset + excerpt_len)) {
809
+ running_score += matches[e_end].score;
810
+ e_end++;
811
+ }
812
+ pq_push(excerpt_pq,
813
+ excerpt_new(e_start, e_end - 1, running_score));
814
+ /* - 0.1 so that earlier matches take priority */
815
+ running_score -= matches[e_start].score;
816
+ }
817
+
818
+ for (i = 0; i < num_excerpts && excerpt_pq->size > 0; i++) {
819
+ excerpts[i] = pq_pop(excerpt_pq);
820
+ if (i < num_excerpts - 1) {
821
+ /* set match ranges alread included to 0 */
822
+ Excerpt *e = excerpts[i];
823
+ for (j = e->start; j <= e->end; j++) {
824
+ matches[j].score = 0.0;
825
+ }
826
+ e = NULL;
827
+ while (e != (Excerpt *)pq_top(excerpt_pq)) {
828
+ e = pq_top(excerpt_pq);
829
+ excerpt_recalc_score(e, mv);
830
+ pq_down(excerpt_pq);
831
+ }
832
+ }
833
+ }
834
+
835
+ qsort(excerpts, i, sizeof(Excerpt *), &excerpt_start_cmp);
836
+ for (j = 0; j < i; j++) {
837
+ Excerpt *e = excerpts[j];
838
+ e->start_pos = matches[e->start].start;
839
+ e->end_pos = matches[e->end].end;
840
+ e->start_offset = offsets[e->start_pos].start;
841
+ e->end_offset = offsets[e->end_pos].end;
842
+ }
843
+
844
+ if (i < num_excerpts) {
845
+ const int diff = num_excerpts - i;
846
+ memmove(excerpts + (diff), excerpts,
847
+ i * sizeof(Excerpt *));
848
+ for (j = 0; j < diff; j++) {
849
+ /* these new excerpts will grow into one long excerpt at
850
+ * the start */
851
+ excerpts[j] = ALLOC_AND_ZERO(Excerpt);
852
+ excerpts[j]->end = -1;
853
+ }
854
+ }
855
+
856
+ excerpt_strs = ary_new_type_capa(char *, num_excerpts);
857
+ /* merge excerpts where possible */
858
+ for (i = 0; i < num_excerpts;) {
859
+ Excerpt *ei = excerpts[i];
860
+ int merged = 1; /* 1 means a single excerpt, ie no merges */
861
+ for (j = i + 1; j < num_excerpts; j++) {
862
+ Excerpt *ej = excerpts[j];
863
+ if ((ej->end_offset - ei->start_offset)
864
+ < (j - i + 1) * excerpt_len) {
865
+ ei->end = ej->end;
866
+ ei->end_pos = ej->end_pos;
867
+ ei->end_offset = ej->end_offset;
868
+ merged = j - i + 1;
869
+ }
870
+ }
871
+ excerpt_expand(ei, merged * excerpt_len, tv);
872
+ ary_push(excerpt_strs,
873
+ excerpt_get_str(ei, mv, lazy_df,
874
+ pre_tag, post_tag, ellipsis));
875
+ i += merged;
876
+ }
877
+ for (i = 0; i < num_excerpts; i++) {
878
+ free(excerpts[i]);
879
+ }
880
+ free(excerpts);
881
+ pq_destroy(excerpt_pq);
882
+ matchv_destroy(mv);
883
+ }
884
+ q_deref(query);
885
+ }
886
+ if (tv) tv_destroy(tv);
887
+ if (lazy_doc) lazy_doc_close(lazy_doc);
888
+ return excerpt_strs;
451
889
  }
452
890
 
891
+ static Weight *sea_create_weight(Searcher *self, Query *query)
892
+ {
893
+ return q_weight(query, self);
894
+ }
453
895
 
454
- static Document *s_get_doc(Searcher *self, int doc_num)
896
+ static void sea_check_args(int num_docs, int first_doc)
455
897
  {
456
- return self->ir->get_doc(self->ir, doc_num);
898
+ if (num_docs <= 0) {
899
+ RAISE(ARG_ERROR, ":num_docs was set to %d but should be greater "
900
+ "than 0 : %d <= 0", num_docs, num_docs);
901
+ }
902
+
903
+ if (first_doc < 0) {
904
+ RAISE(ARG_ERROR, ":first_doc was set to %d but should be greater "
905
+ "than or equal to 0 : %d < 0", first_doc, first_doc);
906
+ }
457
907
  }
458
908
 
459
- static int s_max_doc(Searcher *self)
909
+ static Similarity *sea_get_similarity(Searcher *self)
460
910
  {
461
- return self->ir->max_doc(self->ir);
911
+ return self->similarity;
462
912
  }
463
913
 
464
- static Weight *s_create_weight(Searcher *self, Query *query)
914
+ /***************************************************************************
915
+ *
916
+ * IndexSearcher
917
+ *
918
+ ***************************************************************************/
919
+
920
+ #define ISEA(searcher) ((IndexSearcher *)(searcher))
921
+
922
+ int isea_doc_freq(Searcher *self, const char *field, const char *term)
465
923
  {
466
- return q_weight(query, self);
924
+ return ir_doc_freq(ISEA(self)->ir, field, term);
467
925
  }
468
926
 
469
- static TopDocs *s_search(Searcher *self, Query *query, int first_doc,
470
- int num_docs, Filter *filter, Sort *sort)
927
+ static Document *isea_get_doc(Searcher *self, int doc_num)
471
928
  {
472
- int max_size = first_doc + num_docs;
473
- int i;
474
- Weight *weight;
475
- Scorer *scorer;
476
- Hit **score_docs = NULL;
477
- Hit hit;
478
- int total_hits = 0;
479
- float score;
480
- BitVector *bits = (filter ? filter->get_bv(filter, self->ir) : NULL);
481
- Hit *(*hq_pop)(PriorityQueue *pq);
482
- void (*hq_insert)(PriorityQueue *pq, Hit *hit);
483
- void (*hq_destroy)(PriorityQueue *self);
484
- PriorityQueue *hq;
929
+ IndexReader *ir = ISEA(self)->ir;
930
+ return ir->get_doc(ir, doc_num);
931
+ }
485
932
 
933
+ static LazyDoc *isea_get_lazy_doc(Searcher *self, int doc_num)
934
+ {
935
+ IndexReader *ir = ISEA(self)->ir;
936
+ return ir->get_lazy_doc(ir, doc_num);
937
+ }
486
938
 
487
- if (num_docs <= 0)
488
- RAISE(ARG_ERROR, NUM_DOCS_ARG_ERROR_MSG);
939
+ static int isea_max_doc(Searcher *self)
940
+ {
941
+ IndexReader *ir = ISEA(self)->ir;
942
+ return ir->max_doc(ir);
943
+ }
489
944
 
490
- if (first_doc < 0)
491
- RAISE(ARG_ERROR, FIRST_DOC_ARG_ERROR_MSG);
945
+ #define IS_FILTERED(bits, filter_func, scorer, searcher) \
946
+ ((bits && !bv_get(bits, scorer->doc))\
947
+ || (filter_func \
948
+ && !filter_func(scorer->doc, scorer->score(scorer), searcher)))
492
949
 
493
- weight = q_weight(query, self);
494
- scorer = weight->scorer(weight, self->ir);
495
- if (!scorer) {
496
- if (bits) bv_destroy(bits);
497
- weight->destroy(weight);
498
- return td_create(0, 0, NULL);
499
- }
950
+ static TopDocs *isea_search_w(Searcher *self,
951
+ Weight *weight,
952
+ int first_doc,
953
+ int num_docs,
954
+ Filter *filter,
955
+ Sort *sort,
956
+ filter_ft filter_func,
957
+ bool load_fields)
958
+ {
959
+ int max_size = first_doc + num_docs;
960
+ int i;
961
+ Scorer *scorer;
962
+ Hit **score_docs = NULL;
963
+ Hit hit;
964
+ int total_hits = 0;
965
+ float score, max_score = 0.0;
966
+ BitVector *bits = (filter
967
+ ? filt_get_bv(filter, ISEA(self)->ir)
968
+ : NULL);
969
+ Hit *(*hq_pop)(PriorityQueue *pq);
970
+ void (*hq_insert)(PriorityQueue *pq, Hit *hit);
971
+ void (*hq_destroy)(PriorityQueue *self);
972
+ PriorityQueue *hq;
500
973
 
501
- if (sort) {
502
- hq = fshq_pq_create(max_size, sort, self->ir);
503
- hq_pop = &fshq_pq_pop;
504
- hq_insert = &fshq_pq_insert;
505
- hq_destroy = &fshq_pq_destroy;
506
- } else {
507
- hq = pq_create(max_size, &hit_less_than);
508
- hq_pop = &hit_pq_pop;
509
- hq_insert = &hit_pq_insert;
510
- hq_destroy = &pq_destroy;
511
- }
974
+ sea_check_args(num_docs, first_doc);
512
975
 
513
- while (scorer->next(scorer)) {
514
- if (bits && !bv_get(bits, scorer->doc)) continue;
515
- total_hits++;
516
- score = scorer->score(scorer);
517
- hit.doc = scorer->doc; hit.score = score;
518
- hq_insert(hq, &hit);
519
- }
520
- scorer->destroy(scorer);
521
- weight->destroy(weight);
976
+ scorer = weight->scorer(weight, ISEA(self)->ir);
977
+ if (!scorer) {
978
+ return td_new(0, 0, NULL, 0.0);
979
+ }
522
980
 
523
- if (hq->count > first_doc) {
524
- if ((hq->count - first_doc) < num_docs) {
525
- num_docs = hq->count - first_doc;
981
+ if (sort) {
982
+ hq = fshq_pq_new(max_size, sort, ISEA(self)->ir);
983
+ hq_insert = &fshq_pq_insert;
984
+ hq_destroy = &fshq_pq_destroy;
985
+ if (load_fields) {
986
+ hq_pop = &fshq_pq_pop_fd;
987
+ }
988
+ else {
989
+ hq_pop = &fshq_pq_pop;
990
+ }
991
+ }
992
+ else {
993
+ hq = pq_new(max_size, (lt_ft)&hit_less_than, &free);
994
+ hq_pop = &hit_pq_pop;
995
+ hq_insert = &hit_pq_insert;
996
+ hq_destroy = &pq_destroy;
526
997
  }
527
- score_docs = ALLOC_N(Hit *, num_docs);
528
- for (i = num_docs - 1; i >= 0; i--) {
529
- score_docs[i] = hq_pop(hq);
530
- //hit = score_docs[i] = pq_pop(hq);
531
- //printf("hit = %d-->%f\n", hit->doc, hit->score);
998
+
999
+ while (scorer->next(scorer)) {
1000
+ if (IS_FILTERED(bits, filter_func, scorer, self)) {
1001
+ continue;
1002
+ }
1003
+ total_hits++;
1004
+ score = scorer->score(scorer);
1005
+ if (score > max_score) max_score = score;
1006
+ hit.doc = scorer->doc; hit.score = score;
1007
+ hq_insert(hq, &hit);
532
1008
  }
533
- } else {
534
- num_docs = 0;
535
- }
536
- pq_clear(hq);
537
- hq_destroy(hq);
1009
+ scorer->destroy(scorer);
538
1010
 
539
- if (bits) bv_destroy(bits);
540
- return td_create(total_hits, num_docs, score_docs);
1011
+ if (hq->size > first_doc) {
1012
+ if ((hq->size - first_doc) < num_docs) {
1013
+ num_docs = hq->size - first_doc;
1014
+ }
1015
+ score_docs = ALLOC_N(Hit *, num_docs);
1016
+ for (i = num_docs - 1; i >= 0; i--) {
1017
+ score_docs[i] = hq_pop(hq);
1018
+ /*
1019
+ hit = score_docs[i] = pq_pop(hq);
1020
+ printf("hit = %d-->%f\n", hit->doc, hit->score);
1021
+ */
1022
+ }
1023
+ }
1024
+ else {
1025
+ num_docs = 0;
1026
+ }
1027
+ pq_clear(hq);
1028
+ hq_destroy(hq);
1029
+
1030
+ return td_new(total_hits, num_docs, score_docs, max_score);
1031
+ }
1032
+
1033
+ static TopDocs *isea_search(Searcher *self,
1034
+ Query *query,
1035
+ int first_doc,
1036
+ int num_docs,
1037
+ Filter *filter,
1038
+ Sort *sort,
1039
+ filter_ft filter_func,
1040
+ bool load_fields)
1041
+ {
1042
+ TopDocs *td;
1043
+ Weight *weight = q_weight(query, self);
1044
+ td = isea_search_w(self, weight, first_doc, num_docs, filter,
1045
+ sort, filter_func, load_fields);
1046
+ weight->destroy(weight);
1047
+ return td;
541
1048
  }
542
1049
 
543
- static void s_search_each_w(Searcher *self, Weight *weight, Filter *filter,
544
- void (*fn)(Searcher *, int, float, void *), void *arg)
1050
+ static void isea_search_each_w(Searcher *self, Weight *weight, Filter *filter,
1051
+ filter_ft filter_func,
1052
+ void (*fn)(Searcher *, int, float, void *),
1053
+ void *arg)
545
1054
  {
546
- Scorer *scorer;
547
- BitVector *bits = (filter ? filter->get_bv(filter, self->ir) : NULL);
1055
+ Scorer *scorer;
1056
+ BitVector *bits = (filter
1057
+ ? filt_get_bv(filter, ISEA(self)->ir)
1058
+ : NULL);
548
1059
 
549
- scorer = weight->scorer(weight, self->ir);
550
- if (!scorer) {
551
- if (bits) bv_destroy(bits);
552
- return;
553
- }
1060
+ scorer = weight->scorer(weight, ISEA(self)->ir);
1061
+ if (!scorer) {
1062
+ return;
1063
+ }
554
1064
 
555
- while (scorer->next(scorer)) {
556
- if (bits && !bv_get(bits, scorer->doc)) continue;
557
- fn(self, scorer->doc, scorer->score(scorer), arg);
558
- }
559
- scorer->destroy(scorer);
1065
+ while (scorer->next(scorer)) {
1066
+ if (IS_FILTERED(bits, filter_func, scorer, self)) {
1067
+ continue;
1068
+ }
1069
+ fn(self, scorer->doc, scorer->score(scorer), arg);
1070
+ }
1071
+ scorer->destroy(scorer);
560
1072
  }
561
1073
 
562
- static void s_search_each(Searcher *self, Query *query, Filter *filter,
563
- void (*fn)(Searcher *, int, float, void *), void *arg)
1074
+ static void isea_search_each(Searcher *self, Query *query, Filter *filter,
1075
+ filter_ft filter_func,
1076
+ void (*fn)(Searcher *, int, float, void *),
1077
+ void *arg)
564
1078
  {
565
- Weight *weight = q_weight(query, self);
566
- s_search_each_w(self, weight, filter, fn, arg);
567
- weight->destroy(weight);
1079
+ Weight *weight = q_weight(query, self);
1080
+ isea_search_each_w(self, weight, filter, filter_func, fn, arg);
1081
+ weight->destroy(weight);
568
1082
  }
569
1083
 
570
- static Query *s_rewrite(Searcher *self, Query *original)
1084
+ static Query *isea_rewrite(Searcher *self, Query *original)
571
1085
  {
572
- int q_is_destroyed = false;
573
- Query *query = original;
574
- Query *rewritten_query = query->rewrite(query, self->ir);
575
- while (q_is_destroyed || (query != rewritten_query)) {
576
- query = rewritten_query;
577
- rewritten_query = query->rewrite(query, self->ir);
578
- q_is_destroyed = (query->ref_cnt <= 1);
579
- q_deref(query); /* destroy intermediate queries */
580
- }
581
- return query;
1086
+ int q_is_destroyed = false;
1087
+ Query *query = original;
1088
+ Query *rewritten_query = query->rewrite(query, ISEA(self)->ir);
1089
+ while (q_is_destroyed || (query != rewritten_query)) {
1090
+ query = rewritten_query;
1091
+ rewritten_query = query->rewrite(query, ISEA(self)->ir);
1092
+ q_is_destroyed = (query->ref_cnt <= 1);
1093
+ q_deref(query); /* destroy intermediate queries */
1094
+ }
1095
+ return query;
582
1096
  }
583
1097
 
584
- static Explanation *s_explain(Searcher *self, Query *query, int doc_num)
1098
+ static Explanation *isea_explain(Searcher *self, Query *query, int doc_num)
585
1099
  {
586
- Weight *weight = q_weight(query, self);
587
- Explanation *e = weight->explain(weight, self->ir, doc_num);
588
- weight->destroy(weight);
589
- return e;
1100
+ Weight *weight = q_weight(query, self);
1101
+ Explanation *e = weight->explain(weight, ISEA(self)->ir, doc_num);
1102
+ weight->destroy(weight);
1103
+ return e;
590
1104
  }
591
1105
 
592
- static Explanation *s_explain_w(Searcher *self, Weight *w, int doc_num)
1106
+ static Explanation *isea_explain_w(Searcher *self, Weight *w, int doc_num)
593
1107
  {
594
- return w->explain(w, self->ir, doc_num);
1108
+ return w->explain(w, ISEA(self)->ir, doc_num);
595
1109
  }
596
1110
 
597
- static Similarity *s_get_similarity(Searcher *self)
1111
+ static TermVector *isea_get_term_vector(Searcher *self,
1112
+ const int doc_num,
1113
+ const char *field)
598
1114
  {
599
- return self->similarity;
1115
+ IndexReader *ir = ISEA(self)->ir;
1116
+ return ir->term_vector(ir, doc_num, field);
600
1117
  }
601
1118
 
602
- static void s_close(Searcher *self)
1119
+ static void isea_close(Searcher *self)
603
1120
  {
604
- if (self->ir && self->close_ir) {
605
- ir_close(self->ir);
606
- }
607
- free(self);
1121
+ if (ISEA(self)->ir && ISEA(self)->close_ir) {
1122
+ ir_close(ISEA(self)->ir);
1123
+ }
1124
+ free(self);
608
1125
  }
609
1126
 
610
- Searcher *sea_create(IndexReader *ir)
1127
+ Searcher *isea_new(IndexReader *ir)
611
1128
  {
612
- Searcher *self = ALLOC(Searcher);
613
- self->ir = ir;
614
- self->close_ir = true;
615
- self->similarity = sim_create_default();
616
- self->doc_freq = &s_doc_freq;
617
- self->doc_freqs = &s_doc_freqs;
618
- self->get_doc = &s_get_doc;
619
- self->max_doc = &s_max_doc;
620
- self->create_weight = &s_create_weight;
621
- self->search = &s_search;
622
- self->search_each = &s_search_each;
623
- self->search_each_w = &s_search_each_w;
624
- self->rewrite = &s_rewrite;
625
- self->explain = &s_explain;
626
- self->explain_w = &s_explain_w;
627
- self->get_similarity = &s_get_similarity;
628
- self->close = &s_close;
629
- return self;
1129
+ Searcher *self = (Searcher *)ecalloc(sizeof(IndexSearcher));
1130
+
1131
+ ISEA(self)->ir = ir;
1132
+ ISEA(self)->close_ir = true;
1133
+
1134
+ self->similarity = sim_create_default();
1135
+ self->doc_freq = &isea_doc_freq;
1136
+ self->get_doc = &isea_get_doc;
1137
+ self->get_lazy_doc = &isea_get_lazy_doc;
1138
+ self->max_doc = &isea_max_doc;
1139
+ self->create_weight = &sea_create_weight;
1140
+ self->search = &isea_search;
1141
+ self->search_w = &isea_search_w;
1142
+ self->search_each = &isea_search_each;
1143
+ self->search_each_w = &isea_search_each_w;
1144
+ self->rewrite = &isea_rewrite;
1145
+ self->explain = &isea_explain;
1146
+ self->explain_w = &isea_explain_w;
1147
+ self->get_term_vector = &isea_get_term_vector;
1148
+ self->get_similarity = &sea_get_similarity;
1149
+ self->close = &isea_close;
1150
+
1151
+ return self;
630
1152
  }
631
1153
 
632
1154
  /***************************************************************************
@@ -635,109 +1157,144 @@ Searcher *sea_create(IndexReader *ir)
635
1157
  *
636
1158
  ***************************************************************************/
637
1159
 
638
- typedef struct CachedDFSearcher {
639
- HshTable *df_map;
640
- int max_doc;
1160
+ #define CDFSEA(searcher) ((CachedDFSearcher *)(searcher))
1161
+ typedef struct CachedDFSearcher
1162
+ {
1163
+ Searcher super;
1164
+ HashTable *df_map;
1165
+ int max_doc;
641
1166
  } CachedDFSearcher;
642
1167
 
643
- static int cdfsea_doc_freq(Searcher *self, Term *term)
1168
+ static int cdfsea_doc_freq(Searcher *self, const char *field, const char *text)
644
1169
  {
645
- CachedDFSearcher *cdfsea = (CachedDFSearcher *)self->data;
646
- return (int)h_get(cdfsea->df_map, term);
1170
+ Term term;
1171
+ int *df;
1172
+ term.field = (char *)field;
1173
+ term.text = (char *)text;
1174
+ df = (int *)h_get(CDFSEA(self)->df_map, &term);
1175
+ return df ? *df : 0;
647
1176
  }
648
1177
 
649
1178
  static Document *cdfsea_get_doc(Searcher *self, int doc_num)
650
1179
  {
651
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
652
- return NULL;
1180
+ (void)self; (void)doc_num;
1181
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1182
+ return NULL;
653
1183
  }
654
1184
 
655
1185
  static int cdfsea_max_doc(Searcher *self)
656
1186
  {
657
- return ((CachedDFSearcher *)self->data)->max_doc;
1187
+ (void)self;
1188
+ return CDFSEA(self)->max_doc;
658
1189
  }
659
1190
 
660
1191
  static Weight *cdfsea_create_weight(Searcher *self, Query *query)
661
1192
  {
662
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
663
- return NULL;
1193
+ (void)self; (void)query;
1194
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1195
+ return NULL;
1196
+ }
1197
+
1198
+ static TopDocs *cdfsea_search_w(Searcher *self, Weight *w, int fd, int nd,
1199
+ Filter *f, Sort *s, filter_ft ff, bool load)
1200
+ {
1201
+ (void)self; (void)w; (void)fd; (void)nd;
1202
+ (void)f; (void)s; (void)ff, (void)load;
1203
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1204
+ return NULL;
664
1205
  }
665
1206
 
666
- static TopDocs *cdfsea_search(Searcher *self, Query *query, int first_doc,
667
- int num_docs, Filter *filter, Sort *sort)
1207
+ static TopDocs *cdfsea_search(Searcher *self, Query *q, int fd, int nd,
1208
+ Filter *f, Sort *s, filter_ft ff, bool load)
668
1209
  {
669
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
670
- return NULL;
1210
+ (void)self; (void)q; (void)fd; (void)nd;
1211
+ (void)f; (void)s; (void)ff, (void)load;
1212
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1213
+ return NULL;
671
1214
  }
672
1215
 
673
1216
  static void cdfsea_search_each(Searcher *self, Query *query, Filter *filter,
674
- void (*fn)(Searcher *, int, float, void *), void *arg)
1217
+ filter_ft ff,
1218
+ void (*fn)(Searcher *, int, float, void *),
1219
+ void *arg)
675
1220
  {
676
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1221
+ (void)self; (void)query; (void)filter; (void)ff; (void)fn; (void)arg;
1222
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
677
1223
  }
678
1224
 
679
1225
  static void cdfsea_search_each_w(Searcher *self, Weight *w, Filter *filter,
680
- void (*fn)(Searcher *, int, float, void *), void *arg)
1226
+ filter_ft ff,
1227
+ void (*fn)(Searcher *, int, float, void *),
1228
+ void *arg)
681
1229
  {
682
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1230
+ (void)self; (void)w; (void)filter; (void)ff; (void)fn; (void)arg;
1231
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
683
1232
  }
684
1233
 
685
1234
  static Query *cdfsea_rewrite(Searcher *self, Query *original)
686
1235
  {
687
- original->ref_cnt++;
688
- return original;
1236
+ (void)self;
1237
+ original->ref_cnt++;
1238
+ return original;
689
1239
  }
690
1240
 
691
1241
  static Explanation *cdfsea_explain(Searcher *self, Query *query, int doc_num)
692
1242
  {
693
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
694
- return NULL;
1243
+ (void)self; (void)query; (void)doc_num;
1244
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1245
+ return NULL;
695
1246
  }
696
1247
 
697
1248
  static Explanation *cdfsea_explain_w(Searcher *self, Weight *w, int doc_num)
698
1249
  {
699
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
700
- return NULL;
1250
+ (void)self; (void)w; (void)doc_num;
1251
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1252
+ return NULL;
1253
+ }
1254
+
1255
+ static TermVector *cdfsea_get_term_vector(Searcher *self, const int doc_num,
1256
+ const char *field)
1257
+ {
1258
+ (void)self; (void)doc_num; (void)field;
1259
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1260
+ return NULL;
701
1261
  }
702
1262
 
703
1263
  static Similarity *cdfsea_get_similarity(Searcher *self)
704
1264
  {
705
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
706
- return NULL;
1265
+ (void)self;
1266
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1267
+ return NULL;
707
1268
  }
708
1269
 
709
1270
  static void cdfsea_close(Searcher *self)
710
1271
  {
711
- CachedDFSearcher *cdfsea = (CachedDFSearcher *)self->data;
712
- h_destroy(cdfsea->df_map);
713
- free(cdfsea);
714
- free(self);
1272
+ h_destroy(CDFSEA(self)->df_map);
1273
+ free(self);
715
1274
  }
716
1275
 
717
- Searcher *cdfsea_create(HshTable *df_map, int max_doc)
1276
+ static Searcher *cdfsea_new(HashTable *df_map, int max_doc)
718
1277
  {
719
- Searcher *self = ALLOC(Searcher);
1278
+ Searcher *self = (Searcher *)ecalloc(sizeof(CachedDFSearcher));
720
1279
 
721
- CachedDFSearcher *cdfsea = ALLOC(CachedDFSearcher);
1280
+ CDFSEA(self)->df_map = df_map;
1281
+ CDFSEA(self)->max_doc = max_doc;
722
1282
 
723
- cdfsea->df_map = df_map;
724
- cdfsea->max_doc = max_doc;
725
- self->data = cdfsea;
726
-
727
- self->doc_freq = &cdfsea_doc_freq;
728
- self->doc_freqs = &ss_doc_freqs;
729
- self->get_doc = &cdfsea_get_doc;
730
- self->max_doc = &cdfsea_max_doc;
731
- self->create_weight = &cdfsea_create_weight;
732
- self->search = &cdfsea_search;
733
- self->search_each = &cdfsea_search_each;
734
- self->search_each_w = &cdfsea_search_each_w;
735
- self->rewrite = &cdfsea_rewrite;
736
- self->explain = &cdfsea_explain;
737
- self->explain_w = &cdfsea_explain_w;
738
- self->get_similarity = &cdfsea_get_similarity;
739
- self->close = &cdfsea_close;
740
- return self;
1283
+ self->doc_freq = &cdfsea_doc_freq;
1284
+ self->get_doc = &cdfsea_get_doc;
1285
+ self->max_doc = &cdfsea_max_doc;
1286
+ self->create_weight = &cdfsea_create_weight;
1287
+ self->search = &cdfsea_search;
1288
+ self->search_w = &cdfsea_search_w;
1289
+ self->search_each = &cdfsea_search_each;
1290
+ self->search_each_w = &cdfsea_search_each_w;
1291
+ self->rewrite = &cdfsea_rewrite;
1292
+ self->explain = &cdfsea_explain;
1293
+ self->explain_w = &cdfsea_explain_w;
1294
+ self->get_term_vector = &cdfsea_get_term_vector;
1295
+ self->get_similarity = &cdfsea_get_similarity;
1296
+ self->close = &cdfsea_close;
1297
+ return self;
741
1298
  }
742
1299
 
743
1300
  /***************************************************************************
@@ -746,301 +1303,367 @@ Searcher *cdfsea_create(HshTable *df_map, int max_doc)
746
1303
  *
747
1304
  ***************************************************************************/
748
1305
 
1306
+ #define MSEA(searcher) ((MultiSearcher *)(searcher))
749
1307
  static inline int msea_get_searcher_index(Searcher *self, int n)
750
1308
  {
751
- MultiSearcher *msea = (MultiSearcher *)self->data;
752
- int lo = 0; /* search starts array */
753
- int hi = msea->s_cnt - 1; /* for 1st element < n, return its index */
754
- int mid, mid_val;
1309
+ MultiSearcher *msea = MSEA(self);
1310
+ int lo = 0; /* search starts array */
1311
+ int hi = msea->s_cnt - 1; /* for 1st element < n, return its index */
1312
+ int mid, mid_val;
755
1313
 
756
- while (hi >= lo) {
757
- mid = (lo + hi) >> 1;
758
- mid_val = msea->starts[mid];
759
- if (n < mid_val) {
760
- hi = mid - 1;
761
- } else if (n > mid_val) {
762
- lo = mid + 1;
763
- } else { /* found a match */
764
- while (((mid+1) < msea->s_cnt) && (msea->starts[mid+1] == mid_val)) {
765
- mid++; /* scan to last match */
766
- }
767
- return mid;
1314
+ while (hi >= lo) {
1315
+ mid = (lo + hi) >> 1;
1316
+ mid_val = msea->starts[mid];
1317
+ if (n < mid_val) {
1318
+ hi = mid - 1;
1319
+ }
1320
+ else if (n > mid_val) {
1321
+ lo = mid + 1;
1322
+ }
1323
+ else { /* found a match */
1324
+ while (((mid+1) < msea->s_cnt)
1325
+ && (msea->starts[mid+1] == mid_val)) {
1326
+ mid++; /* scan to last match */
1327
+ }
1328
+ return mid;
1329
+ }
768
1330
  }
769
- }
770
- return hi;
1331
+ return hi;
771
1332
  }
772
1333
 
773
- static int msea_doc_freq(Searcher *self, Term *term)
1334
+ static int msea_doc_freq(Searcher *self, const char *field, const char *term)
774
1335
  {
775
- int i;
776
- int doc_freq = 0;
777
- Searcher *s;
778
- MultiSearcher *msea = (MultiSearcher *)self->data;
779
- for (i = 0; i < msea->s_cnt; i++) {
780
- s = msea->searchers[i];
781
- doc_freq += s->doc_freq(s, term);
782
- }
1336
+ int i;
1337
+ int doc_freq = 0;
1338
+ MultiSearcher *msea = MSEA(self);
1339
+ for (i = 0; i < msea->s_cnt; i++) {
1340
+ Searcher *s = msea->searchers[i];
1341
+ doc_freq += s->doc_freq(s, field, term);
1342
+ }
783
1343
 
784
- return doc_freq;
1344
+ return doc_freq;
785
1345
  }
786
1346
 
787
1347
  static Document *msea_get_doc(Searcher *self, int doc_num)
788
1348
  {
789
- MultiSearcher *msea = (MultiSearcher *)self->data;
790
- int i = msea_get_searcher_index(self, doc_num);
791
- Searcher *s = msea->searchers[i];
792
- return s->get_doc(s, doc_num - msea->starts[i]);
1349
+ MultiSearcher *msea = MSEA(self);
1350
+ int i = msea_get_searcher_index(self, doc_num);
1351
+ Searcher *s = msea->searchers[i];
1352
+ return s->get_doc(s, doc_num - msea->starts[i]);
1353
+ }
1354
+
1355
+ static LazyDoc *msea_get_lazy_doc(Searcher *self, int doc_num)
1356
+ {
1357
+ MultiSearcher *msea = MSEA(self);
1358
+ int i = msea_get_searcher_index(self, doc_num);
1359
+ Searcher *s = msea->searchers[i];
1360
+ return s->get_lazy_doc(s, doc_num - msea->starts[i]);
793
1361
  }
794
1362
 
795
1363
  static int msea_max_doc(Searcher *self)
796
1364
  {
797
- return ((MultiSearcher *)self->data)->max_doc;
1365
+ return MSEA(self)->max_doc;
1366
+ }
1367
+
1368
+ static int *msea_get_doc_freqs(Searcher *self, HashSet *terms)
1369
+ {
1370
+ int i;
1371
+ const int num_terms = terms->size;
1372
+ int *doc_freqs = ALLOC_N(int, num_terms);
1373
+ for (i = 0; i < num_terms; i++) {
1374
+ Term *t = (Term *)terms->elems[i];
1375
+ doc_freqs[i] = msea_doc_freq(self, t->field, t->text);
1376
+ }
1377
+ return doc_freqs;
798
1378
  }
799
1379
 
800
1380
  static Weight *msea_create_weight(Searcher *self, Query *query)
801
1381
  {
802
- int i, *dfs;
803
- Searcher *cdfsea;
804
- Weight *w;
805
- HshTable *df_map = h_new((hash_ft)&term_hash, (eq_ft)&term_eq,
806
- (free_ft)NULL, (free_ft)NULL);
807
- Query *rq = self->rewrite(self, query);
808
- HashSet *terms = term_set_create();
809
- rq->extract_terms(rq, terms);
810
- dfs = self->doc_freqs(self, (Term **)terms->elems, terms->size);
1382
+ int i, *doc_freqs;
1383
+ Searcher *cdfsea;
1384
+ Weight *w;
1385
+ HashTable *df_map = h_new((hash_ft)&term_hash, (eq_ft)&term_eq,
1386
+ (free_ft)NULL, free);
1387
+ Query *rewritten_query = self->rewrite(self, query);
1388
+ HashSet *terms = term_set_new();
811
1389
 
812
- for (i = 0; i < terms->size; i++) {
813
- h_set(df_map, terms->elems[i], (void *)dfs[i]);
814
- }
815
- /* don't destroy the individual terms, only the HashSet */
816
- hs_destroy(terms);
817
- free(dfs);
1390
+ rewritten_query->extract_terms(rewritten_query, terms);
1391
+ doc_freqs = msea_get_doc_freqs(self, terms);
818
1392
 
819
- cdfsea = cdfsea_create(df_map, ((MultiSearcher *)self->data)->max_doc);
1393
+ for (i = 0; i < terms->size; i++) {
1394
+ h_set(df_map, terms->elems[i], imalloc(doc_freqs[i]));
1395
+ }
1396
+ hs_destroy(terms);
1397
+ free(doc_freqs);
820
1398
 
821
- w = q_weight(rq, cdfsea);
822
- q_deref(rq);
823
- cdfsea->close(cdfsea);
1399
+ cdfsea = cdfsea_new(df_map, MSEA(self)->max_doc);
824
1400
 
825
- return w;
1401
+ w = q_weight(rewritten_query, cdfsea);
1402
+ q_deref(rewritten_query);
1403
+ cdfsea->close(cdfsea);
1404
+
1405
+ return w;
826
1406
  }
827
1407
 
828
1408
  struct MultiSearchEachArg {
829
- int start;
830
- void *arg;
831
- void (*fn)(Searcher *, int, float, void *);
1409
+ int start;
1410
+ void *arg;
1411
+ void (*fn)(Searcher *, int, float, void *);
832
1412
  };
833
1413
 
834
1414
  void msea_search_each_i(Searcher *self, int doc_num, float score, void *arg)
835
1415
  {
836
- struct MultiSearchEachArg *mse_arg = (struct MultiSearchEachArg *)arg;
1416
+ struct MultiSearchEachArg *mse_arg = (struct MultiSearchEachArg *)arg;
837
1417
 
838
- mse_arg->fn(self, doc_num + mse_arg->start, score, mse_arg->arg);
1418
+ mse_arg->fn(self, doc_num + mse_arg->start, score, mse_arg->arg);
839
1419
  }
840
1420
 
841
1421
  static void msea_search_each_w(Searcher *self, Weight *w, Filter *filter,
842
- void (*fn)(Searcher *, int, float, void *), void *arg)
1422
+ filter_ft filter_func,
1423
+ void (*fn)(Searcher *, int, float, void *),
1424
+ void *arg)
843
1425
  {
844
- int i;
845
- struct MultiSearchEachArg mse_arg;
846
- MultiSearcher *msea = (MultiSearcher *)self->data;
847
- Searcher *s;
1426
+ int i;
1427
+ struct MultiSearchEachArg mse_arg;
1428
+ MultiSearcher *msea = MSEA(self);
1429
+ Searcher *s;
848
1430
 
849
- mse_arg.fn = fn;
850
- mse_arg.arg = arg;
851
- for (i = 0; i < msea->s_cnt; i++) {
852
- s = msea->searchers[i];
853
- mse_arg.start = msea->starts[i];
854
- s->search_each_w(s, w, filter, &msea_search_each_i, &mse_arg);
855
- }
1431
+ mse_arg.fn = fn;
1432
+ mse_arg.arg = arg;
1433
+ for (i = 0; i < msea->s_cnt; i++) {
1434
+ s = msea->searchers[i];
1435
+ mse_arg.start = msea->starts[i];
1436
+ s->search_each_w(s, w, filter, filter_func,
1437
+ &msea_search_each_i, &mse_arg);
1438
+ }
856
1439
  }
857
1440
 
858
1441
  static void msea_search_each(Searcher *self, Query *query, Filter *filter,
859
- void (*fn)(Searcher *, int, float, void *), void *arg)
1442
+ filter_ft filter_func,
1443
+ void (*fn)(Searcher *, int, float, void *), void *arg)
860
1444
  {
861
- Weight *w = q_weight(query, self);
862
- msea_search_each_w(self, w, filter, fn, arg);
863
- w->destroy(w);
1445
+ Weight *w = q_weight(query, self);
1446
+ msea_search_each_w(self, w, filter, filter_func, fn, arg);
1447
+ w->destroy(w);
864
1448
  }
865
1449
 
866
1450
  struct MultiSearchArg {
867
- int total_hits, max_size;
868
- PriorityQueue *hq;
869
- void (*hq_insert)(PriorityQueue *pq, Hit *hit);
1451
+ int total_hits, max_size;
1452
+ PriorityQueue *hq;
1453
+ void (*hq_insert)(PriorityQueue *pq, Hit *hit);
870
1454
  };
871
1455
 
872
1456
  void msea_search_i(Searcher *self, int doc_num, float score, void *arg)
873
1457
  {
874
- struct MultiSearchArg *ms_arg = (struct MultiSearchArg *)arg;
875
- Hit hit;
876
-
877
- ms_arg->total_hits++;
878
- hit.doc = doc_num;
879
- hit.score = score;
880
- ms_arg->hq_insert(ms_arg->hq, &hit);
881
- }
882
-
883
- static TopDocs *msea_search(Searcher *self, Query *query, int first_doc,
884
- int num_docs, Filter *filter, Sort *sort)
885
- {
886
- int max_size = first_doc + num_docs;
887
- int i;
888
- Weight *weight;
889
- Hit **score_docs = NULL;
890
- BitVector *bits = (filter ? filter->get_bv(filter, self->ir) : NULL);
891
- Hit *(*hq_pop)(PriorityQueue *pq);
892
- void (*hq_insert)(PriorityQueue *pq, Hit *hit);
893
- void (*hq_destroy)(PriorityQueue *self);
894
- PriorityQueue *hq;
895
- struct MultiSearchArg ms_arg;
896
-
897
-
898
- if (num_docs <= 0)
899
- RAISE(ARG_ERROR, NUM_DOCS_ARG_ERROR_MSG);
900
-
901
- if (first_doc < 0)
902
- RAISE(ARG_ERROR, FIRST_DOC_ARG_ERROR_MSG);
903
-
904
- weight = q_weight(query, self);
905
- if (sort) {
906
- hq = fshq_pq_create(max_size, sort, self->ir);
907
- hq_pop = &fshq_pq_pop;
908
- hq_insert = &fshq_pq_insert;
909
- hq_destroy = &fshq_pq_destroy;
910
- } else {
911
- hq = pq_create(max_size, &hit_less_than);
912
- hq_pop = &hit_pq_pop;
913
- hq_insert = &hit_pq_insert;
914
- hq_destroy = &pq_destroy;
915
- }
916
-
917
-
918
- ms_arg.hq = hq;
919
- ms_arg.total_hits = 0;
920
- ms_arg.max_size = max_size;
921
- ms_arg.hq_insert = hq_insert;
922
-
923
- msea_search_each_w(self, weight, filter, msea_search_i, &ms_arg);
1458
+ struct MultiSearchArg *ms_arg = (struct MultiSearchArg *)arg;
1459
+ Hit hit;
1460
+ (void)self;
1461
+
1462
+ ms_arg->total_hits++;
1463
+ hit.doc = doc_num;
1464
+ hit.score = score;
1465
+ ms_arg->hq_insert(ms_arg->hq, &hit);
1466
+ }
1467
+
1468
+ static TopDocs *msea_search_w(Searcher *self,
1469
+ Weight *weight,
1470
+ int first_doc,
1471
+ int num_docs,
1472
+ Filter *filter,
1473
+ Sort *sort,
1474
+ filter_ft filter_func,
1475
+ bool load_fields)
1476
+ {
1477
+ int max_size = first_doc + num_docs;
1478
+ int i;
1479
+ int total_hits = 0;
1480
+ Hit **score_docs = NULL;
1481
+ Hit *(*hq_pop)(PriorityQueue *pq);
1482
+ void (*hq_insert)(PriorityQueue *pq, Hit *hit);
1483
+ PriorityQueue *hq;
1484
+ float max_score = 0.0;
1485
+ (void)load_fields; /* does it automatically */
1486
+
1487
+ sea_check_args(num_docs, first_doc);
1488
+
1489
+ if (sort) {
1490
+ hq = pq_new(max_size, (lt_ft)fdshq_lt, &free);
1491
+ hq_insert = (void (*)(PriorityQueue *pq, Hit *hit))&pq_insert;
1492
+ hq_pop = (Hit *(*)(PriorityQueue *pq))&pq_pop;
1493
+ }
1494
+ else {
1495
+ hq = pq_new(max_size, (lt_ft)&hit_less_than, &free);
1496
+ hq_insert = &hit_pq_multi_insert;
1497
+ hq_pop = &hit_pq_pop;
1498
+ }
924
1499
 
925
- weight->destroy(weight);
1500
+ /*if (sort) printf("sort = %s\n", sort_to_s(sort)); */
1501
+ for (i = 0; i < MSEA(self)->s_cnt; i++) {
1502
+ Searcher *s = MSEA(self)->searchers[i];
1503
+ TopDocs *td = s->search_w(s, weight, 0, max_size,
1504
+ filter, sort, filter_func, true);
1505
+ /*if (sort) printf("sort = %s\n", sort_to_s(sort)); */
1506
+ if (td->size > 0) {
1507
+ /*printf("td->size = %d %d\n", td->size, num_docs); */
1508
+ int j;
1509
+ int start = MSEA(self)->starts[i];
1510
+ for (j = 0; j < td->size; j++) {
1511
+ Hit *hit = td->hits[j];
1512
+ hit->doc += start;
1513
+ /*
1514
+ printf("adding hit = %d:%f\n", hit->doc, hit->score);
1515
+ */
1516
+ hq_insert(hq, hit);
1517
+ }
1518
+ td->size = 0;
1519
+ if (td->max_score > max_score) max_score = td->max_score;
1520
+ }
1521
+ total_hits += td->total_hits;
1522
+ td_destroy(td);
1523
+ }
926
1524
 
927
- if (hq->count > first_doc) {
928
- if ((hq->count - first_doc) < num_docs) {
929
- num_docs = hq->count - first_doc;
1525
+ if (hq->size > first_doc) {
1526
+ if ((hq->size - first_doc) < num_docs) {
1527
+ num_docs = hq->size - first_doc;
1528
+ }
1529
+ score_docs = ALLOC_N(Hit *, num_docs);
1530
+ for (i = num_docs - 1; i >= 0; i--) {
1531
+ score_docs[i] = hq_pop(hq);
1532
+ /*
1533
+ Hit *hit = score_docs[i] = hq_pop(hq);
1534
+ printf("popped hit = %d-->%f\n", hit->doc, hit->score);
1535
+ */
1536
+ }
930
1537
  }
931
- score_docs = ALLOC_N(Hit *, num_docs);
932
- for (i = num_docs - 1; i >= 0; i--) {
933
- score_docs[i] = hq_pop(hq);
934
- //hit = score_docs[i] = pq_pop(hq);
935
- //printf("hit = %d-->%f\n", hit->doc, hit->score);
1538
+ else {
1539
+ num_docs = 0;
936
1540
  }
937
- } else {
938
- num_docs = 0;
939
- }
940
- pq_clear(hq);
941
- hq_destroy(hq);
1541
+ pq_clear(hq);
1542
+ pq_destroy(hq);
1543
+
1544
+ return td_new(total_hits, num_docs, score_docs, max_score);
1545
+ }
942
1546
 
943
- if (bits) bv_destroy(bits);
944
- return td_create(ms_arg.total_hits, num_docs, score_docs);
1547
+ static TopDocs *msea_search(Searcher *self,
1548
+ Query *query,
1549
+ int first_doc,
1550
+ int num_docs,
1551
+ Filter *filter,
1552
+ Sort *sort,
1553
+ filter_ft filter_func,
1554
+ bool load_fields)
1555
+ {
1556
+ TopDocs *td;
1557
+ Weight *weight = q_weight(query, self);
1558
+ td = msea_search_w(self, weight, first_doc, num_docs, filter,
1559
+ sort, filter_func, load_fields);
1560
+ weight->destroy(weight);
1561
+ return td;
945
1562
  }
946
1563
 
947
1564
  static Query *msea_rewrite(Searcher *self, Query *original)
948
1565
  {
949
- int i;
950
- Searcher *s;
951
- MultiSearcher *msea = (MultiSearcher *)self->data;
952
- Query **queries = ALLOC_N(Query *, msea->s_cnt), *rewritten;
1566
+ int i;
1567
+ Searcher *s;
1568
+ MultiSearcher *msea = MSEA(self);
1569
+ Query **queries = ALLOC_N(Query *, msea->s_cnt), *rewritten;
953
1570
 
954
- for (i = 0; i < msea->s_cnt; i++) {
955
- s = msea->searchers[i];
956
- queries[i] = s->rewrite(s, original);
957
- }
958
- rewritten = q_combine(queries, msea->s_cnt);
1571
+ for (i = 0; i < msea->s_cnt; i++) {
1572
+ s = msea->searchers[i];
1573
+ queries[i] = s->rewrite(s, original);
1574
+ }
1575
+ rewritten = q_combine(queries, msea->s_cnt);
959
1576
 
960
- for (i = 0; i < msea->s_cnt; i++) {
961
- q_deref(queries[i]);
962
- }
963
- free(queries);
964
- return rewritten;
1577
+ for (i = 0; i < msea->s_cnt; i++) {
1578
+ q_deref(queries[i]);
1579
+ }
1580
+ free(queries);
1581
+ return rewritten;
965
1582
  }
966
1583
 
967
1584
  static Explanation *msea_explain(Searcher *self, Query *query, int doc_num)
968
1585
  {
969
- MultiSearcher *msea = (MultiSearcher *)self->data;
970
- int i = msea_get_searcher_index(self, doc_num);
971
- Weight *w = q_weight(query, self);
972
- Searcher *s = msea->searchers[i];
973
- Explanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
974
- w->destroy(w);
975
- return e;
1586
+ MultiSearcher *msea = MSEA(self);
1587
+ int i = msea_get_searcher_index(self, doc_num);
1588
+ Weight *w = q_weight(query, self);
1589
+ Searcher *s = msea->searchers[i];
1590
+ Explanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
1591
+ w->destroy(w);
1592
+ return e;
976
1593
  }
977
1594
 
978
1595
  static Explanation *msea_explain_w(Searcher *self, Weight *w, int doc_num)
979
1596
  {
980
- MultiSearcher *msea = (MultiSearcher *)self->data;
981
- int i = msea_get_searcher_index(self, doc_num);
982
- Searcher *s = msea->searchers[i];
983
- Explanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
984
- return e;
1597
+ MultiSearcher *msea = MSEA(self);
1598
+ int i = msea_get_searcher_index(self, doc_num);
1599
+ Searcher *s = msea->searchers[i];
1600
+ Explanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
1601
+ return e;
1602
+ }
1603
+
1604
+ static TermVector *msea_get_term_vector(Searcher *self, const int doc_num,
1605
+ const char *field)
1606
+ {
1607
+ MultiSearcher *msea = MSEA(self);
1608
+ int i = msea_get_searcher_index(self, doc_num);
1609
+ Searcher *s = msea->searchers[i];
1610
+ return s->get_term_vector(s, doc_num - msea->starts[i],
1611
+ field);
985
1612
  }
986
1613
 
987
1614
  static Similarity *msea_get_similarity(Searcher *self)
988
1615
  {
989
- return self->similarity;
1616
+ return self->similarity;
990
1617
  }
991
1618
 
992
1619
  static void msea_close(Searcher *self)
993
1620
  {
994
- int i;
995
- Searcher *s;
996
- MultiSearcher *msea = (MultiSearcher *)self->data;
997
- if (msea->close_subs) {
998
- for (i = 0; i < msea->s_cnt; i++) {
999
- s = msea->searchers[i];
1000
- s->close(s);
1621
+ int i;
1622
+ Searcher *s;
1623
+ MultiSearcher *msea = MSEA(self);
1624
+ if (msea->close_subs) {
1625
+ for (i = 0; i < msea->s_cnt; i++) {
1626
+ s = msea->searchers[i];
1627
+ s->close(s);
1628
+ }
1629
+ free(msea->searchers);
1001
1630
  }
1002
- free(msea->searchers);
1003
- }
1004
- free(msea->starts);
1005
- free(msea);
1006
- free(self);
1631
+ free(msea->starts);
1632
+ free(self);
1007
1633
  }
1008
1634
 
1009
- Searcher *msea_create(Searcher **searchers, int s_cnt, bool close_subs)
1635
+ Searcher *msea_new(Searcher **searchers, int s_cnt, bool close_subs)
1010
1636
  {
1011
- int i, max_doc = 0, *starts;
1012
- Searcher *self = ALLOC(Searcher);
1013
-
1014
- MultiSearcher *msea = ALLOC(MultiSearcher);
1015
-
1016
- starts = ALLOC_N(int, s_cnt + 1);
1017
- for (i = 0; i < s_cnt; i++) {
1637
+ int i, max_doc = 0;
1638
+ Searcher *self = (Searcher *)ecalloc(sizeof(MultiSearcher));
1639
+ int *starts = ALLOC_N(int, s_cnt + 1);
1640
+ for (i = 0; i < s_cnt; i++) {
1641
+ starts[i] = max_doc;
1642
+ max_doc += searchers[i]->max_doc(searchers[i]);
1643
+ }
1018
1644
  starts[i] = max_doc;
1019
- max_doc += searchers[i]->max_doc(searchers[i]);
1020
- }
1021
- starts[i] = max_doc;
1022
-
1023
- msea->s_cnt = s_cnt;
1024
- msea->searchers = searchers;
1025
- msea->starts = starts;
1026
- msea->max_doc = max_doc;
1027
- msea->close_subs = close_subs;
1028
- self->data = msea;
1029
-
1030
- self->ir = (IndexReader *)NULL;
1031
- self->similarity = sim_create_default();
1032
- self->doc_freq = &msea_doc_freq;
1033
- self->doc_freqs = &ss_doc_freqs;
1034
- self->get_doc = &msea_get_doc;
1035
- self->max_doc = &msea_max_doc;
1036
- self->create_weight = &msea_create_weight;
1037
- self->search = &msea_search;
1038
- self->search_each = &msea_search_each;
1039
- self->search_each_w = &msea_search_each_w;
1040
- self->rewrite = &msea_rewrite;
1041
- self->explain = &msea_explain;
1042
- self->explain_w = &msea_explain_w;
1043
- self->get_similarity = &msea_get_similarity;
1044
- self->close = &msea_close;
1045
- return self;
1645
+
1646
+ MSEA(self)->s_cnt = s_cnt;
1647
+ MSEA(self)->searchers = searchers;
1648
+ MSEA(self)->starts = starts;
1649
+ MSEA(self)->max_doc = max_doc;
1650
+ MSEA(self)->close_subs = close_subs;
1651
+
1652
+ self->similarity = sim_create_default();
1653
+ self->doc_freq = &msea_doc_freq;
1654
+ self->get_doc = &msea_get_doc;
1655
+ self->get_lazy_doc = &msea_get_lazy_doc;
1656
+ self->max_doc = &msea_max_doc;
1657
+ self->create_weight = &msea_create_weight;
1658
+ self->search = &msea_search;
1659
+ self->search_w = &msea_search_w;
1660
+ self->search_each = &msea_search_each;
1661
+ self->search_each_w = &msea_search_each_w;
1662
+ self->rewrite = &msea_rewrite;
1663
+ self->explain = &msea_explain;
1664
+ self->explain_w = &msea_explain_w;
1665
+ self->get_term_vector = &msea_get_term_vector;
1666
+ self->get_similarity = &msea_get_similarity;
1667
+ self->close = &msea_close;
1668
+ return self;
1046
1669
  }