ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/search.c CHANGED
@@ -1,8 +1,6 @@
1
1
  #include <string.h>
2
2
  #include "search.h"
3
-
4
- static char * const NUM_DOCS_ARG_ERROR_MSG = "num_docs must be > 0 to run a search";
5
- static char * const FIRST_DOC_ARG_ERROR_MSG = "first_doc must be >= 0 to run a search";
3
+ #include "array.h"
6
4
 
7
5
  /***************************************************************************
8
6
  *
@@ -10,67 +8,65 @@ static char * const FIRST_DOC_ARG_ERROR_MSG = "first_doc must be >= 0 to run a s
10
8
  *
11
9
  ***************************************************************************/
12
10
 
13
- Explanation *expl_create(float value, char *description)
11
+ Explanation *expl_new(float value, const char *description, ...)
14
12
  {
15
- Explanation *self = ALLOC(Explanation);
16
- self->value = value;
17
- self->description = description;
18
- self->dcnt = 0;
19
- self->dcapa = EXPLANATION_DETAILS_START_SIZE;
20
- self->details = ALLOC_N(Explanation *, EXPLANATION_DETAILS_START_SIZE);
21
- return self;
13
+ Explanation *expl = ALLOC(Explanation);
14
+
15
+ va_list args;
16
+ va_start(args, description);
17
+ expl->description = vstrfmt(description, args);
18
+ va_end(args);
19
+
20
+ expl->value = value;
21
+ expl->details = ary_new_type_capa(Explanation *,
22
+ EXPLANATION_DETAILS_START_SIZE);
23
+ return expl;
22
24
  }
23
25
 
24
- void expl_destoy(void *p)
26
+ void expl_destroy(Explanation *expl)
25
27
  {
26
- Explanation *expl = (Explanation *)p;
27
- int i;
28
- for (i = 0; i < expl->dcnt; i++) {
29
- expl_destoy(expl->details[i]);
30
- }
31
- free(expl->details);
32
- free(expl->description);
33
- free(expl);
28
+ ary_destroy((void **)expl->details, (free_ft)expl_destroy);
29
+ free(expl->description);
30
+ free(expl);
34
31
  }
35
32
 
36
- Explanation *expl_add_detail(Explanation *self, Explanation *detail)
33
+ Explanation *expl_add_detail(Explanation *expl, Explanation *detail)
37
34
  {
38
- if (self->dcnt >= self->dcapa) {
39
- self->dcapa *= 2;
40
- REALLOC_N(self->details, Explanation *, self->dcapa);
41
- }
42
- self->details[self->dcnt] = detail;
43
- self->dcnt++;
44
- return self;
35
+ ary_push(expl->details, detail);
36
+ return expl;
45
37
  }
46
38
 
47
- char *expl_to_s(Explanation *self, int depth)
39
+ char *expl_to_s_depth(Explanation *expl, int depth)
48
40
  {
49
- int i;
50
- char *buffer = ALLOC_N(char, depth * 2 + 1);
51
- memset(buffer, ' ', sizeof(char) * depth * 2);
52
- buffer[depth*2] = 0;
41
+ int i;
42
+ char *buffer = ALLOC_N(char, depth * 2 + 1);
43
+ const int num_details = ary_size(expl->details);
44
+
45
+ memset(buffer, ' ', sizeof(char) * depth * 2);
46
+ buffer[depth*2] = 0;
53
47
 
54
- buffer = estrcat(buffer, strfmt("%f = %s\n", self->value, self->description));
55
- for (i = 0; i < self->dcnt; i++) {
56
- buffer = estrcat(buffer, expl_to_s(self->details[i], depth + 1));
57
- }
48
+ buffer = estrcat(buffer, strfmt("%f = %s\n", expl->value, expl->description));
49
+ for (i = 0; i < num_details; i++) {
50
+ buffer = estrcat(buffer, expl_to_s_depth(expl->details[i], depth + 1));
51
+ }
58
52
 
59
- return buffer;
53
+ return buffer;
60
54
  }
61
55
 
62
- char *expl_to_html(Explanation *self)
56
+ char *expl_to_html(Explanation *expl)
63
57
  {
64
- int i;
65
- char *buffer;
66
- buffer = strfmt("<ul>\n<li>%f = %s</li>\n", self->value, self->description);
58
+ int i;
59
+ char *buffer;
60
+ const int num_details = ary_size(expl->details);
61
+
62
+ buffer = strfmt("<ul>\n<li>%f = %s</li>\n", expl->value, expl->description);
67
63
 
68
- for (i = 0; i < self->dcnt; i++) {
69
- estrcat(buffer, expl_to_html(self->details[i]));
70
- }
64
+ for (i = 0; i < num_details; i++) {
65
+ estrcat(buffer, expl_to_html(expl->details[i]));
66
+ }
71
67
 
72
- REALLOC_N(buffer, char, strlen(buffer) + 10);
73
- return strcat(buffer, "</ul>\n");
68
+ REALLOC_N(buffer, char, strlen(buffer) + 10);
69
+ return strcat(buffer, "</ul>\n");
74
70
  }
75
71
 
76
72
  /***************************************************************************
@@ -79,88 +75,104 @@ char *expl_to_html(Explanation *self)
79
75
  *
80
76
  ***************************************************************************/
81
77
 
82
- bool hit_less_than(void *hit1, void *hit2)
78
+ static bool hit_less_than(const Hit *hit1, const Hit *hit2)
83
79
  {
84
- if (((Hit *)hit1)->score == ((Hit *)hit2)->score) {
85
- return ((Hit *)hit1)->doc > ((Hit *)hit2)->doc;
86
- } else {
87
- return ((Hit *)hit1)->score < ((Hit *)hit2)->score;
88
- }
80
+ if (hit1->score == hit2->score) {
81
+ return hit1->doc > hit2->doc;
82
+ }
83
+ else {
84
+ return hit1->score < hit1->score;
85
+ }
89
86
  }
90
87
 
91
- inline bool hit_lt(Hit *hit1, Hit *hit2)
88
+ static bool hit_lt(Hit *hit1, Hit *hit2)
92
89
  {
93
- if (hit1->score == hit2->score) {
94
- return hit1->doc > hit2->doc;
95
- } else {
96
- return hit1->score < hit2->score;
97
- }
90
+ if (hit1->score == hit2->score) {
91
+ return hit1->doc > hit2->doc;
92
+ }
93
+ else {
94
+ return hit1->score < hit2->score;
95
+ }
98
96
  }
99
97
 
100
- void hit_pq_down(PriorityQueue *pq)
98
+ static void hit_pq_down(PriorityQueue *pq)
101
99
  {
102
- register int i = 1;
103
- register int j = 2; //i << 1;
104
- register int k = 3; //j + 1;
105
- Hit **heap = (Hit **)pq->heap;
106
- Hit *node = heap[i]; // save top node
100
+ register int i = 1;
101
+ register int j = 2; /* i << 1; */
102
+ register int k = 3; /* j + 1; */
103
+ Hit **heap = (Hit **)pq->heap;
104
+ Hit *node = heap[i]; /* save top node */
107
105
 
108
- if ((k <= pq->count) && hit_lt(heap[k], heap[j]))
109
- j = k;
106
+ if ((k <= pq->size) && hit_lt(heap[k], heap[j])) {
107
+ j = k;
108
+ }
110
109
 
111
- while ((j <= pq->count) && hit_lt(heap[j], node)) {
112
- heap[i] = heap[j]; // shift up child
113
- i = j;
114
- j = i << 1;
115
- k = j + 1;
116
- if ((k <= pq->count) && hit_lt(heap[k], heap[j]))
117
- j = k;
118
- }
119
- heap[i] = node;
110
+ while ((j <= pq->size) && hit_lt(heap[j], node)) {
111
+ heap[i] = heap[j]; /* shift up child */
112
+ i = j;
113
+ j = i << 1;
114
+ k = j + 1;
115
+ if ((k <= pq->size) && hit_lt(heap[k], heap[j])) {
116
+ j = k;
117
+ }
118
+ }
119
+ heap[i] = node;
120
120
  }
121
121
 
122
- Hit *hit_pq_pop(PriorityQueue *pq)
122
+ static Hit *hit_pq_pop(PriorityQueue *pq)
123
123
  {
124
- if (pq->count > 0) {
125
- Hit *result = (Hit *)pq->heap[1]; // save first value
126
- pq->heap[1] = pq->heap[pq->count]; // move last to first
127
- pq->heap[pq->count] = NULL;
128
- pq->count--;
129
- hit_pq_down(pq); // adjust heap
130
- return result;
131
- } else {
132
- return NULL;
133
- }
124
+ if (pq->size > 0) {
125
+ Hit *result = (Hit *)pq->heap[1]; /* save first value */
126
+ pq->heap[1] = pq->heap[pq->size]; /* move last to first */
127
+ pq->heap[pq->size] = NULL;
128
+ pq->size--;
129
+ hit_pq_down(pq); /* adjust heap */
130
+ return result;
131
+ }
132
+ else {
133
+ return NULL;
134
+ }
134
135
  }
135
136
 
136
- inline void hit_pq_up(PriorityQueue *pq)
137
+ static void hit_pq_up(PriorityQueue *pq)
137
138
  {
138
- Hit **heap = (Hit **)pq->heap;
139
- Hit *node;
140
- int i = pq->count;
141
- int j = i >> 1;
142
- node = heap[i];
139
+ Hit **heap = (Hit **)pq->heap;
140
+ Hit *node;
141
+ int i = pq->size;
142
+ int j = i >> 1;
143
+ node = heap[i];
144
+
145
+ while ((j > 0) && hit_lt(node, heap[j])) {
146
+ heap[i] = heap[j];
147
+ i = j;
148
+ j = j >> 1;
149
+ }
150
+ heap[i] = node;
151
+ }
143
152
 
144
- while ((j > 0) && hit_lt(node, heap[j])) {
145
- heap[i] = heap[j];
146
- i = j;
147
- j = j >> 1;
148
- }
149
- heap[i] = node;
153
+ static void hit_pq_insert(PriorityQueue *pq, Hit *hit)
154
+ {
155
+ if (pq->size < pq->capa) {
156
+ Hit *new_hit = ALLOC(Hit);
157
+ memcpy(new_hit, hit, sizeof(Hit));
158
+ pq->size++;
159
+ if (pq->size >= pq->mem_capa) {
160
+ pq->mem_capa <<= 1;
161
+ REALLOC_N(pq->heap, void *, pq->mem_capa);
162
+ }
163
+ pq->heap[pq->size] = new_hit;
164
+ hit_pq_up(pq);
165
+ }
166
+ else if (pq->size > 0 && hit_lt((Hit *)pq->heap[1], hit)) {
167
+ memcpy(pq->heap[1], hit, sizeof(Hit));
168
+ hit_pq_down(pq);
169
+ }
150
170
  }
151
171
 
152
- void hit_pq_insert(PriorityQueue *pq, Hit *hit)
172
+ static void hit_pq_multi_insert(PriorityQueue *pq, Hit *hit)
153
173
  {
154
- if (pq->count < pq->size) {
155
- Hit *new_hit = ALLOC(Hit);
156
- memcpy(new_hit, hit, sizeof(Hit));
157
- pq->count++;
158
- pq->heap[pq->count] = new_hit;
159
- hit_pq_up(pq);
160
- } else if (pq->count > 0 && hit_lt((Hit *)pq->heap[1], hit)) {
161
- memcpy(pq->heap[1], hit, sizeof(Hit));
162
- hit_pq_down(pq);
163
- }
174
+ hit_pq_insert(pq, hit);
175
+ free(hit);
164
176
  }
165
177
 
166
178
  /***************************************************************************
@@ -169,35 +181,38 @@ void hit_pq_insert(PriorityQueue *pq, Hit *hit)
169
181
  *
170
182
  ***************************************************************************/
171
183
 
172
- TopDocs *td_create(int total_hits, int size, Hit **hits)
184
+ TopDocs *td_new(int total_hits, int size, Hit **hits, float max_score)
173
185
  {
174
- TopDocs *td = ALLOC(TopDocs);
175
- td->total_hits = total_hits;
176
- td->size = size;
177
- td->hits = hits;
178
- return td;
186
+ TopDocs *td = ALLOC(TopDocs);
187
+ td->total_hits = total_hits;
188
+ td->size = size;
189
+ td->hits = hits;
190
+ td->max_score = max_score;
191
+ return td;
179
192
  }
180
193
 
181
194
  void td_destroy(TopDocs *td)
182
195
  {
183
- int i;
184
- for (i = 0; i < td->size; i++) {
185
- free(td->hits[i]);
186
- }
187
- free(td->hits);
188
- free(td);
196
+ int i;
197
+
198
+ for (i = 0; i < td->size; i++) {
199
+ free(td->hits[i]);
200
+ }
201
+ free(td->hits);
202
+ free(td);
189
203
  }
190
204
 
191
205
  char *td_to_s(TopDocs *td)
192
206
  {
193
- int i;
194
- Hit *hit;
195
- char *buffer = strfmt("%d hits sorted by <score, doc_num>\n", td->total_hits);
196
- for (i = 0; i < td->size; i++) {
197
- hit = td->hits[i];
198
- estrcat(buffer, strfmt("\t%d:%f\n", hit->doc, hit->score));
199
- }
200
- return buffer;
207
+ int i;
208
+ Hit *hit;
209
+ char *buffer = strfmt("%d hits sorted by <score, doc_num>\n",
210
+ td->total_hits);
211
+ for (i = 0; i < td->size; i++) {
212
+ hit = td->hits[i];
213
+ estrcat(buffer, strfmt("\t%d:%f\n", hit->doc, hit->score));
214
+ }
215
+ return buffer;
201
216
  }
202
217
 
203
218
  /***************************************************************************
@@ -208,44 +223,50 @@ char *td_to_s(TopDocs *td)
208
223
 
209
224
  Query *w_get_query(Weight *self)
210
225
  {
211
- return self->query;
226
+ return self->query;
212
227
  }
213
228
 
214
229
  float w_get_value(Weight *self)
215
230
  {
216
- return self->value;
231
+ return self->value;
217
232
  }
218
233
 
219
234
  float w_sum_of_squared_weights(Weight *self)
220
235
  {
221
- self->qweight = self->idf * self->query->boost;
222
- return self->qweight * self->qweight; // square it
236
+ self->qweight = self->idf * self->query->boost;
237
+ return self->qweight * self->qweight; /* square it */
223
238
  }
224
239
 
225
240
  void w_normalize(Weight *self, float normalization_factor)
226
241
  {
227
- self->qnorm = normalization_factor;
228
- self->qweight *= normalization_factor; // normalize query weight
229
- self->value = self->qweight * self->idf; // idf for document
242
+ self->qnorm = normalization_factor;
243
+ self->qweight *= normalization_factor; /* normalize query weight */
244
+ self->value = self->qweight * self->idf;/* idf for document */
230
245
  }
231
246
 
232
247
  void w_destroy(Weight *self)
233
248
  {
234
- q_deref(self->query);
235
- free(self);
249
+ q_deref(self->query);
250
+ free(self);
236
251
  }
237
252
 
238
- Weight *w_create(Query *query)
253
+ Weight *w_create(size_t size, Query *query)
239
254
  {
240
- Weight *self = ALLOC_AND_ZERO_N(Weight, 1);
241
- ref(query);
242
- self->query = query;
243
-
244
- self->get_query = &w_get_query;
245
- self->get_value = &w_get_value;
246
- self->normalize = &w_normalize;
247
- self->destroy = &w_destroy;
248
- return self;
255
+ Weight *self = (Weight *)ecalloc(size);
256
+ #ifdef DEBUG
257
+ if (size < sizeof(Weight)) {
258
+ RAISE(ERROR, "size of weight <%d> should be at least <%d>",
259
+ (int)size, (int)sizeof(Weight));
260
+ }
261
+ #endif
262
+ REF(query);
263
+ self->query = query;
264
+ self->get_query = &w_get_query;
265
+ self->get_value = &w_get_value;
266
+ self->normalize = &w_normalize;
267
+ self->destroy = &w_destroy;
268
+ self->sum_of_squared_weights = &w_sum_of_squared_weights;
269
+ return self;
249
270
  }
250
271
 
251
272
  /***************************************************************************
@@ -254,128 +275,181 @@ Weight *w_create(Query *query)
254
275
  *
255
276
  ***************************************************************************/
256
277
 
257
- Similarity *q_get_similarity_i(Query *self, Searcher *searcher)
258
- {
259
- return searcher->get_similarity(searcher);
260
- }
278
+ static const char *QUERY_NAMES[] = {
279
+ "TermQuery",
280
+ "MultiTermQuery",
281
+ "BooleanQuery",
282
+ "PhraseQuery",
283
+ "MultiPhraseQuery",
284
+ "ConstantScoreQuery",
285
+ "FilteredQuery",
286
+ "MatchAllQuery",
287
+ "RangeQuery",
288
+ "WildCardQuery",
289
+ "FuzzyQuery",
290
+ "PrefixQuery",
291
+ "SpanTermQuery",
292
+ "SpanFirstQuery",
293
+ "SpanOrQuery",
294
+ "SpanNotQuery",
295
+ "SpanNearQuery"
296
+ };
261
297
 
262
- Query *q_rewrite(Query *self, IndexReader *ir)
263
- {
264
- self->ref_cnt++;
265
- return self;
298
+ static const char *UNKNOWN_QUERY_NAME = "UnkownQuery";
299
+
300
+ const char *q_get_query_name(enum QUERY_TYPE type) {
301
+ if (type >= NELEMS(QUERY_NAMES)) {
302
+ return UNKNOWN_QUERY_NAME;
303
+ }
304
+ else {
305
+ return QUERY_NAMES[type];
306
+ }
266
307
  }
267
308
 
268
- Weight *q_weight(Query *self, Searcher *searcher)
309
+ static Query *q_rewrite(Query *self, IndexReader *ir)
269
310
  {
270
- Query *query = searcher->rewrite(searcher, self);
271
- Weight *weight = query->create_weight_i(query, searcher);
272
- float sum = weight->sum_of_squared_weights(weight);
273
- Similarity *sim = query->get_similarity(query, searcher);
274
- float norm = sim_query_norm(sim, sum);
275
- q_deref(query);
276
-
277
- weight->normalize(weight, norm);
278
- return self->weight = weight;
311
+ (void)ir;
312
+ self->ref_cnt++;
313
+ return self;
279
314
  }
280
315
 
281
- Weight *q_create_weight_unsup(Query *self, Searcher *searcher)
316
+ static void q_extract_terms(Query *self, HashSet *terms)
282
317
  {
283
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
284
- return NULL;
318
+ /* do nothing by default */
319
+ (void)self;
320
+ (void)terms;
285
321
  }
286
322
 
287
- void q_destroy_i(Query *self)
323
+ Similarity *q_get_similarity_i(Query *self, Searcher *searcher)
288
324
  {
289
- free(self);
325
+ (void)self;
326
+ return searcher->get_similarity(searcher);
290
327
  }
291
328
 
292
- void q_extract_terms(Query *self, HashSet *terms)
329
+ void q_destroy_i(Query *self)
293
330
  {
294
- /* do nothing by default */
331
+ free(self);
295
332
  }
296
333
 
297
334
  void q_deref(Query *self)
298
335
  {
299
- if (--self->ref_cnt == 0) {
300
- self->destroy_i(self);
301
- }
336
+ if (--(self->ref_cnt) == 0) {
337
+ self->destroy_i(self);
338
+ }
302
339
  }
303
340
 
304
- Query *q_create()
341
+ Weight *q_create_weight_unsup(Query *self, Searcher *searcher)
305
342
  {
306
- Query *self = ALLOC(Query);
307
- ZEROSET(self, Query, 1);
308
- self->destroy_all = true;
309
- self->boost = 1.0;
310
- self->rewrite = &q_rewrite;
311
- self->get_similarity = &q_get_similarity_i;
312
- self->extract_terms = &q_extract_terms;
313
- self->weight = NULL;
314
- self->ref_cnt = 1;
315
- return self;
343
+ (void)self;
344
+ (void)searcher;
345
+ RAISE(UNSUPPORTED_ERROR,
346
+ "Create weight is unsupported for this type of query");
347
+ return NULL;
316
348
  }
317
349
 
318
- uint q_hash(Query *self)
350
+ Weight *q_weight(Query *self, Searcher *searcher)
319
351
  {
320
- return (self->hash(self) << 4) | self->type;
321
- }
352
+ Query *query = searcher->rewrite(searcher, self);
353
+ Weight *weight = query->create_weight_i(query, searcher);
354
+ float sum = weight->sum_of_squared_weights(weight);
355
+ Similarity *sim = query->get_similarity(query, searcher);
356
+ float norm = sim_query_norm(sim, sum);
357
+ q_deref(query);
322
358
 
323
- int q_eq(Query *self, Query *o)
324
- {
325
- return (self == o) || ((self->type == o->type) &&
326
- (self->boost == o->boost) &&
327
- self->eq(self, o));
359
+ weight->normalize(weight, norm);
360
+ return self->weight = weight;
328
361
  }
329
362
 
363
+ #define BQ(query) ((BooleanQuery *)(query))
330
364
  Query *q_combine(Query **queries, int q_cnt)
331
365
  {
332
- int i;
333
- Query *q, *ret_q;
334
- HashSet *uniques =
335
- hs_create((hash_ft)&q_hash, (eq_ft)&q_eq, NULL);
336
-
337
- for (i = 0; i < q_cnt; i++) {
338
- q = queries[i];
339
- if (q->type == BOOLEAN_QUERY) {
340
- int j;
341
- bool splittable = true;
342
- BooleanQuery *bq = (BooleanQuery *)q->data;
343
- if (bq->coord_disabled == false) {
344
- splittable = false;
345
- } else {
346
- for (j = 0; j < bq->clause_cnt; j++) {
347
- if (bq->clauses[j]->occur != BC_SHOULD) {
348
- splittable = false;
349
- break;
350
- }
366
+ int i;
367
+ Query *q, *ret_q;
368
+ HashSet *uniques = hs_new((hash_ft)&q_hash, (eq_ft)&q_eq, NULL);
369
+
370
+ for (i = 0; i < q_cnt; i++) {
371
+ q = queries[i];
372
+ if (q->type == BOOLEAN_QUERY) {
373
+ int j;
374
+ bool splittable = true;
375
+ if (BQ(q)->coord_disabled == false) {
376
+ splittable = false;
377
+ }
378
+ else {
379
+ for (j = 0; j < BQ(q)->clause_cnt; j++) {
380
+ if (BQ(q)->clauses[j]->occur != BC_SHOULD) {
381
+ splittable = false;
382
+ break;
383
+ }
384
+ }
385
+ }
386
+ if (splittable) {
387
+ for (j = 0; j < BQ(q)->clause_cnt; j++) {
388
+ Query *sub_q = BQ(q)->clauses[j]->query;
389
+ hs_add(uniques, sub_q);
390
+ }
391
+ }
392
+ else {
393
+ hs_add(uniques, q);
394
+ }
351
395
  }
352
- }
353
- if (splittable) {
354
- for (j = 0; j < bq->clause_cnt; j++) {
355
- q = bq->clauses[j]->query;
356
- hs_add(uniques, q);
396
+ else {
397
+ hs_add(uniques, q);
357
398
  }
358
- } else {
359
- hs_add(uniques, q);
360
- }
361
- } else {
362
- hs_add(uniques, q);
363
399
  }
364
- }
365
- if (uniques->size == 1) {
366
- ret_q = (Query *)uniques->elems[0];
367
- ref(ret_q);
368
- } else {
369
- ret_q = bq_create(true);
370
- for (i = 0; i < uniques->size; i++) {
371
- q = (Query *)uniques->elems[i];
372
- ref(q);
373
- bq_add_query(ret_q, q, BC_SHOULD);
400
+ if (uniques->size == 1) {
401
+ ret_q = (Query *)uniques->elems[0];
402
+ REF(ret_q);
403
+ }
404
+ else {
405
+ ret_q = bq_new(true);
406
+ for (i = 0; i < uniques->size; i++) {
407
+ q = (Query *)uniques->elems[i];
408
+ bq_add_query(ret_q, q, BC_SHOULD);
409
+ }
374
410
  }
375
- }
376
- hs_destroy(uniques);
411
+ hs_destroy(uniques);
377
412
 
378
- return ret_q;
413
+ return ret_q;
414
+ }
415
+
416
+ ulong q_hash(Query *self)
417
+ {
418
+ return (self->hash(self) << 5) | self->type;
419
+ }
420
+
421
+ int q_eq(Query *self, Query *o)
422
+ {
423
+ return (self == o)
424
+ || ((self->type == o->type)
425
+ && (self->boost == o->boost)
426
+ && self->eq(self, o));
427
+ }
428
+
429
+ static MatchVector *q_get_matchv_i(Query *self, MatchVector *mv, TermVector *tv)
430
+ {
431
+ /* be default we don't add any matches */
432
+ (void)self; (void)tv;
433
+ return mv;
434
+ }
435
+
436
+ Query *q_create(size_t size)
437
+ {
438
+ Query *self = (Query *)ecalloc(size);
439
+ #ifdef DEBUG
440
+ if (size < sizeof(Query)) {
441
+ RAISE(ERROR, "Size of a query <%d> should never be smaller than the "
442
+ "size of a Query struct <%d>", (int)size, (int)sizeof(Query));
443
+ }
444
+ #endif
445
+ self->boost = 1.0;
446
+ self->rewrite = &q_rewrite;
447
+ self->get_similarity = &q_get_similarity_i;
448
+ self->extract_terms = &q_extract_terms;
449
+ self->get_matchv_i = &q_get_matchv_i;
450
+ self->weight = NULL;
451
+ self->ref_cnt = 1;
452
+ return self;
379
453
  }
380
454
 
381
455
  /***************************************************************************
@@ -384,36 +458,154 @@ Query *q_combine(Query **queries, int q_cnt)
384
458
  *
385
459
  ***************************************************************************/
386
460
 
387
- void scorer_destroy_i(Scorer *self)
461
+ void scorer_destroy_i(Scorer *scorer)
388
462
  {
389
- free(self->data);
390
- free(self);
463
+ free(scorer);
391
464
  }
392
465
 
393
- Scorer *scorer_create(Similarity *similarity)
466
+ Scorer *scorer_create(size_t size, Similarity *similarity)
394
467
  {
395
- Scorer *self = ALLOC(Scorer);
396
- self->destroy = &scorer_destroy_i;
397
- self->data = NULL;
398
- self->similarity = similarity;
399
- return self;
468
+ Scorer *self = (Scorer *)ecalloc(size);
469
+ #ifdef DEBUG
470
+ if (size < sizeof(Scorer)) {
471
+ RAISE(ERROR, "size of scorer <%d> should be at least <%d>",
472
+ (int)size, (int)sizeof(Scorer));
473
+ }
474
+ #endif
475
+ self->destroy = &scorer_destroy_i;
476
+ self->similarity = similarity;
477
+ return self;
400
478
  }
401
479
 
402
480
  bool scorer_less_than(void *p1, void *p2)
403
481
  {
404
- Scorer *s1 = (Scorer *)p1;
405
- Scorer *s2 = (Scorer *)p2;
406
- return s1->score(s1) < s2->score(s2);
482
+ Scorer *s1 = (Scorer *)p1;
483
+ Scorer *s2 = (Scorer *)p2;
484
+ return s1->score(s1) < s2->score(s2);
407
485
  }
408
486
 
409
- bool scorer_doc_less_than(void *p1, void *p2)
487
+ bool scorer_doc_less_than(const Scorer *s1, const Scorer *s2)
410
488
  {
411
- return ((Scorer *)p1)->doc < ((Scorer *)p2)->doc;
489
+ return s1->doc < s2->doc;
412
490
  }
413
491
 
414
492
  int scorer_doc_cmp(const void *p1, const void *p2)
415
493
  {
416
- return (*(Scorer **)p1)->doc - (*(Scorer **)p2)->doc;
494
+ return (*(Scorer **)p1)->doc - (*(Scorer **)p2)->doc;
495
+ }
496
+
497
+ /***************************************************************************
498
+ *
499
+ * Highlighter
500
+ *
501
+ ***************************************************************************/
502
+
503
+ /* ** MatchRange ** */
504
+ static int match_range_cmp(const void *p1, const void *p2)
505
+ {
506
+ int diff = ((MatchRange *)p1)->start - ((MatchRange *)p2)->start;
507
+ if (diff != 0) {
508
+ return diff;
509
+ }
510
+ else {
511
+ return ((MatchRange *)p2)->end - ((MatchRange *)p1)->end;
512
+ }
513
+ }
514
+
515
+
516
+
517
+ /* ** MatchVector ** */
518
+ MatchVector *matchv_new()
519
+ {
520
+ MatchVector *matchv = ALLOC(MatchVector);
521
+
522
+ matchv->size = 0;
523
+ matchv->capa = MATCH_VECTOR_INIT_CAPA;
524
+ matchv->matches = ALLOC_N(MatchRange, MATCH_VECTOR_INIT_CAPA);
525
+
526
+ return matchv;
527
+ }
528
+
529
+ MatchVector *matchv_add(MatchVector *self, int start, int end)
530
+ {
531
+ if (self->size >= self->capa) {
532
+ self->capa <<= 1;
533
+ REALLOC_N(self->matches, MatchRange, self->capa);
534
+ }
535
+ self->matches[self->size].start = start;
536
+ self->matches[self->size].end = end;
537
+ self->matches[self->size++].score = 1.0;
538
+ return self;
539
+ }
540
+
541
+ MatchVector *matchv_sort(MatchVector *self)
542
+ {
543
+ qsort(self->matches, self->size, sizeof(MatchRange), &match_range_cmp);
544
+ return self;
545
+ }
546
+
547
+ MatchVector *matchv_compact(MatchVector *self)
548
+ {
549
+ int left, right;
550
+ matchv_sort(self);
551
+ for (right = left = 0; right < self->size; right++) {
552
+ /* Note the end + 1. This compacts the ranges 3:5 and 6:8 into 3:8 */
553
+ if (self->matches[right].start > self->matches[left].end + 1) {
554
+ left++;
555
+ self->matches[left].start = self->matches[right].start;
556
+ self->matches[left].end = self->matches[right].end;
557
+ self->matches[left].score = self->matches[right].score;
558
+ }
559
+ else if (self->matches[right].end > self->matches[left].end) {
560
+ self->matches[left].end = self->matches[right].end;
561
+ }
562
+ else {
563
+ self->matches[left].score += self->matches[right].score;
564
+ }
565
+ }
566
+ self->size = left + 1;
567
+ return self;
568
+ }
569
+
570
+ MatchVector *matchv_compact_with_breaks(MatchVector *self)
571
+ {
572
+ int left, right;
573
+ matchv_sort(self);
574
+ for (right = left = 0; right < self->size; right++) {
575
+ /* Note: no end + 1. Unlike above won't compact ranges 3:5 and 6:8 */
576
+ if (self->matches[right].start > self->matches[left].end) {
577
+ left++;
578
+ self->matches[left].start = self->matches[right].start;
579
+ self->matches[left].end = self->matches[right].end;
580
+ self->matches[left].score = self->matches[right].score;
581
+ }
582
+ else if (self->matches[right].end > self->matches[left].end) {
583
+ self->matches[left].end = self->matches[right].end;
584
+ self->matches[left].score += self->matches[right].score;
585
+ }
586
+ else if (right > left) {
587
+ self->matches[left].score += self->matches[right].score;
588
+ }
589
+ }
590
+ self->size = left + 1;
591
+ return self;
592
+ }
593
+
594
+
595
+ static MatchVector *matchv_set_offsets(MatchVector *mv, Offset *offsets)
596
+ {
597
+ int i;
598
+ for (i = 0; i < mv->size; i++) {
599
+ mv->matches[i].start_offset = offsets[mv->matches[i].start].start;
600
+ mv->matches[i].end_offset = offsets[mv->matches[i].end].end;
601
+ }
602
+ return mv;
603
+ }
604
+
605
+ void matchv_destroy(MatchVector *self)
606
+ {
607
+ free(self->matches);
608
+ free(self);
417
609
  }
418
610
 
419
611
  /***************************************************************************
@@ -422,211 +614,541 @@ int scorer_doc_cmp(const void *p1, const void *p2)
422
614
  *
423
615
  ***************************************************************************/
424
616
 
425
- static int s_doc_freq(Searcher *self, Term *term)
617
+ MatchVector *searcher_get_match_vector(Searcher *self,
618
+ Query *query,
619
+ const int doc_num,
620
+ const char *field)
621
+ {
622
+ MatchVector *mv = matchv_new();
623
+ Query *rewritten_query = self->rewrite(self, query);
624
+ TermVector *tv = self->get_term_vector(self, doc_num, field);
625
+ if (tv && tv->term_cnt > 0 && tv->terms[0].positions != NULL) {
626
+ mv = rewritten_query->get_matchv_i(rewritten_query, mv, tv);
627
+ tv_destroy(tv);
628
+ }
629
+ q_deref(rewritten_query);
630
+ return mv;
631
+ }
632
+
633
+ typedef struct Excerpt
426
634
  {
427
- return self->ir->doc_freq(self->ir, term);
635
+ int start;
636
+ int end;
637
+ int start_pos;
638
+ int end_pos;
639
+ int start_offset;
640
+ int end_offset;
641
+ double score;
642
+ } Excerpt;
643
+
644
+ /*
645
+ static int excerpt_cmp(const void *p1, const void *p2)
646
+ {
647
+ double score1 = (*((Excerpt **)p1))->score;
648
+ double score2 = (*((Excerpt **)p2))->score;
649
+ if (score1 > score2) return 1;
650
+ if (score1 < score2) return -1;
651
+ return 0;
428
652
  }
653
+ */
429
654
 
430
- static int *s_doc_freqs(Searcher *self, Term **terms, int tcnt)
655
+ static int excerpt_start_cmp(const void *p1, const void *p2)
431
656
  {
432
- int i;
433
- int *freqs = ALLOC_N(int, tcnt);
657
+ return (*((Excerpt **)p1))->start - (*((Excerpt **)p2))->start;
658
+ }
434
659
 
435
- for (i = 0; i < tcnt; i++) {
436
- freqs[i] = self->ir->doc_freq(self->ir, terms[i]);
437
- }
438
- return freqs;
660
+ static int excerpt_lt(Excerpt *e1, Excerpt *e2)
661
+ {
662
+ return e1->score > e2->score; /* want the highest score at top */
439
663
  }
440
664
 
441
- static int *ss_doc_freqs(Searcher *self, Term **terms, int tcnt)
665
+ static Excerpt *excerpt_new(int start, int end, double score)
442
666
  {
443
- int i;
444
- int *freqs = ALLOC_N(int, tcnt);
667
+ Excerpt *excerpt = ALLOC_AND_ZERO(Excerpt);
668
+ excerpt->start = start;
669
+ excerpt->end = end;
670
+ excerpt->score = score;
671
+ return excerpt;
672
+ }
445
673
 
446
- for (i = 0; i < tcnt; i++) {
447
- freqs[i] = self->doc_freq(self, terms[i]);
448
- }
674
+ static Excerpt *excerpt_recalc_score(Excerpt *e, MatchVector *mv)
675
+ {
676
+ int i;
677
+ double score = 0.0;
678
+ for (i = e->start; i <= e->end; i++) {
679
+ score += mv->matches[i].score;
680
+ }
681
+ e->score = score;
682
+ return e;
683
+ }
449
684
 
450
- return freqs;
685
+ /* expand an excerpt to its largest possible size */
686
+ static Excerpt *excerpt_expand(Excerpt *e, const int len, TermVector *tv)
687
+ {
688
+ Offset *offsets = tv->offsets;
689
+ int offset_cnt = tv->offset_cnt;
690
+ bool did_expansion = true;
691
+ int i;
692
+ /* fill in skipped offsets */
693
+ for (i = 1; i < offset_cnt; i++) {
694
+ if (offsets[i].start == 0) {
695
+ offsets[i].start = offsets[i-1].start;
696
+ }
697
+ if (offsets[i].end == 0) {
698
+ offsets[i].end = offsets[i-1].end;
699
+ }
700
+ }
701
+
702
+ while (did_expansion) {
703
+ did_expansion = false;
704
+ if (e->start_pos > 0
705
+ && (e->end_offset - offsets[e->start_pos - 1].start) < len) {
706
+ e->start_pos--;
707
+ e->start_offset = offsets[e->start_pos].start;
708
+ did_expansion = true;
709
+ }
710
+ if (e->end_pos < (offset_cnt - 1)
711
+ && (offsets[e->end_pos + 1].end - e->start_offset) < len) {
712
+ e->end_pos++;
713
+ e->end_offset = offsets[e->end_pos].end;
714
+ did_expansion = true;
715
+ }
716
+ }
717
+ return e;
718
+ }
719
+
720
+ static char *excerpt_get_str(Excerpt *e, MatchVector *mv,
721
+ LazyDocField *lazy_df,
722
+ const char *pre_tag,
723
+ const char *post_tag,
724
+ const char *ellipsis)
725
+ {
726
+ int i, len;
727
+ int last_offset = e->start_offset;
728
+ const int num_matches = e->end - e->start + 1;
729
+ const int pre_tag_len = (int)strlen(pre_tag);
730
+ const int post_tag_len = (int)strlen(post_tag);
731
+ const int ellipsis_len = (int)strlen(ellipsis);
732
+ char *excerpt_str = ALLOC_N(char,
733
+ 10 + e->end_offset - e->start_offset
734
+ + (num_matches * (pre_tag_len + post_tag_len))
735
+ + (2 * ellipsis_len));
736
+ char *e_ptr = excerpt_str;
737
+ if (e->start_offset > 0) {
738
+ memcpy(e_ptr, ellipsis, ellipsis_len);
739
+ e_ptr += ellipsis_len;
740
+ }
741
+ for (i = e->start; i <= e->end; i++) {
742
+ MatchRange *mr = mv->matches + i;
743
+ len = mr->start_offset - last_offset;
744
+ lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
745
+ e_ptr += len;
746
+ memcpy(e_ptr, pre_tag, pre_tag_len);
747
+ e_ptr += pre_tag_len;
748
+ len = mr->end_offset - mr->start_offset;
749
+ lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
750
+ e_ptr += len;
751
+ memcpy(e_ptr, post_tag, post_tag_len);
752
+ e_ptr += post_tag_len;
753
+ last_offset = mr->end_offset;
754
+ }
755
+ len = e->end_offset - last_offset;
756
+ lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
757
+ e_ptr += len;
758
+ if (e->end_offset < lazy_df->len) {
759
+ memcpy(e_ptr, ellipsis, ellipsis_len);
760
+ e_ptr += ellipsis_len;
761
+ }
762
+ *e_ptr = '\0';
763
+ return excerpt_str;
764
+ }
765
+
766
+ char **searcher_highlight(Searcher *self,
767
+ Query *query,
768
+ const int doc_num,
769
+ const char *field,
770
+ const int excerpt_len,
771
+ const int num_excerpts,
772
+ const char *pre_tag,
773
+ const char *post_tag,
774
+ const char *ellipsis)
775
+ {
776
+ char **excerpt_strs = NULL;
777
+ TermVector *tv = self->get_term_vector(self, doc_num, field);
778
+ LazyDoc *lazy_doc = self->get_lazy_doc(self, doc_num);
779
+ LazyDocField *lazy_df = NULL;
780
+ if (lazy_doc) {
781
+ lazy_df = h_get(lazy_doc->field_dict, field);
782
+ }
783
+ if (tv && lazy_df && tv->term_cnt > 0 && tv->terms[0].positions != NULL
784
+ && tv->offsets != NULL) {
785
+ MatchVector *mv;
786
+ query = self->rewrite(self, query);
787
+ mv = query->get_matchv_i(query, matchv_new(), tv);
788
+ if (mv->size > 0) {
789
+ Excerpt **excerpts = ALLOC_AND_ZERO_N(Excerpt *, num_excerpts);
790
+ int e_start, e_end, i, j;
791
+ MatchRange *matches = mv->matches;
792
+ double running_score = 0.0;
793
+ Offset *offsets = tv->offsets;
794
+ PriorityQueue *excerpt_pq;
795
+
796
+ matchv_compact_with_breaks(mv);
797
+ matchv_set_offsets(mv, offsets);
798
+ excerpt_pq = pq_new(mv->size, (lt_ft)&excerpt_lt, &free);
799
+ /* add all possible excerpts to the priority queue */
800
+
801
+ for (e_start = 0, e_end = 1; e_start < mv->size; e_start++) {
802
+ const int start_offset = matches[e_start].start_offset;
803
+ if (e_start >= e_end) {
804
+ e_end = e_start + 1;
805
+ }
806
+ running_score += matches[e_start].score;
807
+ while (e_end < mv->size && (matches[e_end].end_offset
808
+ <= start_offset + excerpt_len)) {
809
+ running_score += matches[e_end].score;
810
+ e_end++;
811
+ }
812
+ pq_push(excerpt_pq,
813
+ excerpt_new(e_start, e_end - 1, running_score));
814
+ /* - 0.1 so that earlier matches take priority */
815
+ running_score -= matches[e_start].score;
816
+ }
817
+
818
+ for (i = 0; i < num_excerpts && excerpt_pq->size > 0; i++) {
819
+ excerpts[i] = pq_pop(excerpt_pq);
820
+ if (i < num_excerpts - 1) {
821
+ /* set match ranges already included to 0 */
822
+ Excerpt *e = excerpts[i];
823
+ for (j = e->start; j <= e->end; j++) {
824
+ matches[j].score = 0.0;
825
+ }
826
+ e = NULL;
827
+ while (e != (Excerpt *)pq_top(excerpt_pq)) {
828
+ e = pq_top(excerpt_pq);
829
+ excerpt_recalc_score(e, mv);
830
+ pq_down(excerpt_pq);
831
+ }
832
+ }
833
+ }
834
+
835
+ qsort(excerpts, i, sizeof(Excerpt *), &excerpt_start_cmp);
836
+ for (j = 0; j < i; j++) {
837
+ Excerpt *e = excerpts[j];
838
+ e->start_pos = matches[e->start].start;
839
+ e->end_pos = matches[e->end].end;
840
+ e->start_offset = offsets[e->start_pos].start;
841
+ e->end_offset = offsets[e->end_pos].end;
842
+ }
843
+
844
+ if (i < num_excerpts) {
845
+ const int diff = num_excerpts - i;
846
+ memmove(excerpts + (diff), excerpts,
847
+ i * sizeof(Excerpt *));
848
+ for (j = 0; j < diff; j++) {
849
+ /* these new excerpts will grow into one long excerpt at
850
+ * the start */
851
+ excerpts[j] = ALLOC_AND_ZERO(Excerpt);
852
+ excerpts[j]->end = -1;
853
+ }
854
+ }
855
+
856
+ excerpt_strs = ary_new_type_capa(char *, num_excerpts);
857
+ /* merge excerpts where possible */
858
+ for (i = 0; i < num_excerpts;) {
859
+ Excerpt *ei = excerpts[i];
860
+ int merged = 1; /* 1 means a single excerpt, ie no merges */
861
+ for (j = i + 1; j < num_excerpts; j++) {
862
+ Excerpt *ej = excerpts[j];
863
+ if ((ej->end_offset - ei->start_offset)
864
+ < (j - i + 1) * excerpt_len) {
865
+ ei->end = ej->end;
866
+ ei->end_pos = ej->end_pos;
867
+ ei->end_offset = ej->end_offset;
868
+ merged = j - i + 1;
869
+ }
870
+ }
871
+ excerpt_expand(ei, merged * excerpt_len, tv);
872
+ ary_push(excerpt_strs,
873
+ excerpt_get_str(ei, mv, lazy_df,
874
+ pre_tag, post_tag, ellipsis));
875
+ i += merged;
876
+ }
877
+ for (i = 0; i < num_excerpts; i++) {
878
+ free(excerpts[i]);
879
+ }
880
+ free(excerpts);
881
+ pq_destroy(excerpt_pq);
882
+ matchv_destroy(mv);
883
+ }
884
+ q_deref(query);
885
+ }
886
+ if (tv) tv_destroy(tv);
887
+ if (lazy_doc) lazy_doc_close(lazy_doc);
888
+ return excerpt_strs;
451
889
  }
452
890
 
891
+ static Weight *sea_create_weight(Searcher *self, Query *query)
892
+ {
893
+ return q_weight(query, self);
894
+ }
453
895
 
454
- static Document *s_get_doc(Searcher *self, int doc_num)
896
+ static void sea_check_args(int num_docs, int first_doc)
455
897
  {
456
- return self->ir->get_doc(self->ir, doc_num);
898
+ if (num_docs <= 0) {
899
+ RAISE(ARG_ERROR, ":num_docs was set to %d but should be greater "
900
+ "than 0 : %d <= 0", num_docs, num_docs);
901
+ }
902
+
903
+ if (first_doc < 0) {
904
+ RAISE(ARG_ERROR, ":first_doc was set to %d but should be greater "
905
+ "than or equal to 0 : %d < 0", first_doc, first_doc);
906
+ }
457
907
  }
458
908
 
459
- static int s_max_doc(Searcher *self)
909
+ static Similarity *sea_get_similarity(Searcher *self)
460
910
  {
461
- return self->ir->max_doc(self->ir);
911
+ return self->similarity;
462
912
  }
463
913
 
464
- static Weight *s_create_weight(Searcher *self, Query *query)
914
+ /***************************************************************************
915
+ *
916
+ * IndexSearcher
917
+ *
918
+ ***************************************************************************/
919
+
920
+ #define ISEA(searcher) ((IndexSearcher *)(searcher))
921
+
922
+ int isea_doc_freq(Searcher *self, const char *field, const char *term)
465
923
  {
466
- return q_weight(query, self);
924
+ return ir_doc_freq(ISEA(self)->ir, field, term);
467
925
  }
468
926
 
469
- static TopDocs *s_search(Searcher *self, Query *query, int first_doc,
470
- int num_docs, Filter *filter, Sort *sort)
927
+ static Document *isea_get_doc(Searcher *self, int doc_num)
471
928
  {
472
- int max_size = first_doc + num_docs;
473
- int i;
474
- Weight *weight;
475
- Scorer *scorer;
476
- Hit **score_docs = NULL;
477
- Hit hit;
478
- int total_hits = 0;
479
- float score;
480
- BitVector *bits = (filter ? filter->get_bv(filter, self->ir) : NULL);
481
- Hit *(*hq_pop)(PriorityQueue *pq);
482
- void (*hq_insert)(PriorityQueue *pq, Hit *hit);
483
- void (*hq_destroy)(PriorityQueue *self);
484
- PriorityQueue *hq;
929
+ IndexReader *ir = ISEA(self)->ir;
930
+ return ir->get_doc(ir, doc_num);
931
+ }
485
932
 
933
+ static LazyDoc *isea_get_lazy_doc(Searcher *self, int doc_num)
934
+ {
935
+ IndexReader *ir = ISEA(self)->ir;
936
+ return ir->get_lazy_doc(ir, doc_num);
937
+ }
486
938
 
487
- if (num_docs <= 0)
488
- RAISE(ARG_ERROR, NUM_DOCS_ARG_ERROR_MSG);
939
+ static int isea_max_doc(Searcher *self)
940
+ {
941
+ IndexReader *ir = ISEA(self)->ir;
942
+ return ir->max_doc(ir);
943
+ }
489
944
 
490
- if (first_doc < 0)
491
- RAISE(ARG_ERROR, FIRST_DOC_ARG_ERROR_MSG);
945
+ #define IS_FILTERED(bits, filter_func, scorer, searcher) \
946
+ ((bits && !bv_get(bits, scorer->doc))\
947
+ || (filter_func \
948
+ && !filter_func(scorer->doc, scorer->score(scorer), searcher)))
492
949
 
493
- weight = q_weight(query, self);
494
- scorer = weight->scorer(weight, self->ir);
495
- if (!scorer) {
496
- if (bits) bv_destroy(bits);
497
- weight->destroy(weight);
498
- return td_create(0, 0, NULL);
499
- }
950
+ static TopDocs *isea_search_w(Searcher *self,
951
+ Weight *weight,
952
+ int first_doc,
953
+ int num_docs,
954
+ Filter *filter,
955
+ Sort *sort,
956
+ filter_ft filter_func,
957
+ bool load_fields)
958
+ {
959
+ int max_size = first_doc + num_docs;
960
+ int i;
961
+ Scorer *scorer;
962
+ Hit **score_docs = NULL;
963
+ Hit hit;
964
+ int total_hits = 0;
965
+ float score, max_score = 0.0;
966
+ BitVector *bits = (filter
967
+ ? filt_get_bv(filter, ISEA(self)->ir)
968
+ : NULL);
969
+ Hit *(*hq_pop)(PriorityQueue *pq);
970
+ void (*hq_insert)(PriorityQueue *pq, Hit *hit);
971
+ void (*hq_destroy)(PriorityQueue *self);
972
+ PriorityQueue *hq;
500
973
 
501
- if (sort) {
502
- hq = fshq_pq_create(max_size, sort, self->ir);
503
- hq_pop = &fshq_pq_pop;
504
- hq_insert = &fshq_pq_insert;
505
- hq_destroy = &fshq_pq_destroy;
506
- } else {
507
- hq = pq_create(max_size, &hit_less_than);
508
- hq_pop = &hit_pq_pop;
509
- hq_insert = &hit_pq_insert;
510
- hq_destroy = &pq_destroy;
511
- }
974
+ sea_check_args(num_docs, first_doc);
512
975
 
513
- while (scorer->next(scorer)) {
514
- if (bits && !bv_get(bits, scorer->doc)) continue;
515
- total_hits++;
516
- score = scorer->score(scorer);
517
- hit.doc = scorer->doc; hit.score = score;
518
- hq_insert(hq, &hit);
519
- }
520
- scorer->destroy(scorer);
521
- weight->destroy(weight);
976
+ scorer = weight->scorer(weight, ISEA(self)->ir);
977
+ if (!scorer) {
978
+ return td_new(0, 0, NULL, 0.0);
979
+ }
522
980
 
523
- if (hq->count > first_doc) {
524
- if ((hq->count - first_doc) < num_docs) {
525
- num_docs = hq->count - first_doc;
981
+ if (sort) {
982
+ hq = fshq_pq_new(max_size, sort, ISEA(self)->ir);
983
+ hq_insert = &fshq_pq_insert;
984
+ hq_destroy = &fshq_pq_destroy;
985
+ if (load_fields) {
986
+ hq_pop = &fshq_pq_pop_fd;
987
+ }
988
+ else {
989
+ hq_pop = &fshq_pq_pop;
990
+ }
991
+ }
992
+ else {
993
+ hq = pq_new(max_size, (lt_ft)&hit_less_than, &free);
994
+ hq_pop = &hit_pq_pop;
995
+ hq_insert = &hit_pq_insert;
996
+ hq_destroy = &pq_destroy;
526
997
  }
527
- score_docs = ALLOC_N(Hit *, num_docs);
528
- for (i = num_docs - 1; i >= 0; i--) {
529
- score_docs[i] = hq_pop(hq);
530
- //hit = score_docs[i] = pq_pop(hq);
531
- //printf("hit = %d-->%f\n", hit->doc, hit->score);
998
+
999
+ while (scorer->next(scorer)) {
1000
+ if (IS_FILTERED(bits, filter_func, scorer, self)) {
1001
+ continue;
1002
+ }
1003
+ total_hits++;
1004
+ score = scorer->score(scorer);
1005
+ if (score > max_score) max_score = score;
1006
+ hit.doc = scorer->doc; hit.score = score;
1007
+ hq_insert(hq, &hit);
532
1008
  }
533
- } else {
534
- num_docs = 0;
535
- }
536
- pq_clear(hq);
537
- hq_destroy(hq);
1009
+ scorer->destroy(scorer);
538
1010
 
539
- if (bits) bv_destroy(bits);
540
- return td_create(total_hits, num_docs, score_docs);
1011
+ if (hq->size > first_doc) {
1012
+ if ((hq->size - first_doc) < num_docs) {
1013
+ num_docs = hq->size - first_doc;
1014
+ }
1015
+ score_docs = ALLOC_N(Hit *, num_docs);
1016
+ for (i = num_docs - 1; i >= 0; i--) {
1017
+ score_docs[i] = hq_pop(hq);
1018
+ /*
1019
+ hit = score_docs[i] = pq_pop(hq);
1020
+ printf("hit = %d-->%f\n", hit->doc, hit->score);
1021
+ */
1022
+ }
1023
+ }
1024
+ else {
1025
+ num_docs = 0;
1026
+ }
1027
+ pq_clear(hq);
1028
+ hq_destroy(hq);
1029
+
1030
+ return td_new(total_hits, num_docs, score_docs, max_score);
1031
+ }
1032
+
1033
+ static TopDocs *isea_search(Searcher *self,
1034
+ Query *query,
1035
+ int first_doc,
1036
+ int num_docs,
1037
+ Filter *filter,
1038
+ Sort *sort,
1039
+ filter_ft filter_func,
1040
+ bool load_fields)
1041
+ {
1042
+ TopDocs *td;
1043
+ Weight *weight = q_weight(query, self);
1044
+ td = isea_search_w(self, weight, first_doc, num_docs, filter,
1045
+ sort, filter_func, load_fields);
1046
+ weight->destroy(weight);
1047
+ return td;
541
1048
  }
542
1049
 
543
- static void s_search_each_w(Searcher *self, Weight *weight, Filter *filter,
544
- void (*fn)(Searcher *, int, float, void *), void *arg)
1050
+ static void isea_search_each_w(Searcher *self, Weight *weight, Filter *filter,
1051
+ filter_ft filter_func,
1052
+ void (*fn)(Searcher *, int, float, void *),
1053
+ void *arg)
545
1054
  {
546
- Scorer *scorer;
547
- BitVector *bits = (filter ? filter->get_bv(filter, self->ir) : NULL);
1055
+ Scorer *scorer;
1056
+ BitVector *bits = (filter
1057
+ ? filt_get_bv(filter, ISEA(self)->ir)
1058
+ : NULL);
548
1059
 
549
- scorer = weight->scorer(weight, self->ir);
550
- if (!scorer) {
551
- if (bits) bv_destroy(bits);
552
- return;
553
- }
1060
+ scorer = weight->scorer(weight, ISEA(self)->ir);
1061
+ if (!scorer) {
1062
+ return;
1063
+ }
554
1064
 
555
- while (scorer->next(scorer)) {
556
- if (bits && !bv_get(bits, scorer->doc)) continue;
557
- fn(self, scorer->doc, scorer->score(scorer), arg);
558
- }
559
- scorer->destroy(scorer);
1065
+ while (scorer->next(scorer)) {
1066
+ if (IS_FILTERED(bits, filter_func, scorer, self)) {
1067
+ continue;
1068
+ }
1069
+ fn(self, scorer->doc, scorer->score(scorer), arg);
1070
+ }
1071
+ scorer->destroy(scorer);
560
1072
  }
561
1073
 
562
- static void s_search_each(Searcher *self, Query *query, Filter *filter,
563
- void (*fn)(Searcher *, int, float, void *), void *arg)
1074
+ static void isea_search_each(Searcher *self, Query *query, Filter *filter,
1075
+ filter_ft filter_func,
1076
+ void (*fn)(Searcher *, int, float, void *),
1077
+ void *arg)
564
1078
  {
565
- Weight *weight = q_weight(query, self);
566
- s_search_each_w(self, weight, filter, fn, arg);
567
- weight->destroy(weight);
1079
+ Weight *weight = q_weight(query, self);
1080
+ isea_search_each_w(self, weight, filter, filter_func, fn, arg);
1081
+ weight->destroy(weight);
568
1082
  }
569
1083
 
570
- static Query *s_rewrite(Searcher *self, Query *original)
1084
+ static Query *isea_rewrite(Searcher *self, Query *original)
571
1085
  {
572
- int q_is_destroyed = false;
573
- Query *query = original;
574
- Query *rewritten_query = query->rewrite(query, self->ir);
575
- while (q_is_destroyed || (query != rewritten_query)) {
576
- query = rewritten_query;
577
- rewritten_query = query->rewrite(query, self->ir);
578
- q_is_destroyed = (query->ref_cnt <= 1);
579
- q_deref(query); /* destroy intermediate queries */
580
- }
581
- return query;
1086
+ int q_is_destroyed = false;
1087
+ Query *query = original;
1088
+ Query *rewritten_query = query->rewrite(query, ISEA(self)->ir);
1089
+ while (q_is_destroyed || (query != rewritten_query)) {
1090
+ query = rewritten_query;
1091
+ rewritten_query = query->rewrite(query, ISEA(self)->ir);
1092
+ q_is_destroyed = (query->ref_cnt <= 1);
1093
+ q_deref(query); /* destroy intermediate queries */
1094
+ }
1095
+ return query;
582
1096
  }
583
1097
 
584
- static Explanation *s_explain(Searcher *self, Query *query, int doc_num)
1098
+ static Explanation *isea_explain(Searcher *self, Query *query, int doc_num)
585
1099
  {
586
- Weight *weight = q_weight(query, self);
587
- Explanation *e = weight->explain(weight, self->ir, doc_num);
588
- weight->destroy(weight);
589
- return e;
1100
+ Weight *weight = q_weight(query, self);
1101
+ Explanation *e = weight->explain(weight, ISEA(self)->ir, doc_num);
1102
+ weight->destroy(weight);
1103
+ return e;
590
1104
  }
591
1105
 
592
- static Explanation *s_explain_w(Searcher *self, Weight *w, int doc_num)
1106
+ static Explanation *isea_explain_w(Searcher *self, Weight *w, int doc_num)
593
1107
  {
594
- return w->explain(w, self->ir, doc_num);
1108
+ return w->explain(w, ISEA(self)->ir, doc_num);
595
1109
  }
596
1110
 
597
- static Similarity *s_get_similarity(Searcher *self)
1111
+ static TermVector *isea_get_term_vector(Searcher *self,
1112
+ const int doc_num,
1113
+ const char *field)
598
1114
  {
599
- return self->similarity;
1115
+ IndexReader *ir = ISEA(self)->ir;
1116
+ return ir->term_vector(ir, doc_num, field);
600
1117
  }
601
1118
 
602
- static void s_close(Searcher *self)
1119
+ static void isea_close(Searcher *self)
603
1120
  {
604
- if (self->ir && self->close_ir) {
605
- ir_close(self->ir);
606
- }
607
- free(self);
1121
+ if (ISEA(self)->ir && ISEA(self)->close_ir) {
1122
+ ir_close(ISEA(self)->ir);
1123
+ }
1124
+ free(self);
608
1125
  }
609
1126
 
610
- Searcher *sea_create(IndexReader *ir)
1127
+ Searcher *isea_new(IndexReader *ir)
611
1128
  {
612
- Searcher *self = ALLOC(Searcher);
613
- self->ir = ir;
614
- self->close_ir = true;
615
- self->similarity = sim_create_default();
616
- self->doc_freq = &s_doc_freq;
617
- self->doc_freqs = &s_doc_freqs;
618
- self->get_doc = &s_get_doc;
619
- self->max_doc = &s_max_doc;
620
- self->create_weight = &s_create_weight;
621
- self->search = &s_search;
622
- self->search_each = &s_search_each;
623
- self->search_each_w = &s_search_each_w;
624
- self->rewrite = &s_rewrite;
625
- self->explain = &s_explain;
626
- self->explain_w = &s_explain_w;
627
- self->get_similarity = &s_get_similarity;
628
- self->close = &s_close;
629
- return self;
1129
+ Searcher *self = (Searcher *)ecalloc(sizeof(IndexSearcher));
1130
+
1131
+ ISEA(self)->ir = ir;
1132
+ ISEA(self)->close_ir = true;
1133
+
1134
+ self->similarity = sim_create_default();
1135
+ self->doc_freq = &isea_doc_freq;
1136
+ self->get_doc = &isea_get_doc;
1137
+ self->get_lazy_doc = &isea_get_lazy_doc;
1138
+ self->max_doc = &isea_max_doc;
1139
+ self->create_weight = &sea_create_weight;
1140
+ self->search = &isea_search;
1141
+ self->search_w = &isea_search_w;
1142
+ self->search_each = &isea_search_each;
1143
+ self->search_each_w = &isea_search_each_w;
1144
+ self->rewrite = &isea_rewrite;
1145
+ self->explain = &isea_explain;
1146
+ self->explain_w = &isea_explain_w;
1147
+ self->get_term_vector = &isea_get_term_vector;
1148
+ self->get_similarity = &sea_get_similarity;
1149
+ self->close = &isea_close;
1150
+
1151
+ return self;
630
1152
  }
631
1153
 
632
1154
  /***************************************************************************
@@ -635,109 +1157,144 @@ Searcher *sea_create(IndexReader *ir)
635
1157
  *
636
1158
  ***************************************************************************/
637
1159
 
638
- typedef struct CachedDFSearcher {
639
- HshTable *df_map;
640
- int max_doc;
1160
+ #define CDFSEA(searcher) ((CachedDFSearcher *)(searcher))
1161
+ typedef struct CachedDFSearcher
1162
+ {
1163
+ Searcher super;
1164
+ HashTable *df_map;
1165
+ int max_doc;
641
1166
  } CachedDFSearcher;
642
1167
 
643
- static int cdfsea_doc_freq(Searcher *self, Term *term)
1168
+ static int cdfsea_doc_freq(Searcher *self, const char *field, const char *text)
644
1169
  {
645
- CachedDFSearcher *cdfsea = (CachedDFSearcher *)self->data;
646
- return (int)h_get(cdfsea->df_map, term);
1170
+ Term term;
1171
+ int *df;
1172
+ term.field = (char *)field;
1173
+ term.text = (char *)text;
1174
+ df = (int *)h_get(CDFSEA(self)->df_map, &term);
1175
+ return df ? *df : 0;
647
1176
  }
648
1177
 
649
1178
  static Document *cdfsea_get_doc(Searcher *self, int doc_num)
650
1179
  {
651
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
652
- return NULL;
1180
+ (void)self; (void)doc_num;
1181
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1182
+ return NULL;
653
1183
  }
654
1184
 
655
1185
  static int cdfsea_max_doc(Searcher *self)
656
1186
  {
657
- return ((CachedDFSearcher *)self->data)->max_doc;
1187
+ (void)self;
1188
+ return CDFSEA(self)->max_doc;
658
1189
  }
659
1190
 
660
1191
  static Weight *cdfsea_create_weight(Searcher *self, Query *query)
661
1192
  {
662
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
663
- return NULL;
1193
+ (void)self; (void)query;
1194
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1195
+ return NULL;
1196
+ }
1197
+
1198
+ static TopDocs *cdfsea_search_w(Searcher *self, Weight *w, int fd, int nd,
1199
+ Filter *f, Sort *s, filter_ft ff, bool load)
1200
+ {
1201
+ (void)self; (void)w; (void)fd; (void)nd;
1202
+ (void)f; (void)s; (void)ff, (void)load;
1203
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1204
+ return NULL;
664
1205
  }
665
1206
 
666
- static TopDocs *cdfsea_search(Searcher *self, Query *query, int first_doc,
667
- int num_docs, Filter *filter, Sort *sort)
1207
+ static TopDocs *cdfsea_search(Searcher *self, Query *q, int fd, int nd,
1208
+ Filter *f, Sort *s, filter_ft ff, bool load)
668
1209
  {
669
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
670
- return NULL;
1210
+ (void)self; (void)q; (void)fd; (void)nd;
1211
+ (void)f; (void)s; (void)ff, (void)load;
1212
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1213
+ return NULL;
671
1214
  }
672
1215
 
673
1216
  static void cdfsea_search_each(Searcher *self, Query *query, Filter *filter,
674
- void (*fn)(Searcher *, int, float, void *), void *arg)
1217
+ filter_ft ff,
1218
+ void (*fn)(Searcher *, int, float, void *),
1219
+ void *arg)
675
1220
  {
676
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1221
+ (void)self; (void)query; (void)filter; (void)ff; (void)fn; (void)arg;
1222
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
677
1223
  }
678
1224
 
679
1225
  static void cdfsea_search_each_w(Searcher *self, Weight *w, Filter *filter,
680
- void (*fn)(Searcher *, int, float, void *), void *arg)
1226
+ filter_ft ff,
1227
+ void (*fn)(Searcher *, int, float, void *),
1228
+ void *arg)
681
1229
  {
682
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1230
+ (void)self; (void)w; (void)filter; (void)ff; (void)fn; (void)arg;
1231
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
683
1232
  }
684
1233
 
685
1234
  static Query *cdfsea_rewrite(Searcher *self, Query *original)
686
1235
  {
687
- original->ref_cnt++;
688
- return original;
1236
+ (void)self;
1237
+ original->ref_cnt++;
1238
+ return original;
689
1239
  }
690
1240
 
691
1241
  static Explanation *cdfsea_explain(Searcher *self, Query *query, int doc_num)
692
1242
  {
693
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
694
- return NULL;
1243
+ (void)self; (void)query; (void)doc_num;
1244
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1245
+ return NULL;
695
1246
  }
696
1247
 
697
1248
  static Explanation *cdfsea_explain_w(Searcher *self, Weight *w, int doc_num)
698
1249
  {
699
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
700
- return NULL;
1250
+ (void)self; (void)w; (void)doc_num;
1251
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1252
+ return NULL;
1253
+ }
1254
+
1255
+ static TermVector *cdfsea_get_term_vector(Searcher *self, const int doc_num,
1256
+ const char *field)
1257
+ {
1258
+ (void)self; (void)doc_num; (void)field;
1259
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1260
+ return NULL;
701
1261
  }
702
1262
 
703
1263
  static Similarity *cdfsea_get_similarity(Searcher *self)
704
1264
  {
705
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
706
- return NULL;
1265
+ (void)self;
1266
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1267
+ return NULL;
707
1268
  }
708
1269
 
709
1270
  static void cdfsea_close(Searcher *self)
710
1271
  {
711
- CachedDFSearcher *cdfsea = (CachedDFSearcher *)self->data;
712
- h_destroy(cdfsea->df_map);
713
- free(cdfsea);
714
- free(self);
1272
+ h_destroy(CDFSEA(self)->df_map);
1273
+ free(self);
715
1274
  }
716
1275
 
717
- Searcher *cdfsea_create(HshTable *df_map, int max_doc)
1276
+ static Searcher *cdfsea_new(HashTable *df_map, int max_doc)
718
1277
  {
719
- Searcher *self = ALLOC(Searcher);
1278
+ Searcher *self = (Searcher *)ecalloc(sizeof(CachedDFSearcher));
720
1279
 
721
- CachedDFSearcher *cdfsea = ALLOC(CachedDFSearcher);
1280
+ CDFSEA(self)->df_map = df_map;
1281
+ CDFSEA(self)->max_doc = max_doc;
722
1282
 
723
- cdfsea->df_map = df_map;
724
- cdfsea->max_doc = max_doc;
725
- self->data = cdfsea;
726
-
727
- self->doc_freq = &cdfsea_doc_freq;
728
- self->doc_freqs = &ss_doc_freqs;
729
- self->get_doc = &cdfsea_get_doc;
730
- self->max_doc = &cdfsea_max_doc;
731
- self->create_weight = &cdfsea_create_weight;
732
- self->search = &cdfsea_search;
733
- self->search_each = &cdfsea_search_each;
734
- self->search_each_w = &cdfsea_search_each_w;
735
- self->rewrite = &cdfsea_rewrite;
736
- self->explain = &cdfsea_explain;
737
- self->explain_w = &cdfsea_explain_w;
738
- self->get_similarity = &cdfsea_get_similarity;
739
- self->close = &cdfsea_close;
740
- return self;
1283
+ self->doc_freq = &cdfsea_doc_freq;
1284
+ self->get_doc = &cdfsea_get_doc;
1285
+ self->max_doc = &cdfsea_max_doc;
1286
+ self->create_weight = &cdfsea_create_weight;
1287
+ self->search = &cdfsea_search;
1288
+ self->search_w = &cdfsea_search_w;
1289
+ self->search_each = &cdfsea_search_each;
1290
+ self->search_each_w = &cdfsea_search_each_w;
1291
+ self->rewrite = &cdfsea_rewrite;
1292
+ self->explain = &cdfsea_explain;
1293
+ self->explain_w = &cdfsea_explain_w;
1294
+ self->get_term_vector = &cdfsea_get_term_vector;
1295
+ self->get_similarity = &cdfsea_get_similarity;
1296
+ self->close = &cdfsea_close;
1297
+ return self;
741
1298
  }
742
1299
 
743
1300
  /***************************************************************************
@@ -746,301 +1303,367 @@ Searcher *cdfsea_create(HshTable *df_map, int max_doc)
746
1303
  *
747
1304
  ***************************************************************************/
748
1305
 
1306
+ #define MSEA(searcher) ((MultiSearcher *)(searcher))
749
1307
  static inline int msea_get_searcher_index(Searcher *self, int n)
750
1308
  {
751
- MultiSearcher *msea = (MultiSearcher *)self->data;
752
- int lo = 0; /* search starts array */
753
- int hi = msea->s_cnt - 1; /* for 1st element < n, return its index */
754
- int mid, mid_val;
1309
+ MultiSearcher *msea = MSEA(self);
1310
+ int lo = 0; /* search starts array */
1311
+ int hi = msea->s_cnt - 1; /* for 1st element < n, return its index */
1312
+ int mid, mid_val;
755
1313
 
756
- while (hi >= lo) {
757
- mid = (lo + hi) >> 1;
758
- mid_val = msea->starts[mid];
759
- if (n < mid_val) {
760
- hi = mid - 1;
761
- } else if (n > mid_val) {
762
- lo = mid + 1;
763
- } else { /* found a match */
764
- while (((mid+1) < msea->s_cnt) && (msea->starts[mid+1] == mid_val)) {
765
- mid++; /* scan to last match */
766
- }
767
- return mid;
1314
+ while (hi >= lo) {
1315
+ mid = (lo + hi) >> 1;
1316
+ mid_val = msea->starts[mid];
1317
+ if (n < mid_val) {
1318
+ hi = mid - 1;
1319
+ }
1320
+ else if (n > mid_val) {
1321
+ lo = mid + 1;
1322
+ }
1323
+ else { /* found a match */
1324
+ while (((mid+1) < msea->s_cnt)
1325
+ && (msea->starts[mid+1] == mid_val)) {
1326
+ mid++; /* scan to last match */
1327
+ }
1328
+ return mid;
1329
+ }
768
1330
  }
769
- }
770
- return hi;
1331
+ return hi;
771
1332
  }
772
1333
 
773
- static int msea_doc_freq(Searcher *self, Term *term)
1334
+ static int msea_doc_freq(Searcher *self, const char *field, const char *term)
774
1335
  {
775
- int i;
776
- int doc_freq = 0;
777
- Searcher *s;
778
- MultiSearcher *msea = (MultiSearcher *)self->data;
779
- for (i = 0; i < msea->s_cnt; i++) {
780
- s = msea->searchers[i];
781
- doc_freq += s->doc_freq(s, term);
782
- }
1336
+ int i;
1337
+ int doc_freq = 0;
1338
+ MultiSearcher *msea = MSEA(self);
1339
+ for (i = 0; i < msea->s_cnt; i++) {
1340
+ Searcher *s = msea->searchers[i];
1341
+ doc_freq += s->doc_freq(s, field, term);
1342
+ }
783
1343
 
784
- return doc_freq;
1344
+ return doc_freq;
785
1345
  }
786
1346
 
787
1347
  static Document *msea_get_doc(Searcher *self, int doc_num)
788
1348
  {
789
- MultiSearcher *msea = (MultiSearcher *)self->data;
790
- int i = msea_get_searcher_index(self, doc_num);
791
- Searcher *s = msea->searchers[i];
792
- return s->get_doc(s, doc_num - msea->starts[i]);
1349
+ MultiSearcher *msea = MSEA(self);
1350
+ int i = msea_get_searcher_index(self, doc_num);
1351
+ Searcher *s = msea->searchers[i];
1352
+ return s->get_doc(s, doc_num - msea->starts[i]);
1353
+ }
1354
+
1355
+ static LazyDoc *msea_get_lazy_doc(Searcher *self, int doc_num)
1356
+ {
1357
+ MultiSearcher *msea = MSEA(self);
1358
+ int i = msea_get_searcher_index(self, doc_num);
1359
+ Searcher *s = msea->searchers[i];
1360
+ return s->get_lazy_doc(s, doc_num - msea->starts[i]);
793
1361
  }
794
1362
 
795
1363
  static int msea_max_doc(Searcher *self)
796
1364
  {
797
- return ((MultiSearcher *)self->data)->max_doc;
1365
+ return MSEA(self)->max_doc;
1366
+ }
1367
+
1368
+ static int *msea_get_doc_freqs(Searcher *self, HashSet *terms)
1369
+ {
1370
+ int i;
1371
+ const int num_terms = terms->size;
1372
+ int *doc_freqs = ALLOC_N(int, num_terms);
1373
+ for (i = 0; i < num_terms; i++) {
1374
+ Term *t = (Term *)terms->elems[i];
1375
+ doc_freqs[i] = msea_doc_freq(self, t->field, t->text);
1376
+ }
1377
+ return doc_freqs;
798
1378
  }
799
1379
 
800
1380
  static Weight *msea_create_weight(Searcher *self, Query *query)
801
1381
  {
802
- int i, *dfs;
803
- Searcher *cdfsea;
804
- Weight *w;
805
- HshTable *df_map = h_new((hash_ft)&term_hash, (eq_ft)&term_eq,
806
- (free_ft)NULL, (free_ft)NULL);
807
- Query *rq = self->rewrite(self, query);
808
- HashSet *terms = term_set_create();
809
- rq->extract_terms(rq, terms);
810
- dfs = self->doc_freqs(self, (Term **)terms->elems, terms->size);
1382
+ int i, *doc_freqs;
1383
+ Searcher *cdfsea;
1384
+ Weight *w;
1385
+ HashTable *df_map = h_new((hash_ft)&term_hash, (eq_ft)&term_eq,
1386
+ (free_ft)NULL, free);
1387
+ Query *rewritten_query = self->rewrite(self, query);
1388
+ HashSet *terms = term_set_new();
811
1389
 
812
- for (i = 0; i < terms->size; i++) {
813
- h_set(df_map, terms->elems[i], (void *)dfs[i]);
814
- }
815
- /* don't destroy the individual terms, only the HashSet */
816
- hs_destroy(terms);
817
- free(dfs);
1390
+ rewritten_query->extract_terms(rewritten_query, terms);
1391
+ doc_freqs = msea_get_doc_freqs(self, terms);
818
1392
 
819
- cdfsea = cdfsea_create(df_map, ((MultiSearcher *)self->data)->max_doc);
1393
+ for (i = 0; i < terms->size; i++) {
1394
+ h_set(df_map, terms->elems[i], imalloc(doc_freqs[i]));
1395
+ }
1396
+ hs_destroy(terms);
1397
+ free(doc_freqs);
820
1398
 
821
- w = q_weight(rq, cdfsea);
822
- q_deref(rq);
823
- cdfsea->close(cdfsea);
1399
+ cdfsea = cdfsea_new(df_map, MSEA(self)->max_doc);
824
1400
 
825
- return w;
1401
+ w = q_weight(rewritten_query, cdfsea);
1402
+ q_deref(rewritten_query);
1403
+ cdfsea->close(cdfsea);
1404
+
1405
+ return w;
826
1406
  }
827
1407
 
828
1408
  struct MultiSearchEachArg {
829
- int start;
830
- void *arg;
831
- void (*fn)(Searcher *, int, float, void *);
1409
+ int start;
1410
+ void *arg;
1411
+ void (*fn)(Searcher *, int, float, void *);
832
1412
  };
833
1413
 
834
1414
  void msea_search_each_i(Searcher *self, int doc_num, float score, void *arg)
835
1415
  {
836
- struct MultiSearchEachArg *mse_arg = (struct MultiSearchEachArg *)arg;
1416
+ struct MultiSearchEachArg *mse_arg = (struct MultiSearchEachArg *)arg;
837
1417
 
838
- mse_arg->fn(self, doc_num + mse_arg->start, score, mse_arg->arg);
1418
+ mse_arg->fn(self, doc_num + mse_arg->start, score, mse_arg->arg);
839
1419
  }
840
1420
 
841
1421
  static void msea_search_each_w(Searcher *self, Weight *w, Filter *filter,
842
- void (*fn)(Searcher *, int, float, void *), void *arg)
1422
+ filter_ft filter_func,
1423
+ void (*fn)(Searcher *, int, float, void *),
1424
+ void *arg)
843
1425
  {
844
- int i;
845
- struct MultiSearchEachArg mse_arg;
846
- MultiSearcher *msea = (MultiSearcher *)self->data;
847
- Searcher *s;
1426
+ int i;
1427
+ struct MultiSearchEachArg mse_arg;
1428
+ MultiSearcher *msea = MSEA(self);
1429
+ Searcher *s;
848
1430
 
849
- mse_arg.fn = fn;
850
- mse_arg.arg = arg;
851
- for (i = 0; i < msea->s_cnt; i++) {
852
- s = msea->searchers[i];
853
- mse_arg.start = msea->starts[i];
854
- s->search_each_w(s, w, filter, &msea_search_each_i, &mse_arg);
855
- }
1431
+ mse_arg.fn = fn;
1432
+ mse_arg.arg = arg;
1433
+ for (i = 0; i < msea->s_cnt; i++) {
1434
+ s = msea->searchers[i];
1435
+ mse_arg.start = msea->starts[i];
1436
+ s->search_each_w(s, w, filter, filter_func,
1437
+ &msea_search_each_i, &mse_arg);
1438
+ }
856
1439
  }
857
1440
 
858
1441
  static void msea_search_each(Searcher *self, Query *query, Filter *filter,
859
- void (*fn)(Searcher *, int, float, void *), void *arg)
1442
+ filter_ft filter_func,
1443
+ void (*fn)(Searcher *, int, float, void *), void *arg)
860
1444
  {
861
- Weight *w = q_weight(query, self);
862
- msea_search_each_w(self, w, filter, fn, arg);
863
- w->destroy(w);
1445
+ Weight *w = q_weight(query, self);
1446
+ msea_search_each_w(self, w, filter, filter_func, fn, arg);
1447
+ w->destroy(w);
864
1448
  }
865
1449
 
866
1450
  struct MultiSearchArg {
867
- int total_hits, max_size;
868
- PriorityQueue *hq;
869
- void (*hq_insert)(PriorityQueue *pq, Hit *hit);
1451
+ int total_hits, max_size;
1452
+ PriorityQueue *hq;
1453
+ void (*hq_insert)(PriorityQueue *pq, Hit *hit);
870
1454
  };
871
1455
 
872
1456
  void msea_search_i(Searcher *self, int doc_num, float score, void *arg)
873
1457
  {
874
- struct MultiSearchArg *ms_arg = (struct MultiSearchArg *)arg;
875
- Hit hit;
876
-
877
- ms_arg->total_hits++;
878
- hit.doc = doc_num;
879
- hit.score = score;
880
- ms_arg->hq_insert(ms_arg->hq, &hit);
881
- }
882
-
883
- static TopDocs *msea_search(Searcher *self, Query *query, int first_doc,
884
- int num_docs, Filter *filter, Sort *sort)
885
- {
886
- int max_size = first_doc + num_docs;
887
- int i;
888
- Weight *weight;
889
- Hit **score_docs = NULL;
890
- BitVector *bits = (filter ? filter->get_bv(filter, self->ir) : NULL);
891
- Hit *(*hq_pop)(PriorityQueue *pq);
892
- void (*hq_insert)(PriorityQueue *pq, Hit *hit);
893
- void (*hq_destroy)(PriorityQueue *self);
894
- PriorityQueue *hq;
895
- struct MultiSearchArg ms_arg;
896
-
897
-
898
- if (num_docs <= 0)
899
- RAISE(ARG_ERROR, NUM_DOCS_ARG_ERROR_MSG);
900
-
901
- if (first_doc < 0)
902
- RAISE(ARG_ERROR, FIRST_DOC_ARG_ERROR_MSG);
903
-
904
- weight = q_weight(query, self);
905
- if (sort) {
906
- hq = fshq_pq_create(max_size, sort, self->ir);
907
- hq_pop = &fshq_pq_pop;
908
- hq_insert = &fshq_pq_insert;
909
- hq_destroy = &fshq_pq_destroy;
910
- } else {
911
- hq = pq_create(max_size, &hit_less_than);
912
- hq_pop = &hit_pq_pop;
913
- hq_insert = &hit_pq_insert;
914
- hq_destroy = &pq_destroy;
915
- }
916
-
917
-
918
- ms_arg.hq = hq;
919
- ms_arg.total_hits = 0;
920
- ms_arg.max_size = max_size;
921
- ms_arg.hq_insert = hq_insert;
922
-
923
- msea_search_each_w(self, weight, filter, msea_search_i, &ms_arg);
1458
+ struct MultiSearchArg *ms_arg = (struct MultiSearchArg *)arg;
1459
+ Hit hit;
1460
+ (void)self;
1461
+
1462
+ ms_arg->total_hits++;
1463
+ hit.doc = doc_num;
1464
+ hit.score = score;
1465
+ ms_arg->hq_insert(ms_arg->hq, &hit);
1466
+ }
1467
+
1468
+ static TopDocs *msea_search_w(Searcher *self,
1469
+ Weight *weight,
1470
+ int first_doc,
1471
+ int num_docs,
1472
+ Filter *filter,
1473
+ Sort *sort,
1474
+ filter_ft filter_func,
1475
+ bool load_fields)
1476
+ {
1477
+ int max_size = first_doc + num_docs;
1478
+ int i;
1479
+ int total_hits = 0;
1480
+ Hit **score_docs = NULL;
1481
+ Hit *(*hq_pop)(PriorityQueue *pq);
1482
+ void (*hq_insert)(PriorityQueue *pq, Hit *hit);
1483
+ PriorityQueue *hq;
1484
+ float max_score = 0.0;
1485
+ (void)load_fields; /* does it automatically */
1486
+
1487
+ sea_check_args(num_docs, first_doc);
1488
+
1489
+ if (sort) {
1490
+ hq = pq_new(max_size, (lt_ft)fdshq_lt, &free);
1491
+ hq_insert = (void (*)(PriorityQueue *pq, Hit *hit))&pq_insert;
1492
+ hq_pop = (Hit *(*)(PriorityQueue *pq))&pq_pop;
1493
+ }
1494
+ else {
1495
+ hq = pq_new(max_size, (lt_ft)&hit_less_than, &free);
1496
+ hq_insert = &hit_pq_multi_insert;
1497
+ hq_pop = &hit_pq_pop;
1498
+ }
924
1499
 
925
- weight->destroy(weight);
1500
+ /*if (sort) printf("sort = %s\n", sort_to_s(sort)); */
1501
+ for (i = 0; i < MSEA(self)->s_cnt; i++) {
1502
+ Searcher *s = MSEA(self)->searchers[i];
1503
+ TopDocs *td = s->search_w(s, weight, 0, max_size,
1504
+ filter, sort, filter_func, true);
1505
+ /*if (sort) printf("sort = %s\n", sort_to_s(sort)); */
1506
+ if (td->size > 0) {
1507
+ /*printf("td->size = %d %d\n", td->size, num_docs); */
1508
+ int j;
1509
+ int start = MSEA(self)->starts[i];
1510
+ for (j = 0; j < td->size; j++) {
1511
+ Hit *hit = td->hits[j];
1512
+ hit->doc += start;
1513
+ /*
1514
+ printf("adding hit = %d:%f\n", hit->doc, hit->score);
1515
+ */
1516
+ hq_insert(hq, hit);
1517
+ }
1518
+ td->size = 0;
1519
+ if (td->max_score > max_score) max_score = td->max_score;
1520
+ }
1521
+ total_hits += td->total_hits;
1522
+ td_destroy(td);
1523
+ }
926
1524
 
927
- if (hq->count > first_doc) {
928
- if ((hq->count - first_doc) < num_docs) {
929
- num_docs = hq->count - first_doc;
1525
+ if (hq->size > first_doc) {
1526
+ if ((hq->size - first_doc) < num_docs) {
1527
+ num_docs = hq->size - first_doc;
1528
+ }
1529
+ score_docs = ALLOC_N(Hit *, num_docs);
1530
+ for (i = num_docs - 1; i >= 0; i--) {
1531
+ score_docs[i] = hq_pop(hq);
1532
+ /*
1533
+ Hit *hit = score_docs[i] = hq_pop(hq);
1534
+ printf("popped hit = %d-->%f\n", hit->doc, hit->score);
1535
+ */
1536
+ }
930
1537
  }
931
- score_docs = ALLOC_N(Hit *, num_docs);
932
- for (i = num_docs - 1; i >= 0; i--) {
933
- score_docs[i] = hq_pop(hq);
934
- //hit = score_docs[i] = pq_pop(hq);
935
- //printf("hit = %d-->%f\n", hit->doc, hit->score);
1538
+ else {
1539
+ num_docs = 0;
936
1540
  }
937
- } else {
938
- num_docs = 0;
939
- }
940
- pq_clear(hq);
941
- hq_destroy(hq);
1541
+ pq_clear(hq);
1542
+ pq_destroy(hq);
1543
+
1544
+ return td_new(total_hits, num_docs, score_docs, max_score);
1545
+ }
942
1546
 
943
- if (bits) bv_destroy(bits);
944
- return td_create(ms_arg.total_hits, num_docs, score_docs);
1547
+ static TopDocs *msea_search(Searcher *self,
1548
+ Query *query,
1549
+ int first_doc,
1550
+ int num_docs,
1551
+ Filter *filter,
1552
+ Sort *sort,
1553
+ filter_ft filter_func,
1554
+ bool load_fields)
1555
+ {
1556
+ TopDocs *td;
1557
+ Weight *weight = q_weight(query, self);
1558
+ td = msea_search_w(self, weight, first_doc, num_docs, filter,
1559
+ sort, filter_func, load_fields);
1560
+ weight->destroy(weight);
1561
+ return td;
945
1562
  }
946
1563
 
947
1564
  static Query *msea_rewrite(Searcher *self, Query *original)
948
1565
  {
949
- int i;
950
- Searcher *s;
951
- MultiSearcher *msea = (MultiSearcher *)self->data;
952
- Query **queries = ALLOC_N(Query *, msea->s_cnt), *rewritten;
1566
+ int i;
1567
+ Searcher *s;
1568
+ MultiSearcher *msea = MSEA(self);
1569
+ Query **queries = ALLOC_N(Query *, msea->s_cnt), *rewritten;
953
1570
 
954
- for (i = 0; i < msea->s_cnt; i++) {
955
- s = msea->searchers[i];
956
- queries[i] = s->rewrite(s, original);
957
- }
958
- rewritten = q_combine(queries, msea->s_cnt);
1571
+ for (i = 0; i < msea->s_cnt; i++) {
1572
+ s = msea->searchers[i];
1573
+ queries[i] = s->rewrite(s, original);
1574
+ }
1575
+ rewritten = q_combine(queries, msea->s_cnt);
959
1576
 
960
- for (i = 0; i < msea->s_cnt; i++) {
961
- q_deref(queries[i]);
962
- }
963
- free(queries);
964
- return rewritten;
1577
+ for (i = 0; i < msea->s_cnt; i++) {
1578
+ q_deref(queries[i]);
1579
+ }
1580
+ free(queries);
1581
+ return rewritten;
965
1582
  }
966
1583
 
967
1584
  static Explanation *msea_explain(Searcher *self, Query *query, int doc_num)
968
1585
  {
969
- MultiSearcher *msea = (MultiSearcher *)self->data;
970
- int i = msea_get_searcher_index(self, doc_num);
971
- Weight *w = q_weight(query, self);
972
- Searcher *s = msea->searchers[i];
973
- Explanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
974
- w->destroy(w);
975
- return e;
1586
+ MultiSearcher *msea = MSEA(self);
1587
+ int i = msea_get_searcher_index(self, doc_num);
1588
+ Weight *w = q_weight(query, self);
1589
+ Searcher *s = msea->searchers[i];
1590
+ Explanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
1591
+ w->destroy(w);
1592
+ return e;
976
1593
  }
977
1594
 
978
1595
  static Explanation *msea_explain_w(Searcher *self, Weight *w, int doc_num)
979
1596
  {
980
- MultiSearcher *msea = (MultiSearcher *)self->data;
981
- int i = msea_get_searcher_index(self, doc_num);
982
- Searcher *s = msea->searchers[i];
983
- Explanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
984
- return e;
1597
+ MultiSearcher *msea = MSEA(self);
1598
+ int i = msea_get_searcher_index(self, doc_num);
1599
+ Searcher *s = msea->searchers[i];
1600
+ Explanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
1601
+ return e;
1602
+ }
1603
+
1604
+ static TermVector *msea_get_term_vector(Searcher *self, const int doc_num,
1605
+ const char *field)
1606
+ {
1607
+ MultiSearcher *msea = MSEA(self);
1608
+ int i = msea_get_searcher_index(self, doc_num);
1609
+ Searcher *s = msea->searchers[i];
1610
+ return s->get_term_vector(s, doc_num - msea->starts[i],
1611
+ field);
985
1612
  }
986
1613
 
987
1614
  static Similarity *msea_get_similarity(Searcher *self)
988
1615
  {
989
- return self->similarity;
1616
+ return self->similarity;
990
1617
  }
991
1618
 
992
1619
  static void msea_close(Searcher *self)
993
1620
  {
994
- int i;
995
- Searcher *s;
996
- MultiSearcher *msea = (MultiSearcher *)self->data;
997
- if (msea->close_subs) {
998
- for (i = 0; i < msea->s_cnt; i++) {
999
- s = msea->searchers[i];
1000
- s->close(s);
1621
+ int i;
1622
+ Searcher *s;
1623
+ MultiSearcher *msea = MSEA(self);
1624
+ if (msea->close_subs) {
1625
+ for (i = 0; i < msea->s_cnt; i++) {
1626
+ s = msea->searchers[i];
1627
+ s->close(s);
1628
+ }
1629
+ free(msea->searchers);
1001
1630
  }
1002
- free(msea->searchers);
1003
- }
1004
- free(msea->starts);
1005
- free(msea);
1006
- free(self);
1631
+ free(msea->starts);
1632
+ free(self);
1007
1633
  }
1008
1634
 
1009
- Searcher *msea_create(Searcher **searchers, int s_cnt, bool close_subs)
1635
+ Searcher *msea_new(Searcher **searchers, int s_cnt, bool close_subs)
1010
1636
  {
1011
- int i, max_doc = 0, *starts;
1012
- Searcher *self = ALLOC(Searcher);
1013
-
1014
- MultiSearcher *msea = ALLOC(MultiSearcher);
1015
-
1016
- starts = ALLOC_N(int, s_cnt + 1);
1017
- for (i = 0; i < s_cnt; i++) {
1637
+ int i, max_doc = 0;
1638
+ Searcher *self = (Searcher *)ecalloc(sizeof(MultiSearcher));
1639
+ int *starts = ALLOC_N(int, s_cnt + 1);
1640
+ for (i = 0; i < s_cnt; i++) {
1641
+ starts[i] = max_doc;
1642
+ max_doc += searchers[i]->max_doc(searchers[i]);
1643
+ }
1018
1644
  starts[i] = max_doc;
1019
- max_doc += searchers[i]->max_doc(searchers[i]);
1020
- }
1021
- starts[i] = max_doc;
1022
-
1023
- msea->s_cnt = s_cnt;
1024
- msea->searchers = searchers;
1025
- msea->starts = starts;
1026
- msea->max_doc = max_doc;
1027
- msea->close_subs = close_subs;
1028
- self->data = msea;
1029
-
1030
- self->ir = (IndexReader *)NULL;
1031
- self->similarity = sim_create_default();
1032
- self->doc_freq = &msea_doc_freq;
1033
- self->doc_freqs = &ss_doc_freqs;
1034
- self->get_doc = &msea_get_doc;
1035
- self->max_doc = &msea_max_doc;
1036
- self->create_weight = &msea_create_weight;
1037
- self->search = &msea_search;
1038
- self->search_each = &msea_search_each;
1039
- self->search_each_w = &msea_search_each_w;
1040
- self->rewrite = &msea_rewrite;
1041
- self->explain = &msea_explain;
1042
- self->explain_w = &msea_explain_w;
1043
- self->get_similarity = &msea_get_similarity;
1044
- self->close = &msea_close;
1045
- return self;
1645
+
1646
+ MSEA(self)->s_cnt = s_cnt;
1647
+ MSEA(self)->searchers = searchers;
1648
+ MSEA(self)->starts = starts;
1649
+ MSEA(self)->max_doc = max_doc;
1650
+ MSEA(self)->close_subs = close_subs;
1651
+
1652
+ self->similarity = sim_create_default();
1653
+ self->doc_freq = &msea_doc_freq;
1654
+ self->get_doc = &msea_get_doc;
1655
+ self->get_lazy_doc = &msea_get_lazy_doc;
1656
+ self->max_doc = &msea_max_doc;
1657
+ self->create_weight = &msea_create_weight;
1658
+ self->search = &msea_search;
1659
+ self->search_w = &msea_search_w;
1660
+ self->search_each = &msea_search_each;
1661
+ self->search_each_w = &msea_search_each_w;
1662
+ self->rewrite = &msea_rewrite;
1663
+ self->explain = &msea_explain;
1664
+ self->explain_w = &msea_explain_w;
1665
+ self->get_term_vector = &msea_get_term_vector;
1666
+ self->get_similarity = &msea_get_similarity;
1667
+ self->close = &msea_close;
1668
+ return self;
1046
1669
  }