isomorfeus-ferret 0.12.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (222) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +612 -0
  3. data/README.md +44 -0
  4. data/ext/isomorfeus_ferret_ext/benchmark.c +223 -0
  5. data/ext/isomorfeus_ferret_ext/benchmark.h +45 -0
  6. data/ext/isomorfeus_ferret_ext/benchmarks_all.h +25 -0
  7. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +123 -0
  8. data/ext/isomorfeus_ferret_ext/bm_hash.c +118 -0
  9. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +40 -0
  10. data/ext/isomorfeus_ferret_ext/bm_store.c +93 -0
  11. data/ext/isomorfeus_ferret_ext/email.rl +21 -0
  12. data/ext/isomorfeus_ferret_ext/extconf.rb +5 -0
  13. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -0
  14. data/ext/isomorfeus_ferret_ext/frb_analysis.c +2577 -0
  15. data/ext/isomorfeus_ferret_ext/frb_index.c +3457 -0
  16. data/ext/isomorfeus_ferret_ext/frb_lang.c +9 -0
  17. data/ext/isomorfeus_ferret_ext/frb_lang.h +17 -0
  18. data/ext/isomorfeus_ferret_ext/frb_qparser.c +629 -0
  19. data/ext/isomorfeus_ferret_ext/frb_search.c +4460 -0
  20. data/ext/isomorfeus_ferret_ext/frb_store.c +515 -0
  21. data/ext/isomorfeus_ferret_ext/frb_threading.h +30 -0
  22. data/ext/isomorfeus_ferret_ext/frb_utils.c +1127 -0
  23. data/ext/isomorfeus_ferret_ext/frt_analysis.c +1644 -0
  24. data/ext/isomorfeus_ferret_ext/frt_analysis.h +247 -0
  25. data/ext/isomorfeus_ferret_ext/frt_array.c +124 -0
  26. data/ext/isomorfeus_ferret_ext/frt_array.h +54 -0
  27. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +95 -0
  28. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +586 -0
  29. data/ext/isomorfeus_ferret_ext/frt_compound_io.c +374 -0
  30. data/ext/isomorfeus_ferret_ext/frt_config.h +44 -0
  31. data/ext/isomorfeus_ferret_ext/frt_document.c +134 -0
  32. data/ext/isomorfeus_ferret_ext/frt_document.h +52 -0
  33. data/ext/isomorfeus_ferret_ext/frt_except.c +95 -0
  34. data/ext/isomorfeus_ferret_ext/frt_except.h +188 -0
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +233 -0
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +42 -0
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +157 -0
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +502 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +427 -0
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +290 -0
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +518 -0
  42. data/ext/isomorfeus_ferret_ext/frt_hash.h +466 -0
  43. data/ext/isomorfeus_ferret_ext/frt_hashset.c +191 -0
  44. data/ext/isomorfeus_ferret_ext/frt_hashset.h +206 -0
  45. data/ext/isomorfeus_ferret_ext/frt_helper.c +62 -0
  46. data/ext/isomorfeus_ferret_ext/frt_helper.h +13 -0
  47. data/ext/isomorfeus_ferret_ext/frt_ind.c +353 -0
  48. data/ext/isomorfeus_ferret_ext/frt_ind.h +54 -0
  49. data/ext/isomorfeus_ferret_ext/frt_index.c +6377 -0
  50. data/ext/isomorfeus_ferret_ext/frt_index.h +880 -0
  51. data/ext/isomorfeus_ferret_ext/frt_lang.c +104 -0
  52. data/ext/isomorfeus_ferret_ext/frt_lang.h +44 -0
  53. data/ext/isomorfeus_ferret_ext/frt_mempool.c +87 -0
  54. data/ext/isomorfeus_ferret_ext/frt_mempool.h +33 -0
  55. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +349 -0
  56. data/ext/isomorfeus_ferret_ext/frt_multimapper.h +52 -0
  57. data/ext/isomorfeus_ferret_ext/frt_posh.c +1006 -0
  58. data/ext/isomorfeus_ferret_ext/frt_posh.h +973 -0
  59. data/ext/isomorfeus_ferret_ext/frt_priorityqueue.c +147 -0
  60. data/ext/isomorfeus_ferret_ext/frt_priorityqueue.h +147 -0
  61. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +1612 -0
  62. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +157 -0
  63. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +209 -0
  64. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +281 -0
  65. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +147 -0
  66. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +672 -0
  67. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +3084 -0
  68. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +1182 -0
  69. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +98 -0
  70. data/ext/isomorfeus_ferret_ext/frt_q_range.c +665 -0
  71. data/ext/isomorfeus_ferret_ext/frt_q_span.c +2386 -0
  72. data/ext/isomorfeus_ferret_ext/frt_q_term.c +311 -0
  73. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +166 -0
  74. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +460 -0
  75. data/ext/isomorfeus_ferret_ext/frt_scanner.c +899 -0
  76. data/ext/isomorfeus_ferret_ext/frt_scanner.h +28 -0
  77. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +6705 -0
  78. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +4419 -0
  79. data/ext/isomorfeus_ferret_ext/frt_search.c +1824 -0
  80. data/ext/isomorfeus_ferret_ext/frt_search.h +924 -0
  81. data/ext/isomorfeus_ferret_ext/frt_similarity.c +150 -0
  82. data/ext/isomorfeus_ferret_ext/frt_similarity.h +79 -0
  83. data/ext/isomorfeus_ferret_ext/frt_sort.c +796 -0
  84. data/ext/isomorfeus_ferret_ext/frt_stopwords.c +395 -0
  85. data/ext/isomorfeus_ferret_ext/frt_store.c +680 -0
  86. data/ext/isomorfeus_ferret_ext/frt_store.h +789 -0
  87. data/ext/isomorfeus_ferret_ext/frt_term_vectors.c +72 -0
  88. data/ext/isomorfeus_ferret_ext/frt_threading.h +23 -0
  89. data/ext/isomorfeus_ferret_ext/frt_win32.h +54 -0
  90. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +409 -0
  91. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +95 -0
  92. data/ext/isomorfeus_ferret_ext/libstemmer.c +93 -0
  93. data/ext/isomorfeus_ferret_ext/libstemmer.h +73 -0
  94. data/ext/isomorfeus_ferret_ext/q_parser.y +1366 -0
  95. data/ext/isomorfeus_ferret_ext/scanner.h +28 -0
  96. data/ext/isomorfeus_ferret_ext/scanner.in +43 -0
  97. data/ext/isomorfeus_ferret_ext/scanner.rl +84 -0
  98. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +200 -0
  99. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +85 -0
  100. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +324 -0
  101. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +7 -0
  102. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +610 -0
  103. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +6 -0
  104. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +1104 -0
  105. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +6 -0
  106. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +749 -0
  107. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +7 -0
  108. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +1233 -0
  109. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +6 -0
  110. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +490 -0
  111. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +6 -0
  112. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +1217 -0
  113. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +7 -0
  114. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +1052 -0
  115. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +6 -0
  116. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +283 -0
  117. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +6 -0
  118. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +735 -0
  119. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +6 -0
  120. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +1003 -0
  121. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +7 -0
  122. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +1079 -0
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +6 -0
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +293 -0
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +6 -0
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +984 -0
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +6 -0
  128. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +686 -0
  129. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +6 -0
  130. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.c +325 -0
  131. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.h +6 -0
  132. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.c +620 -0
  133. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.h +6 -0
  134. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.c +1111 -0
  135. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.h +6 -0
  136. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.c +754 -0
  137. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.h +6 -0
  138. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.c +1242 -0
  139. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.h +6 -0
  140. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.c +495 -0
  141. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.h +6 -0
  142. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.c +1220 -0
  143. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.h +6 -0
  144. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.c +1059 -0
  145. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.h +6 -0
  146. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.c +285 -0
  147. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.h +6 -0
  148. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.c +741 -0
  149. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.h +6 -0
  150. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.c +1009 -0
  151. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.h +6 -0
  152. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.c +990 -0
  153. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.h +6 -0
  154. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.c +680 -0
  155. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.h +6 -0
  156. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.c +1083 -0
  157. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.h +6 -0
  158. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.c +294 -0
  159. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.h +6 -0
  160. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.c +2191 -0
  161. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.h +6 -0
  162. data/ext/isomorfeus_ferret_ext/stem_api.c +66 -0
  163. data/ext/isomorfeus_ferret_ext/stem_api.h +26 -0
  164. data/ext/isomorfeus_ferret_ext/stem_header.h +57 -0
  165. data/ext/isomorfeus_ferret_ext/stem_modules.h +190 -0
  166. data/ext/isomorfeus_ferret_ext/stem_modules.txt +50 -0
  167. data/ext/isomorfeus_ferret_ext/stem_utilities.c +478 -0
  168. data/ext/isomorfeus_ferret_ext/test.c +850 -0
  169. data/ext/isomorfeus_ferret_ext/test.h +416 -0
  170. data/ext/isomorfeus_ferret_ext/test_1710.c +63 -0
  171. data/ext/isomorfeus_ferret_ext/test_analysis.c +1221 -0
  172. data/ext/isomorfeus_ferret_ext/test_array.c +272 -0
  173. data/ext/isomorfeus_ferret_ext/test_bitvector.c +600 -0
  174. data/ext/isomorfeus_ferret_ext/test_compound_io.c +170 -0
  175. data/ext/isomorfeus_ferret_ext/test_document.c +156 -0
  176. data/ext/isomorfeus_ferret_ext/test_except.c +244 -0
  177. data/ext/isomorfeus_ferret_ext/test_fields.c +522 -0
  178. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +185 -0
  179. data/ext/isomorfeus_ferret_ext/test_filter.c +331 -0
  180. data/ext/isomorfeus_ferret_ext/test_fs_store.c +25 -0
  181. data/ext/isomorfeus_ferret_ext/test_global.c +299 -0
  182. data/ext/isomorfeus_ferret_ext/test_hash.c +485 -0
  183. data/ext/isomorfeus_ferret_ext/test_hashset.c +288 -0
  184. data/ext/isomorfeus_ferret_ext/test_helper.c +47 -0
  185. data/ext/isomorfeus_ferret_ext/test_highlighter.c +548 -0
  186. data/ext/isomorfeus_ferret_ext/test_index.c +2323 -0
  187. data/ext/isomorfeus_ferret_ext/test_lang.c +74 -0
  188. data/ext/isomorfeus_ferret_ext/test_mempool.c +102 -0
  189. data/ext/isomorfeus_ferret_ext/test_multimapper.c +64 -0
  190. data/ext/isomorfeus_ferret_ext/test_priorityqueue.c +213 -0
  191. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +84 -0
  192. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +61 -0
  193. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +241 -0
  194. data/ext/isomorfeus_ferret_ext/test_q_parser.c +464 -0
  195. data/ext/isomorfeus_ferret_ext/test_q_span.c +575 -0
  196. data/ext/isomorfeus_ferret_ext/test_ram_store.c +77 -0
  197. data/ext/isomorfeus_ferret_ext/test_search.c +1874 -0
  198. data/ext/isomorfeus_ferret_ext/test_segments.c +167 -0
  199. data/ext/isomorfeus_ferret_ext/test_similarity.c +25 -0
  200. data/ext/isomorfeus_ferret_ext/test_sort.c +333 -0
  201. data/ext/isomorfeus_ferret_ext/test_store.c +591 -0
  202. data/ext/isomorfeus_ferret_ext/test_store.h +3 -0
  203. data/ext/isomorfeus_ferret_ext/test_term.c +351 -0
  204. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +373 -0
  205. data/ext/isomorfeus_ferret_ext/test_test.c +83 -0
  206. data/ext/isomorfeus_ferret_ext/test_threading.c +188 -0
  207. data/ext/isomorfeus_ferret_ext/testhelper.c +561 -0
  208. data/ext/isomorfeus_ferret_ext/testhelper.h +25 -0
  209. data/ext/isomorfeus_ferret_ext/tests_all.h +87 -0
  210. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +1854 -0
  211. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +1999 -0
  212. data/ext/isomorfeus_ferret_ext/url.rl +27 -0
  213. data/ext/isomorfeus_ferret_ext/word_list.h +15156 -0
  214. data/lib/isomorfeus/ferret/document.rb +132 -0
  215. data/lib/isomorfeus/ferret/field_symbol.rb +85 -0
  216. data/lib/isomorfeus/ferret/index/field_infos.rb +48 -0
  217. data/lib/isomorfeus/ferret/index/index.rb +970 -0
  218. data/lib/isomorfeus/ferret/monitor.rb +323 -0
  219. data/lib/isomorfeus/ferret/stdlib_patches.rb +151 -0
  220. data/lib/isomorfeus/ferret/version.rb +5 -0
  221. data/lib/isomorfeus-ferret.rb +8 -0
  222. metadata +307 -0
@@ -0,0 +1,1824 @@
1
+ #include <string.h>
2
+ #include <limits.h>
3
+ #include "frt_search.h"
4
+ #include "frt_array.h"
5
+
6
+ /***************************************************************************
7
+ *
8
+ * FrtExplanation - Used to give details for query scores
9
+ *
10
+ ***************************************************************************/
11
+
12
+ FrtExplanation *frt_expl_new(float value, const char *description, ...)
13
+ {
14
+ FrtExplanation *expl = FRT_ALLOC(FrtExplanation);
15
+
16
+ va_list args;
17
+ va_start(args, description);
18
+ expl->description = frt_vstrfmt(description, args);
19
+ va_end(args);
20
+
21
+ expl->value = value;
22
+ expl->details = frt_ary_new_type_capa(FrtExplanation *,
23
+ FRT_EXPLANATION_DETAILS_START_SIZE);
24
+ return expl;
25
+ }
26
+
27
+ void frt_expl_destroy(FrtExplanation *expl)
28
+ {
29
+ frt_ary_destroy((void **)expl->details, (frt_free_ft)frt_expl_destroy);
30
+ free(expl->description);
31
+ free(expl);
32
+ }
33
+
34
+ FrtExplanation *frt_expl_add_detail(FrtExplanation *expl, FrtExplanation *detail)
35
+ {
36
+ frt_ary_push(expl->details, detail);
37
+ return expl;
38
+ }
39
+
40
+ char *frt_expl_to_s_depth(FrtExplanation *expl, int depth)
41
+ {
42
+ int i;
43
+ char *buffer = FRT_ALLOC_N(char, depth * 2 + 1);
44
+ const int num_details = frt_ary_size(expl->details);
45
+
46
+ memset(buffer, ' ', sizeof(char) * depth * 2);
47
+ buffer[depth*2] = 0;
48
+
49
+ buffer = frt_estrcat(buffer, frt_strfmt("%f = %s\n",
50
+ expl->value, expl->description));
51
+ for (i = 0; i < num_details; i++) {
52
+ buffer = frt_estrcat(buffer, frt_expl_to_s_depth(expl->details[i], depth + 1));
53
+ }
54
+
55
+ return buffer;
56
+ }
57
+
58
+ char *frt_expl_to_html(FrtExplanation *expl)
59
+ {
60
+ int i;
61
+ char *buffer;
62
+ const int num_details = frt_ary_size(expl->details);
63
+
64
+ buffer = frt_strfmt("<ul>\n<li>%f = %s</li>\n", expl->value, expl->description);
65
+
66
+ for (i = 0; i < num_details; i++) {
67
+ frt_estrcat(buffer, frt_expl_to_html(expl->details[i]));
68
+ }
69
+
70
+ FRT_REALLOC_N(buffer, char, strlen(buffer) + 10);
71
+ return strcat(buffer, "</ul>\n");
72
+ }
73
+
74
+ /***************************************************************************
75
+ *
76
+ * Hit
77
+ *
78
+ ***************************************************************************/
79
+
80
+ static bool hit_lt(FrtHit *hit1, FrtHit *hit2)
81
+ {
82
+ if (hit1->score == hit2->score) {
83
+ return hit1->doc > hit2->doc;
84
+ } else {
85
+ return hit1->score < hit2->score;
86
+ }
87
+ }
88
+
89
+ static void hit_pq_down(FrtPriorityQueue *pq)
90
+ {
91
+ register int i = 1;
92
+ register int j = 2; /* i << 1; */
93
+ register int k = 3; /* j + 1; */
94
+ FrtHit **heap = (FrtHit **)pq->heap;
95
+ FrtHit *node = heap[i]; /* save top node */
96
+
97
+ if ((k <= pq->size) && hit_lt(heap[k], heap[j])) {
98
+ j = k;
99
+ }
100
+
101
+ while ((j <= pq->size) && hit_lt(heap[j], node)) {
102
+ heap[i] = heap[j]; /* shift up child */
103
+ i = j;
104
+ j = i << 1;
105
+ k = j + 1;
106
+ if ((k <= pq->size) && hit_lt(heap[k], heap[j])) {
107
+ j = k;
108
+ }
109
+ }
110
+ heap[i] = node;
111
+ }
112
+
113
+ static FrtHit *hit_pq_pop(FrtPriorityQueue *pq)
114
+ {
115
+ if (pq->size > 0) {
116
+ FrtHit **heap = (FrtHit **)pq->heap;
117
+ FrtHit *result = heap[1]; /* save first value */
118
+ heap[1] = heap[pq->size]; /* move last to first */
119
+ heap[pq->size] = NULL;
120
+ pq->size--;
121
+ hit_pq_down(pq); /* adjust heap */
122
+ return result;
123
+ }
124
+ else {
125
+ return NULL;
126
+ }
127
+ }
128
+
129
+ static void hit_pq_up(FrtPriorityQueue *pq)
130
+ {
131
+ FrtHit **heap = (FrtHit **)pq->heap;
132
+ FrtHit *node;
133
+ int i = pq->size;
134
+ int j = i >> 1;
135
+ node = heap[i];
136
+
137
+ while ((j > 0) && hit_lt(node, heap[j])) {
138
+ heap[i] = heap[j];
139
+ i = j;
140
+ j = j >> 1;
141
+ }
142
+ heap[i] = node;
143
+ }
144
+
145
+ static void hit_pq_insert(FrtPriorityQueue *pq, FrtHit *hit)
146
+ {
147
+ if (pq->size < pq->capa) {
148
+ FrtHit *new_hit = FRT_ALLOC(FrtHit);
149
+ memcpy(new_hit, hit, sizeof(FrtHit));
150
+ pq->size++;
151
+ if (pq->size >= pq->mem_capa) {
152
+ pq->mem_capa <<= 1;
153
+ FRT_REALLOC_N(pq->heap, void *, pq->mem_capa);
154
+ }
155
+ pq->heap[pq->size] = new_hit;
156
+ hit_pq_up(pq);
157
+ } else if (pq->size > 0 && hit_lt((FrtHit *)pq->heap[1], hit)) {
158
+ memcpy(pq->heap[1], hit, sizeof(FrtHit));
159
+ hit_pq_down(pq);
160
+ }
161
+ }
162
+
163
+ static void hit_pq_multi_insert(FrtPriorityQueue *pq, FrtHit *hit)
164
+ {
165
+ hit_pq_insert(pq, hit);
166
+ free(hit);
167
+ }
168
+
169
+ /***************************************************************************
170
+ *
171
+ * TopDocs
172
+ *
173
+ ***************************************************************************/
174
+
175
+ FrtTopDocs *frt_td_new(int total_hits, int size, FrtHit **hits, float max_score)
176
+ {
177
+ FrtTopDocs *td = FRT_ALLOC(FrtTopDocs);
178
+ td->total_hits = total_hits;
179
+ td->size = size;
180
+ td->hits = hits;
181
+ td->max_score = max_score;
182
+ return td;
183
+ }
184
+
185
+ void frt_td_destroy(FrtTopDocs *td)
186
+ {
187
+ int i;
188
+
189
+ for (i = 0; i < td->size; i++) {
190
+ free(td->hits[i]);
191
+ }
192
+ free(td->hits);
193
+ free(td);
194
+ }
195
+
196
+ char *frt_td_to_s(FrtTopDocs *td)
197
+ {
198
+ int i;
199
+ FrtHit *hit;
200
+ char *buffer = frt_strfmt("%d hits sorted by <score, doc_num>\n",
201
+ td->total_hits);
202
+ for (i = 0; i < td->size; i++) {
203
+ hit = td->hits[i];
204
+ frt_estrcat(buffer, frt_strfmt("\t%d:%f\n", hit->doc, hit->score));
205
+ }
206
+ return buffer;
207
+ }
208
+
209
+ /***************************************************************************
210
+ *
211
+ * Weight
212
+ *
213
+ ***************************************************************************/
214
+
215
+ FrtQuery *frt_w_get_query(FrtWeight *self)
216
+ {
217
+ return self->query;
218
+ }
219
+
220
+ float frt_w_get_value(FrtWeight *self)
221
+ {
222
+ return self->value;
223
+ }
224
+
225
+ float frt_w_sum_of_squared_weights(FrtWeight *self)
226
+ {
227
+ self->qweight = self->idf * self->query->boost;
228
+ return self->qweight * self->qweight; /* square it */
229
+ }
230
+
231
+ void frt_w_normalize(FrtWeight *self, float normalization_factor)
232
+ {
233
+ self->qnorm = normalization_factor;
234
+ self->qweight *= normalization_factor; /* normalize query weight */
235
+ self->value = self->qweight * self->idf;/* idf for document */
236
+ }
237
+
238
+ void frt_w_destroy(FrtWeight *self)
239
+ {
240
+ frt_q_deref(self->query);
241
+ free(self);
242
+ }
243
+
244
+ FrtWeight *frt_w_create(size_t size, FrtQuery *query)
245
+ {
246
+ FrtWeight *self = (FrtWeight *)frt_ecalloc(size);
247
+ #ifdef DEBUG
248
+ if (size < sizeof(FrtWeight)) {
249
+ FRT_RAISE(FRT_ARG_ERROR, "size of weight <%d> should be at least <%d>",
250
+ (int)size, (int)sizeof(FrtWeight));
251
+ }
252
+ #endif
253
+ FRT_REF(query);
254
+ self->query = query;
255
+ self->get_query = &frt_w_get_query;
256
+ self->get_value = &frt_w_get_value;
257
+ self->normalize = &frt_w_normalize;
258
+ self->destroy = &frt_w_destroy;
259
+ self->sum_of_squared_weights = &frt_w_sum_of_squared_weights;
260
+ return self;
261
+ }
262
+
263
+ /***************************************************************************
264
+ *
265
+ * Query
266
+ *
267
+ ***************************************************************************/
268
+
269
+ static const char *QUERY_NAMES[] = {
270
+ "TermQuery",
271
+ "MultiTermQuery",
272
+ "BooleanQuery",
273
+ "PhraseQuery",
274
+ "ConstantScoreQuery",
275
+ "FilteredQuery",
276
+ "MatchAllQuery",
277
+ "RangeQuery",
278
+ "WildCardQuery",
279
+ "FuzzyQuery",
280
+ "PrefixQuery",
281
+ "SpanTermQuery",
282
+ "SpanMultiTermQuery",
283
+ "SpanPrefixQuery",
284
+ "SpanFirstQuery",
285
+ "SpanOrQuery",
286
+ "SpanNotQuery",
287
+ "SpanNearQuery"
288
+ };
289
+
290
+ static const char *UNKNOWN_QUERY_NAME = "UnkownQuery";
291
+
292
+ const char *frt_q_get_query_name(FrtQueryType type) {
293
+ if (type >= FRT_NELEMS(QUERY_NAMES)) {
294
+ return UNKNOWN_QUERY_NAME;
295
+ }
296
+ else {
297
+ return QUERY_NAMES[type];
298
+ }
299
+ }
300
+
301
+ static FrtQuery *q_rewrite(FrtQuery *self, FrtIndexReader *ir)
302
+ {
303
+ (void)ir;
304
+ self->ref_cnt++;
305
+ return self;
306
+ }
307
+
308
+ static void q_extract_terms(FrtQuery *self, FrtHashSet *terms)
309
+ {
310
+ /* do nothing by default */
311
+ (void)self;
312
+ (void)terms;
313
+ }
314
+
315
+ FrtSimilarity *frt_q_get_similarity_i(FrtQuery *self, FrtSearcher *searcher)
316
+ {
317
+ (void)self;
318
+ return searcher->get_similarity(searcher);
319
+ }
320
+
321
+ void frt_q_destroy_i(FrtQuery *self)
322
+ {
323
+ free(self);
324
+ }
325
+
326
+ void frt_q_deref(FrtQuery *self)
327
+ {
328
+ if (--(self->ref_cnt) == 0) {
329
+ self->destroy_i(self);
330
+ }
331
+ }
332
+
333
+ FrtWeight *frt_q_create_weight_unsup(FrtQuery *self, FrtSearcher *searcher)
334
+ {
335
+ (void)self;
336
+ (void)searcher;
337
+ FRT_RAISE(FRT_UNSUPPORTED_ERROR,
338
+ "Create weight is unsupported for this type of query");
339
+ return NULL;
340
+ }
341
+
342
+ FrtWeight *frt_q_weight(FrtQuery *self, FrtSearcher *searcher)
343
+ {
344
+ FrtQuery *query = searcher->rewrite(searcher, self);
345
+ FrtWeight *weight = query->create_weight_i(query, searcher);
346
+ float sum = weight->sum_of_squared_weights(weight);
347
+ FrtSimilarity *sim = query->get_similarity(query, searcher);
348
+ float norm = frt_sim_query_norm(sim, sum);
349
+ frt_q_deref(query);
350
+
351
+ weight->normalize(weight, norm);
352
+ return self->weight = weight;
353
+ }
354
+
355
+ #define BQ(query) ((FrtBooleanQuery *)(query))
356
+ FrtQuery *frt_q_combine(FrtQuery **queries, int q_cnt)
357
+ {
358
+ int i;
359
+ FrtQuery *q, *ret_q;
360
+ FrtHashSet *uniques = frt_hs_new((frt_hash_ft)&frt_q_hash, (frt_eq_ft)&frt_q_eq, NULL);
361
+ for (i = 0; i < q_cnt; i++) {
362
+ q = queries[i];
363
+ if (q->type == BOOLEAN_QUERY) {
364
+ int j;
365
+ bool splittable = true;
366
+ if (BQ(q)->coord_disabled == false) {
367
+ splittable = false;
368
+ } else {
369
+ for (j = 0; j < BQ(q)->clause_cnt; j++) {
370
+ if (BQ(q)->clauses[j]->occur != FRT_BC_SHOULD) {
371
+ splittable = false;
372
+ break;
373
+ }
374
+ }
375
+ }
376
+ if (splittable) {
377
+ for (j = 0; j < BQ(q)->clause_cnt; j++) {
378
+ FrtQuery *sub_q = BQ(q)->clauses[j]->query;
379
+ frt_hs_add(uniques, sub_q);
380
+ }
381
+ } else {
382
+ frt_hs_add(uniques, q);
383
+ }
384
+ } else {
385
+ frt_hs_add(uniques, q);
386
+ }
387
+ }
388
+
389
+ if (uniques->size == 1) {
390
+ ret_q = (FrtQuery *)uniques->first->elem;
391
+ FRT_REF(ret_q);
392
+ } else {
393
+ FrtHashSetEntry *hse;
394
+ ret_q = frt_bq_new(true);
395
+ for (hse = uniques->first; hse; hse = hse->next) {
396
+ q = (FrtQuery *)hse->elem;
397
+ frt_bq_add_query(ret_q, q, FRT_BC_SHOULD);
398
+ }
399
+ }
400
+ frt_hs_destroy(uniques);
401
+
402
+ return ret_q;
403
+ }
404
+
405
+ unsigned long long frt_q_hash(FrtQuery *self)
406
+ {
407
+ return (self->hash(self) << 5) | self->type;
408
+ }
409
+
410
+ int frt_q_eq(FrtQuery *self, FrtQuery *o)
411
+ {
412
+ return (self == o)
413
+ || ((self->type == o->type)
414
+ && (self->boost == o->boost)
415
+ && self->eq(self, o));
416
+ }
417
+
418
+ static FrtMatchVector *q_get_matchv_i(FrtQuery *self, FrtMatchVector *mv, FrtTermVector *tv)
419
+ {
420
+ /* be default we don't add any matches */
421
+ (void)self; (void)tv;
422
+ return mv;
423
+ }
424
+
425
+ FrtQuery *frt_q_create(size_t size)
426
+ {
427
+ FrtQuery *self = (FrtQuery *)frt_ecalloc(size);
428
+ #ifdef DEBUG
429
+ if (size < sizeof(FrtQuery)) {
430
+ FRT_RAISE(FRT_ARG_ERROR, "Size of a query <%d> should never be smaller than "
431
+ "the size of a Query struct <%d>", (int)size, (int)sizeof(FrtQuery));
432
+ }
433
+ #endif
434
+ self->boost = 1.0f;
435
+ self->rewrite = &q_rewrite;
436
+ self->get_similarity = &frt_q_get_similarity_i;
437
+ self->extract_terms = &q_extract_terms;
438
+ self->get_matchv_i = &q_get_matchv_i;
439
+ self->weight = NULL;
440
+ self->ref_cnt = 1;
441
+ return self;
442
+ }
443
+
444
+ /***************************************************************************
445
+ *
446
+ * Scorer
447
+ *
448
+ ***************************************************************************/
449
+
450
+ void frt_scorer_destroy_i(FrtScorer *scorer)
451
+ {
452
+ free(scorer);
453
+ }
454
+
455
+ FrtScorer *frt_scorer_create(size_t size, FrtSimilarity *similarity)
456
+ {
457
+ FrtScorer *self = (FrtScorer *)frt_ecalloc(size);
458
+ #ifdef DEBUG
459
+ if (size < sizeof(FrtScorer)) {
460
+ FRT_RAISE(FRT_ARG_ERROR, "size of scorer <%d> should be at least <%d>",
461
+ (int)size, (int)sizeof(FrtScorer));
462
+ }
463
+ #endif
464
+ self->destroy = &frt_scorer_destroy_i;
465
+ self->similarity = similarity;
466
+ return self;
467
+ }
468
+
469
+ bool frt_scorer_doc_less_than(const FrtScorer *s1, const FrtScorer *s2)
470
+ {
471
+ return s1->doc < s2->doc;
472
+ }
473
+
474
+ int frt_scorer_doc_cmp(const void *p1, const void *p2)
475
+ {
476
+ return (*(FrtScorer **)p1)->doc - (*(FrtScorer **)p2)->doc;
477
+ }
478
+
479
+ /***************************************************************************
480
+ *
481
+ * Highlighter
482
+ *
483
+ ***************************************************************************/
484
+
485
+ /* ** MatchRange ** */
486
+ static int match_range_cmp(const void *p1, const void *p2)
487
+ {
488
+ int diff = ((FrtMatchRange *)p1)->start - ((FrtMatchRange *)p2)->start;
489
+ if (diff != 0) {
490
+ return diff;
491
+ }
492
+ else {
493
+ return ((FrtMatchRange *)p2)->end - ((FrtMatchRange *)p1)->end;
494
+ }
495
+ }
496
+
497
+
498
+
499
+ /* ** FrtMatchVector ** */
500
+ FrtMatchVector *frt_matchv_new()
501
+ {
502
+ FrtMatchVector *matchv = FRT_ALLOC(FrtMatchVector);
503
+
504
+ matchv->size = 0;
505
+ matchv->capa = FRT_MATCH_VECTOR_INIT_CAPA;
506
+ matchv->matches = FRT_ALLOC_N(FrtMatchRange, FRT_MATCH_VECTOR_INIT_CAPA);
507
+
508
+ return matchv;
509
+ }
510
+
511
+ FrtMatchVector *frt_matchv_add(FrtMatchVector *self, int start, int end)
512
+ {
513
+ if (self->size >= self->capa) {
514
+ self->capa <<= 1;
515
+ FRT_REALLOC_N(self->matches, FrtMatchRange, self->capa);
516
+ }
517
+ self->matches[self->size].start = start;
518
+ self->matches[self->size].end = end;
519
+ self->matches[self->size].score = 1.0;
520
+ self->size++;
521
+ return self;
522
+ }
523
+
524
+ FrtMatchVector *frt_matchv_sort(FrtMatchVector *self)
525
+ {
526
+ qsort(self->matches, self->size, sizeof(FrtMatchRange), &match_range_cmp);
527
+ return self;
528
+ }
529
+
530
+ FrtMatchVector *frt_matchv_compact(FrtMatchVector *self)
531
+ {
532
+ int left, right;
533
+ frt_matchv_sort(self);
534
+ for (right = left = 0; right < self->size; right++) {
535
+ /* Note the end + 1. This compacts a range 3:5 and 6:8 inleft 3:8 */
536
+ if (self->matches[right].start > self->matches[left].end + 1) {
537
+ left++;
538
+ self->matches[left].start = self->matches[right].start;
539
+ self->matches[left].end = self->matches[right].end;
540
+ self->matches[left].score = self->matches[right].score;
541
+ }
542
+ else if (self->matches[right].end > self->matches[left].end) {
543
+ self->matches[left].end = self->matches[right].end;
544
+ }
545
+ else {
546
+ self->matches[left].score += self->matches[right].score;
547
+ }
548
+ }
549
+ self->size = left + 1;
550
+ return self;
551
+ }
552
+
553
+ FrtMatchVector *frt_matchv_compact_with_breaks(FrtMatchVector *self)
554
+ {
555
+ int left, right;
556
+ frt_matchv_sort(self);
557
+ for (right = left = 0; right < self->size; right++) {
558
+ /* Note: no end + 1. Unlike above won't compact ranges 3:5 and 6:8 */
559
+ if (self->matches[right].start > self->matches[left].end) {
560
+ left++;
561
+ self->matches[left].start = self->matches[right].start;
562
+ self->matches[left].end = self->matches[right].end;
563
+ self->matches[left].score = self->matches[right].score;
564
+ }
565
+ else if (self->matches[right].end > self->matches[left].end) {
566
+ self->matches[left].end = self->matches[right].end;
567
+ self->matches[left].score += self->matches[right].score;
568
+ }
569
+ else if (right > left) {
570
+ self->matches[left].score += self->matches[right].score;
571
+ }
572
+ }
573
+ self->size = left + 1;
574
+ return self;
575
+ }
576
+
577
+
578
+ static FrtMatchVector *matchv_set_offsets(FrtMatchVector *mv, FrtOffset *offsets)
579
+ {
580
+ int i;
581
+ for (i = 0; i < mv->size; i++) {
582
+ mv->matches[i].start_offset = offsets[mv->matches[i].start].start;
583
+ mv->matches[i].end_offset = offsets[mv->matches[i].end].end;
584
+ }
585
+ return mv;
586
+ }
587
+
588
+ void frt_matchv_destroy(FrtMatchVector *self)
589
+ {
590
+ free(self->matches);
591
+ free(self);
592
+ }
593
+
594
+ /***************************************************************************
595
+ *
596
+ * Searcher
597
+ *
598
+ ***************************************************************************/
599
+
600
+ FrtMatchVector *frt_searcher_get_match_vector(FrtSearcher *self,
601
+ FrtQuery *query,
602
+ const int doc_num,
603
+ FrtSymbol field)
604
+ {
605
+ FrtMatchVector *mv = frt_matchv_new();
606
+ bool rewrite = query->get_matchv_i == q_get_matchv_i;
607
+ FrtTermVector *tv = self->get_term_vector(self, doc_num, field);
608
+ if (rewrite) {
609
+ query = self->rewrite(self, query);
610
+ }
611
+ if (tv && tv->term_cnt > 0 && tv->terms[0].positions != NULL) {
612
+ mv = query->get_matchv_i(query, mv, tv);
613
+ frt_tv_destroy(tv);
614
+ }
615
+ if (rewrite) {
616
+ frt_q_deref(query);
617
+ }
618
+ return mv;
619
+ }
620
+
621
/* A candidate excerpt used while building highlights. */
typedef struct Excerpt
{
    int start;          /* first index into the match vector's matches */
    int end;            /* last index into the match vector's matches
                         * (inclusive, see excerpt_recalc_score) */
    int start_pos;      /* first index into the term vector's offsets */
    int end_pos;        /* last index into the term vector's offsets */
    int start_offset;   /* taken from offsets[start_pos].start */
    int end_offset;     /* taken from offsets[end_pos].end */
    double score;       /* sum of the scores of the covered matches */
} Excerpt;
631
+
632
+ /*
633
+ static int excerpt_cmp(const void *p1, const void *p2)
634
+ {
635
+ double score1 = (*((Excerpt **)p1))->score;
636
+ double score2 = (*((Excerpt **)p2))->score;
637
+ if (score1 > score2) return 1;
638
+ if (score1 < score2) return -1;
639
+ return 0;
640
+ }
641
+ */
642
+
643
+ static int excerpt_start_cmp(const void *p1, const void *p2)
644
+ {
645
+ return (*((Excerpt **)p1))->start - (*((Excerpt **)p2))->start;
646
+ }
647
+
648
+ static int excerpt_lt(Excerpt *e1, Excerpt *e2)
649
+ {
650
+ return e1->score > e2->score; /* want the highest score at top */
651
+ }
652
+
653
+ static Excerpt *excerpt_new(int start, int end, double score)
654
+ {
655
+ Excerpt *excerpt = FRT_ALLOC_AND_ZERO(Excerpt);
656
+ excerpt->start = start;
657
+ excerpt->end = end;
658
+ excerpt->score = score;
659
+ return excerpt;
660
+ }
661
+
662
+ static Excerpt *excerpt_recalc_score(Excerpt *e, FrtMatchVector *mv)
663
+ {
664
+ int i;
665
+ double score = 0.0;
666
+ for (i = e->start; i <= e->end; i++) {
667
+ score += mv->matches[i].score;
668
+ }
669
+ e->score = score;
670
+ return e;
671
+ }
672
+
673
+ /* expand an excerpt to it's largest possible size */
674
+ static Excerpt *excerpt_expand(Excerpt *e, const int len, FrtTermVector *tv)
675
+ {
676
+ FrtOffset *offsets = tv->offsets;
677
+ int offset_cnt = tv->offset_cnt;
678
+ bool did_expansion = true;
679
+ int i;
680
+ /* fill in skipped offsets */
681
+ for (i = 1; i < offset_cnt; i++) {
682
+ if (offsets[i].start == 0) {
683
+ offsets[i].start = offsets[i-1].start;
684
+ }
685
+ if (offsets[i].end == 0) {
686
+ offsets[i].end = offsets[i-1].end;
687
+ }
688
+ }
689
+
690
+ while (did_expansion) {
691
+ did_expansion = false;
692
+ if (e->start_pos > 0
693
+ && (e->end_offset - offsets[e->start_pos - 1].start) < len) {
694
+ e->start_pos--;
695
+ e->start_offset = offsets[e->start_pos].start;
696
+ did_expansion = true;
697
+ }
698
+ if (e->end_pos < (offset_cnt - 1)
699
+ && (offsets[e->end_pos + 1].end - e->start_offset) < len) {
700
+ e->end_pos++;
701
+ e->end_offset = offsets[e->end_pos].end;
702
+ did_expansion = true;
703
+ }
704
+ }
705
+ return e;
706
+ }
707
+
708
+ static char *excerpt_get_str(Excerpt *e, FrtMatchVector *mv,
709
+ FrtLazyDocField *lazy_df,
710
+ const char *pre_tag,
711
+ const char *post_tag,
712
+ const char *ellipsis)
713
+ {
714
+ int i, len;
715
+ int last_offset = e->start_offset;
716
+ const int num_matches = e->end - e->start + 1;
717
+ const int pre_tag_len = (int)strlen(pre_tag);
718
+ const int post_tag_len = (int)strlen(post_tag);
719
+ const int ellipsis_len = (int)strlen(ellipsis);
720
+ char *excerpt_str = FRT_ALLOC_N(char,
721
+ 10 + e->end_offset - e->start_offset
722
+ + (num_matches * (pre_tag_len + post_tag_len))
723
+ + (2 * ellipsis_len));
724
+ char *e_ptr = excerpt_str;
725
+ if (e->start_offset > 0) {
726
+ memcpy(e_ptr, ellipsis, ellipsis_len);
727
+ e_ptr += ellipsis_len;
728
+ }
729
+ for (i = e->start; i <= e->end; i++) {
730
+ FrtMatchRange *mr = mv->matches + i;
731
+ len = mr->start_offset - last_offset;
732
+ if (len) {
733
+ frt_lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
734
+ e_ptr += len;
735
+ }
736
+ memcpy(e_ptr, pre_tag, pre_tag_len);
737
+ e_ptr += pre_tag_len;
738
+ len = mr->end_offset - mr->start_offset;
739
+ if (len) {
740
+ frt_lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
741
+ e_ptr += len;
742
+ }
743
+ memcpy(e_ptr, post_tag, post_tag_len);
744
+ e_ptr += post_tag_len;
745
+ last_offset = mr->end_offset;
746
+ }
747
+ if ((lazy_df->len - e->end_offset) <= ellipsis_len) {
748
+ /* no point using ellipsis if it takes up more space */
749
+ e->end_offset = lazy_df->len;
750
+ }
751
+ len = e->end_offset - last_offset;
752
+ if (len) {
753
+ frt_lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
754
+ e_ptr += len;
755
+ }
756
+ if (e->end_offset < lazy_df->len) {
757
+ memcpy(e_ptr, ellipsis, ellipsis_len);
758
+ e_ptr += ellipsis_len;
759
+ }
760
+ *e_ptr = '\0';
761
+ return excerpt_str;
762
+ }
763
+
764
+ static char *highlight_field(FrtMatchVector *mv,
765
+ FrtLazyDocField *lazy_df,
766
+ FrtTermVector *tv,
767
+ const char *pre_tag,
768
+ const char *post_tag)
769
+ {
770
+ const int pre_len = (int)strlen(pre_tag);
771
+ const int post_len = (int)strlen(post_tag);
772
+ char *excerpt_str =
773
+ FRT_ALLOC_N(char, 10 + lazy_df->len + (mv->size * (pre_len + post_len)));
774
+ if (mv->size > 0) {
775
+ int last_offset = 0;
776
+ int i, len;
777
+ char *e_ptr = excerpt_str;
778
+ frt_matchv_compact_with_breaks(mv);
779
+ matchv_set_offsets(mv, tv->offsets);
780
+ for (i = 0; i < mv->size; i++) {
781
+ FrtMatchRange *mr = mv->matches + i;
782
+ len = mr->start_offset - last_offset;
783
+ if (len) {
784
+ frt_lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
785
+ e_ptr += len;
786
+ }
787
+ memcpy(e_ptr, pre_tag, pre_len);
788
+ e_ptr += pre_len;
789
+ len = mr->end_offset - mr->start_offset;
790
+ if (len) {
791
+ frt_lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
792
+ e_ptr += len;
793
+ }
794
+ memcpy(e_ptr, post_tag, post_len);
795
+ e_ptr += post_len;
796
+ last_offset = mr->end_offset;
797
+ }
798
+ len = lazy_df->len - last_offset;
799
+ if (len) {
800
+ frt_lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
801
+ e_ptr += len;
802
+ }
803
+ *e_ptr = '\0';
804
+ }
805
+ else {
806
+ frt_lazy_df_get_bytes(lazy_df, excerpt_str, 0, lazy_df->len);
807
+ excerpt_str[lazy_df->len] = '\0';
808
+ }
809
+ return excerpt_str;
810
+ }
811
+
812
+ char **frt_searcher_highlight(FrtSearcher *self,
813
+ FrtQuery *query,
814
+ const int doc_num,
815
+ FrtSymbol field,
816
+ const int excerpt_len,
817
+ const int num_excerpts,
818
+ const char *pre_tag,
819
+ const char *post_tag,
820
+ const char *ellipsis)
821
+ {
822
+ char **excerpt_strs = NULL;
823
+ FrtTermVector *tv = self->get_term_vector(self, doc_num, field);
824
+ FrtLazyDoc *lazy_doc = self->get_lazy_doc(self, doc_num);
825
+ FrtLazyDocField *lazy_df = NULL;
826
+ if (lazy_doc) {
827
+ lazy_df = frt_lazy_doc_get(lazy_doc, field);
828
+ }
829
+ if (tv && lazy_df && tv->term_cnt > 0 && tv->terms[0].positions != NULL
830
+ && tv->offsets != NULL) {
831
+ FrtMatchVector *mv;
832
+ query = self->rewrite(self, query);
833
+ mv = query->get_matchv_i(query, frt_matchv_new(), tv);
834
+ frt_q_deref(query);
835
+ if (lazy_df->len < (excerpt_len * num_excerpts)) {
836
+ excerpt_strs = frt_ary_new_type_capa(char *, 1);
837
+ frt_ary_push(excerpt_strs,
838
+ highlight_field(mv, lazy_df, tv, pre_tag, post_tag));
839
+ }
840
+ else if (mv->size > 0) {
841
+ Excerpt **excerpts = FRT_ALLOC_AND_ZERO_N(Excerpt *, num_excerpts);
842
+ int e_start, e_end, i, j;
843
+ FrtMatchRange *matches = mv->matches;
844
+ double running_score = 0.0;
845
+ FrtOffset *offsets = tv->offsets;
846
+ FrtPriorityQueue *excerpt_pq;
847
+
848
+ frt_matchv_compact_with_breaks(mv);
849
+ matchv_set_offsets(mv, offsets);
850
+ excerpt_pq = frt_pq_new(mv->size, (frt_lt_ft)&excerpt_lt, &free);
851
+ /* add all possible excerpts to the priority queue */
852
+
853
+ for (e_start = e_end = 0; e_start < mv->size; e_start++) {
854
+ const int start_offset = matches[e_start].start_offset;
855
+ if (e_start > e_end) {
856
+ running_score = 0.0;
857
+ e_end = e_start;
858
+ }
859
+ while (e_end < mv->size && (matches[e_end].end_offset
860
+ <= start_offset + excerpt_len)) {
861
+ running_score += matches[e_end].score;
862
+ e_end++;
863
+ }
864
+ frt_pq_push(excerpt_pq,
865
+ excerpt_new(e_start, e_end - 1, running_score));
866
+ /* - 0.1 so that earlier matches take priority */
867
+ running_score -= matches[e_start].score;
868
+ }
869
+
870
+ for (i = 0; i < num_excerpts && excerpt_pq->size > 0; i++) {
871
+ excerpts[i] = (Excerpt *)frt_pq_pop(excerpt_pq);
872
+ if (i < num_excerpts - 1) {
873
+ /* set match ranges alread included to 0 */
874
+ Excerpt *e = excerpts[i];
875
+ for (j = e->start; j <= e->end; j++) {
876
+ matches[j].score = 0.0;
877
+ }
878
+ e = NULL;
879
+ while (e != (Excerpt *)frt_pq_top(excerpt_pq)) {
880
+ e = (Excerpt *)frt_pq_top(excerpt_pq);
881
+ excerpt_recalc_score(e, mv);
882
+ frt_pq_down(excerpt_pq);
883
+ }
884
+ }
885
+ }
886
+
887
+ qsort(excerpts, i, sizeof(Excerpt *), &excerpt_start_cmp);
888
+ for (j = 0; j < i; j++) {
889
+ Excerpt *e = excerpts[j];
890
+ e->start_pos = matches[e->start].start;
891
+ e->end_pos = matches[e->end].end;
892
+ e->start_offset = offsets[e->start_pos].start;
893
+ e->end_offset = offsets[e->end_pos].end;
894
+ }
895
+
896
+ if (i < num_excerpts) {
897
+ const int diff = num_excerpts - i;
898
+ memmove(excerpts + (diff), excerpts,
899
+ i * sizeof(Excerpt *));
900
+ for (j = 0; j < diff; j++) {
901
+ /* these new excerpts will grow into one long excerpt at
902
+ * the start */
903
+ excerpts[j] = FRT_ALLOC_AND_ZERO(Excerpt);
904
+ excerpts[j]->end = -1;
905
+ }
906
+ }
907
+
908
+ excerpt_strs = frt_ary_new_type_capa(char *, num_excerpts);
909
+ /* merge excerpts where possible */
910
+ for (i = 0; i < num_excerpts;) {
911
+ Excerpt *ei = excerpts[i];
912
+ int merged = 1; /* 1 means a single excerpt, ie no merges */
913
+ for (j = i + 1; j < num_excerpts; j++) {
914
+ Excerpt *ej = excerpts[j];
915
+ if ((ej->end_offset - ei->start_offset)
916
+ < (j - i + 1) * excerpt_len) {
917
+ ei->end = ej->end;
918
+ ei->end_pos = ej->end_pos;
919
+ ei->end_offset = ej->end_offset;
920
+ merged = j - i + 1;
921
+ }
922
+ }
923
+ excerpt_expand(ei, merged * excerpt_len, tv);
924
+ frt_ary_push(excerpt_strs,
925
+ excerpt_get_str(ei, mv, lazy_df,
926
+ pre_tag, post_tag, ellipsis));
927
+ i += merged;
928
+ }
929
+ for (i = 0; i < num_excerpts; i++) {
930
+ free(excerpts[i]);
931
+ }
932
+ free(excerpts);
933
+ frt_pq_destroy(excerpt_pq);
934
+ }
935
+ frt_matchv_destroy(mv);
936
+ }
937
+ if (tv) frt_tv_destroy(tv);
938
+ if (lazy_doc) frt_lazy_doc_close(lazy_doc);
939
+ return excerpt_strs;
940
+ }
941
+
942
+ static FrtWeight *sea_create_weight(FrtSearcher *self, FrtQuery *query)
943
+ {
944
+ return frt_q_weight(query, self);
945
+ }
946
+
947
+ static void sea_check_args(int num_docs, int first_doc)
948
+ {
949
+ if (num_docs <= 0) {
950
+ FRT_RAISE(FRT_ARG_ERROR, ":num_docs was set to %d but should be greater "
951
+ "than 0 : %d <= 0", num_docs, num_docs);
952
+ }
953
+
954
+ if (first_doc < 0) {
955
+ FRT_RAISE(FRT_ARG_ERROR, ":first_doc was set to %d but should be greater "
956
+ "than or equal to 0 : %d < 0", first_doc, first_doc);
957
+ }
958
+ }
959
+
960
+ static FrtSimilarity *sea_get_similarity(FrtSearcher *self)
961
+ {
962
+ return self->similarity;
963
+ }
964
+
965
+ /***************************************************************************
966
+ *
967
+ * IndexSearcher
968
+ *
969
+ ***************************************************************************/
970
+
971
+ #define ISEA(searcher) ((FrtIndexSearcher *)(searcher))
972
+
973
+ int frt_isea_doc_freq(FrtSearcher *self, FrtSymbol field, const char *term)
974
+ {
975
+ return frt_ir_doc_freq(ISEA(self)->ir, field, term);
976
+ }
977
+
978
+ static FrtDocument *isea_get_doc(FrtSearcher *self, int doc_num)
979
+ {
980
+ FrtIndexReader *ir = ISEA(self)->ir;
981
+ return ir->get_doc(ir, doc_num);
982
+ }
983
+
984
+ static FrtLazyDoc *isea_get_lazy_doc(FrtSearcher *self, int doc_num)
985
+ {
986
+ FrtIndexReader *ir = ISEA(self)->ir;
987
+ return ir->get_lazy_doc(ir, doc_num);
988
+ }
989
+
990
+ static int isea_max_doc(FrtSearcher *self)
991
+ {
992
+ FrtIndexReader *ir = ISEA(self)->ir;
993
+ return ir->max_doc(ir);
994
+ }
995
+
996
/*
 * Evaluates to true when the scorer's current document should be skipped:
 * either +bits+ is non-NULL and frt_bv_get(bits, scorer->doc) is false, or
 * +post_filter+ is non-NULL and its filter_func returns 0.
 *
 * Side effect: the value returned by filter_func is assigned to a variable
 * named filter_factor, which must exist in the expanding scope. All macro
 * arguments are parenthesized so that expression arguments expand safely.
 */
#define IS_FILTERED(bits, post_filter, scorer, searcher) \
    (((bits) && !frt_bv_get((bits), (scorer)->doc))\
     || ((post_filter) \
         && !(filter_factor = \
              (post_filter)->filter_func((scorer)->doc, (scorer)->score(scorer),\
                                         (searcher), (post_filter)->arg))))
1002
+
1003
+ static FrtTopDocs *isea_search_w(FrtSearcher *self,
1004
+ FrtWeight *weight,
1005
+ int first_doc,
1006
+ int num_docs,
1007
+ FrtFilter *filter,
1008
+ FrtSort *sort,
1009
+ FrtPostFilter *post_filter,
1010
+ bool load_fields)
1011
+ {
1012
+ int max_size = num_docs + (num_docs == INT_MAX ? 0 : first_doc);
1013
+ int i;
1014
+ int total_hits = 0;
1015
+ FrtHit **score_docs = NULL;
1016
+
1017
+ FrtPriorityQueue *hq;
1018
+ FrtHit *(*hq_pop)(FrtPriorityQueue *pq);
1019
+ void (*hq_insert)(FrtPriorityQueue *pq, FrtHit *hit);
1020
+ void (*hq_destroy)(FrtPriorityQueue *self);
1021
+
1022
+ FrtScorer *scorer;
1023
+ FrtHit hit;
1024
+
1025
+ float max_score = 0.0f;
1026
+ float score = 0.0f;
1027
+ float filter_factor = 1.0f;
1028
+
1029
+ FrtBitVector *bits = (filter ? frt_filt_get_bv(filter, ISEA(self)->ir) : NULL);
1030
+
1031
+ sea_check_args(num_docs, first_doc);
1032
+
1033
+ if (sort) {
1034
+ hq = frt_fshq_pq_new(max_size, sort, ISEA(self)->ir);
1035
+ hq_insert = &frt_fshq_pq_insert;
1036
+ hq_destroy = &frt_fshq_pq_destroy;
1037
+ if (load_fields) {
1038
+ hq_pop = &frt_fshq_pq_pop_fd;
1039
+ } else {
1040
+ hq_pop = &frt_fshq_pq_pop;
1041
+ }
1042
+ } else {
1043
+ hq = frt_pq_new(max_size, (frt_lt_ft)&hit_lt, &free);
1044
+ hq_pop = &hit_pq_pop;
1045
+ hq_insert = &hit_pq_insert;
1046
+ hq_destroy = &frt_pq_destroy;
1047
+ }
1048
+
1049
+ scorer = weight->scorer(weight, ISEA(self)->ir);
1050
+ if (!scorer || 0 == ISEA(self)->ir->num_docs(ISEA(self)->ir)) {
1051
+ if (scorer) scorer->destroy(scorer);
1052
+ return frt_td_new(0, 0, NULL, 0.0);
1053
+ }
1054
+
1055
+ while (scorer->next(scorer)) {
1056
+ if (bits && !frt_bv_get(bits, scorer->doc)) continue;
1057
+ score = scorer->score(scorer);
1058
+ if (post_filter &&
1059
+ !(filter_factor = post_filter->filter_func(scorer->doc,
1060
+ score,
1061
+ self,
1062
+ post_filter->arg))) {
1063
+ continue;
1064
+ }
1065
+ total_hits++;
1066
+ if (filter_factor < 1.0f) score *= filter_factor;
1067
+ if (score > max_score) max_score = score;
1068
+ hit.doc = scorer->doc; hit.score = score;
1069
+ hq_insert(hq, &hit);
1070
+ }
1071
+ scorer->destroy(scorer);
1072
+ if (hq->size > first_doc) {
1073
+ if ((hq->size - first_doc) < num_docs) {
1074
+ num_docs = hq->size - first_doc;
1075
+ }
1076
+ score_docs = FRT_ALLOC_N(FrtHit *, num_docs);
1077
+ for (i = num_docs - 1; i >= 0; i--) {
1078
+ score_docs[i] = hq_pop(hq);
1079
+ }
1080
+ } else {
1081
+ num_docs = 0;
1082
+ }
1083
+ frt_pq_clear(hq);
1084
+ hq_destroy(hq);
1085
+ return frt_td_new(total_hits, num_docs, score_docs, max_score);
1086
+ }
1087
+
1088
+ static FrtTopDocs *isea_search(FrtSearcher *self,
1089
+ FrtQuery *query,
1090
+ int first_doc,
1091
+ int num_docs,
1092
+ FrtFilter *filter,
1093
+ FrtSort *sort,
1094
+ FrtPostFilter *post_filter,
1095
+ bool load_fields)
1096
+ {
1097
+ FrtTopDocs *td;
1098
+ FrtWeight *weight = frt_q_weight(query, self);
1099
+ td = isea_search_w(self, weight, first_doc, num_docs, filter, sort, post_filter, load_fields);
1100
+ weight->destroy(weight);
1101
+ return td;
1102
+ }
1103
+
1104
+ static void isea_search_each_w(FrtSearcher *self, FrtWeight *weight, FrtFilter *filter,
1105
+ FrtPostFilter *post_filter,
1106
+ void (*fn)(FrtSearcher *, int, float, void *),
1107
+ void *arg)
1108
+ {
1109
+ FrtScorer *scorer;
1110
+ float filter_factor = 1.0f;
1111
+ FrtBitVector *bits = (filter
1112
+ ? frt_filt_get_bv(filter, ISEA(self)->ir)
1113
+ : NULL);
1114
+
1115
+ scorer = weight->scorer(weight, ISEA(self)->ir);
1116
+ if (!scorer) {
1117
+ return;
1118
+ }
1119
+
1120
+ while (scorer->next(scorer)) {
1121
+ float score;
1122
+ if (bits && !frt_bv_get(bits, scorer->doc)) continue;
1123
+ score = scorer->score(scorer);
1124
+ if (post_filter &&
1125
+ !(filter_factor = post_filter->filter_func(scorer->doc,
1126
+ score,
1127
+ self,
1128
+ post_filter->arg))) {
1129
+ continue;
1130
+ }
1131
+ fn(self, scorer->doc, filter_factor * score, arg);
1132
+ }
1133
+ scorer->destroy(scorer);
1134
+ }
1135
+
1136
+ static void isea_search_each(FrtSearcher *self, FrtQuery *query, FrtFilter *filter,
1137
+ FrtPostFilter *post_filter,
1138
+ void (*fn)(FrtSearcher *, int, float, void *),
1139
+ void *arg)
1140
+ {
1141
+ FrtWeight *weight = frt_q_weight(query, self);
1142
+ isea_search_each_w(self, weight, filter, post_filter, fn, arg);
1143
+ weight->destroy(weight);
1144
+ }
1145
+
1146
+ /*
1147
+ * Scan the index for all documents that match a query and write the results
1148
+ * to a buffer. It will stop scanning once the limit is reached and it starts
1149
+ * scanning from offset_docnum.
1150
+ *
1151
+ * Note: Unlike the offset_docnum in other search methods, this offset_docnum
1152
+ * refers to document number and not hit.
1153
+ */
1154
+ static int isea_search_unscored_w(FrtSearcher *self,
1155
+ FrtWeight *weight,
1156
+ int *buf,
1157
+ int limit,
1158
+ int offset_docnum)
1159
+ {
1160
+ int count = 0;
1161
+ FrtScorer *scorer = weight->scorer(weight, ISEA(self)->ir);
1162
+ if (scorer) {
1163
+ if (scorer->skip_to(scorer, offset_docnum)) {
1164
+ do {
1165
+ buf[count++] = scorer->doc;
1166
+ } while (count < limit && scorer->next(scorer));
1167
+ }
1168
+ scorer->destroy(scorer);
1169
+ }
1170
+ return count;
1171
+ }
1172
+
1173
+ static int isea_search_unscored(FrtSearcher *self,
1174
+ FrtQuery *query,
1175
+ int *buf,
1176
+ int limit,
1177
+ int offset_docnum)
1178
+ {
1179
+ int count;
1180
+ FrtWeight *weight = frt_q_weight(query, self);
1181
+ count = isea_search_unscored_w(self, weight, buf, limit, offset_docnum);
1182
+ weight->destroy(weight);
1183
+ return count;
1184
+ }
1185
+
1186
+ static FrtQuery *isea_rewrite(FrtSearcher *self, FrtQuery *original)
1187
+ {
1188
+ int q_is_destroyed = false;
1189
+ FrtQuery *query = original;
1190
+ FrtQuery *rewritten_query = query->rewrite(query, ISEA(self)->ir);
1191
+ while (q_is_destroyed || (query != rewritten_query)) {
1192
+ query = rewritten_query;
1193
+ rewritten_query = query->rewrite(query, ISEA(self)->ir);
1194
+ q_is_destroyed = (query->ref_cnt <= 1);
1195
+ frt_q_deref(query); /* destroy intermediate queries */
1196
+ }
1197
+ return query;
1198
+ }
1199
+
1200
+ static FrtExplanation *isea_explain(FrtSearcher *self,
1201
+ FrtQuery *query,
1202
+ int doc_num)
1203
+ {
1204
+ FrtWeight *weight = frt_q_weight(query, self);
1205
+ FrtExplanation *e = weight->explain(weight, ISEA(self)->ir, doc_num);
1206
+ weight->destroy(weight);
1207
+ return e;
1208
+ }
1209
+
1210
+ static FrtExplanation *isea_explain_w(FrtSearcher *self, FrtWeight *w, int doc_num)
1211
+ {
1212
+ return w->explain(w, ISEA(self)->ir, doc_num);
1213
+ }
1214
+
1215
+ static FrtTermVector *isea_get_term_vector(FrtSearcher *self,
1216
+ const int doc_num,
1217
+ FrtSymbol field)
1218
+ {
1219
+ FrtIndexReader *ir = ISEA(self)->ir;
1220
+ return ir->term_vector(ir, doc_num, field);
1221
+ }
1222
+
1223
+ static void isea_close(FrtSearcher *self)
1224
+ {
1225
+ if (ISEA(self)->ir && ISEA(self)->close_ir) {
1226
+ frt_ir_close(ISEA(self)->ir);
1227
+ }
1228
+ free(self);
1229
+ }
1230
+
1231
+ FrtSearcher *frt_isea_new(FrtIndexReader *ir)
1232
+ {
1233
+ FrtSearcher *self = (FrtSearcher *)FRT_ALLOC(FrtIndexSearcher);
1234
+
1235
+ ISEA(self)->ir = ir;
1236
+ ISEA(self)->close_ir = true;
1237
+
1238
+ self->similarity = frt_sim_create_default();
1239
+ self->doc_freq = &frt_isea_doc_freq;
1240
+ self->get_doc = &isea_get_doc;
1241
+ self->get_lazy_doc = &isea_get_lazy_doc;
1242
+ self->max_doc = &isea_max_doc;
1243
+ self->create_weight = &sea_create_weight;
1244
+ self->search = &isea_search;
1245
+ self->search_w = &isea_search_w;
1246
+ self->search_each = &isea_search_each;
1247
+ self->search_each_w = &isea_search_each_w;
1248
+ self->search_unscored = &isea_search_unscored;
1249
+ self->search_unscored_w = &isea_search_unscored_w;
1250
+ self->rewrite = &isea_rewrite;
1251
+ self->explain = &isea_explain;
1252
+ self->explain_w = &isea_explain_w;
1253
+ self->get_term_vector = &isea_get_term_vector;
1254
+ self->get_similarity = &sea_get_similarity;
1255
+ self->close = &isea_close;
1256
+
1257
+ return self;
1258
+ }
1259
+
1260
+ /***************************************************************************
1261
+ *
1262
+ * CachedDFSearcher
1263
+ *
1264
+ ***************************************************************************/
1265
+
1266
+ #define CDFSEA(searcher) ((CachedDFSearcher *)(searcher))
1267
+ typedef struct CachedDFSearcher
1268
+ {
1269
+ FrtSearcher super;
1270
+ FrtHash *df_map;
1271
+ int max_doc;
1272
+ } CachedDFSearcher;
1273
+
1274
+ static int cdfsea_doc_freq(FrtSearcher *self, FrtSymbol field, const char *text)
1275
+ {
1276
+ FrtTerm term;
1277
+ int *df;
1278
+ term.field = field;
1279
+ term.text = (char *)text;
1280
+ df = (int *)frt_h_get(CDFSEA(self)->df_map, &term);
1281
+ return df ? *df : 0;
1282
+ }
1283
+
1284
+ static FrtDocument *cdfsea_get_doc(FrtSearcher *self, int doc_num)
1285
+ {
1286
+ (void)self; (void)doc_num;
1287
+ FRT_RAISE(FRT_UNSUPPORTED_ERROR, "%s", FRT_UNSUPPORTED_ERROR_MSG);
1288
+ return NULL;
1289
+ }
1290
+
1291
+ static int cdfsea_max_doc(FrtSearcher *self)
1292
+ {
1293
+ (void)self;
1294
+ return CDFSEA(self)->max_doc;
1295
+ }
1296
+
1297
+ static FrtWeight *cdfsea_create_weight(FrtSearcher *self, FrtQuery *query)
1298
+ {
1299
+ (void)self; (void)query;
1300
+ FRT_RAISE(FRT_UNSUPPORTED_ERROR, "%s", FRT_UNSUPPORTED_ERROR_MSG);
1301
+ return NULL;
1302
+ }
1303
+
1304
+ static FrtTopDocs *cdfsea_search_w(FrtSearcher *self, FrtWeight *w, int fd, int nd,
1305
+ FrtFilter *f, FrtSort *s, FrtPostFilter *pf, bool load)
1306
+ {
1307
+ (void)self; (void)w; (void)fd; (void)nd;
1308
+ (void)f; (void)s; (void)pf; (void)load;
1309
+ FRT_RAISE(FRT_UNSUPPORTED_ERROR, "%s", FRT_UNSUPPORTED_ERROR_MSG);
1310
+ return NULL;
1311
+ }
1312
+
1313
+ static FrtTopDocs *cdfsea_search(FrtSearcher *self, FrtQuery *q, int fd, int nd,
1314
+ FrtFilter *f, FrtSort *s, FrtPostFilter *pf, bool load)
1315
+ {
1316
+ (void)self; (void)q; (void)fd; (void)nd;
1317
+ (void)f; (void)s; (void)pf; (void)load;
1318
+ FRT_RAISE(FRT_UNSUPPORTED_ERROR, "%s", FRT_UNSUPPORTED_ERROR_MSG);
1319
+ return NULL;
1320
+ }
1321
+
1322
+ static void cdfsea_search_each(FrtSearcher *self, FrtQuery *query, FrtFilter *filter,
1323
+ FrtPostFilter *pf,
1324
+ void (*fn)(FrtSearcher *, int, float, void *),
1325
+ void *arg)
1326
+ {
1327
+ (void)self; (void)query; (void)filter; (void)pf; (void)fn; (void)arg;
1328
+ FRT_RAISE(FRT_UNSUPPORTED_ERROR, "%s", FRT_UNSUPPORTED_ERROR_MSG);
1329
+ }
1330
+
1331
+ static void cdfsea_search_each_w(FrtSearcher *self, FrtWeight *w, FrtFilter *filter,
1332
+ FrtPostFilter *pf,
1333
+ void (*fn)(FrtSearcher *, int, float, void *),
1334
+ void *arg)
1335
+ {
1336
+ (void)self; (void)w; (void)filter; (void)pf; (void)fn; (void)arg;
1337
+ FRT_RAISE(FRT_UNSUPPORTED_ERROR, "%s", FRT_UNSUPPORTED_ERROR_MSG);
1338
+ }
1339
+
1340
+ static FrtQuery *cdfsea_rewrite(FrtSearcher *self, FrtQuery *original)
1341
+ {
1342
+ (void)self;
1343
+ original->ref_cnt++;
1344
+ return original;
1345
+ }
1346
+
1347
+ static FrtExplanation *cdfsea_explain(FrtSearcher *self, FrtQuery *query, int doc_num)
1348
+ {
1349
+ (void)self; (void)query; (void)doc_num;
1350
+ FRT_RAISE(FRT_UNSUPPORTED_ERROR, "%s", FRT_UNSUPPORTED_ERROR_MSG);
1351
+ return NULL;
1352
+ }
1353
+
1354
+ static FrtExplanation *cdfsea_explain_w(FrtSearcher *self, FrtWeight *w, int doc_num)
1355
+ {
1356
+ (void)self; (void)w; (void)doc_num;
1357
+ FRT_RAISE(FRT_UNSUPPORTED_ERROR, "%s", FRT_UNSUPPORTED_ERROR_MSG);
1358
+ return NULL;
1359
+ }
1360
+
1361
+ static FrtTermVector *cdfsea_get_term_vector(FrtSearcher *self, const int doc_num,
1362
+ FrtSymbol field)
1363
+ {
1364
+ (void)self; (void)doc_num; (void)field;
1365
+ FRT_RAISE(FRT_UNSUPPORTED_ERROR, "%s", FRT_UNSUPPORTED_ERROR_MSG);
1366
+ return NULL;
1367
+ }
1368
+
1369
+ static FrtSimilarity *cdfsea_get_similarity(FrtSearcher *self)
1370
+ {
1371
+ return self->similarity;
1372
+ }
1373
+
1374
+ static void cdfsea_close(FrtSearcher *self)
1375
+ {
1376
+ frt_h_destroy(CDFSEA(self)->df_map);
1377
+ free(self);
1378
+ }
1379
+
1380
+ static FrtSearcher *cdfsea_new(FrtHash *df_map, int max_doc)
1381
+ {
1382
+ FrtSearcher *self = (FrtSearcher *)FRT_ALLOC(CachedDFSearcher);
1383
+
1384
+ CDFSEA(self)->df_map = df_map;
1385
+ CDFSEA(self)->max_doc = max_doc;
1386
+
1387
+ self->similarity = frt_sim_create_default();
1388
+ self->doc_freq = &cdfsea_doc_freq;
1389
+ self->get_doc = &cdfsea_get_doc;
1390
+ self->max_doc = &cdfsea_max_doc;
1391
+ self->create_weight = &cdfsea_create_weight;
1392
+ self->search = &cdfsea_search;
1393
+ self->search_w = &cdfsea_search_w;
1394
+ self->search_each = &cdfsea_search_each;
1395
+ self->search_each_w = &cdfsea_search_each_w;
1396
+ self->rewrite = &cdfsea_rewrite;
1397
+ self->explain = &cdfsea_explain;
1398
+ self->explain_w = &cdfsea_explain_w;
1399
+ self->get_term_vector = &cdfsea_get_term_vector;
1400
+ self->get_similarity = &cdfsea_get_similarity;
1401
+ self->close = &cdfsea_close;
1402
+ return self;
1403
+ }
1404
+
1405
+ /***************************************************************************
1406
+ *
1407
+ * MultiSearcher
1408
+ *
1409
+ ***************************************************************************/
1410
+
1411
+ #define MSEA(searcher) ((FrtMultiSearcher *)(searcher))
1412
+ static int msea_get_searcher_index(FrtSearcher *self, int n)
1413
+ {
1414
+ FrtMultiSearcher *msea = MSEA(self);
1415
+ int lo = 0; /* search starts array */
1416
+ int hi = msea->s_cnt - 1; /* for 1st element < n, return its index */
1417
+ int mid, mid_val;
1418
+
1419
+ while (hi >= lo) {
1420
+ mid = (lo + hi) >> 1;
1421
+ mid_val = msea->starts[mid];
1422
+ if (n < mid_val) {
1423
+ hi = mid - 1;
1424
+ }
1425
+ else if (n > mid_val) {
1426
+ lo = mid + 1;
1427
+ }
1428
+ else { /* found a match */
1429
+ while (((mid+1) < msea->s_cnt)
1430
+ && (msea->starts[mid+1] == mid_val)) {
1431
+ mid++; /* scan to last match */
1432
+ }
1433
+ return mid;
1434
+ }
1435
+ }
1436
+ return hi;
1437
+ }
1438
+
1439
+ static int msea_doc_freq(FrtSearcher *self, FrtSymbol field, const char *term)
1440
+ {
1441
+ int i;
1442
+ int doc_freq = 0;
1443
+ FrtMultiSearcher *msea = MSEA(self);
1444
+ for (i = 0; i < msea->s_cnt; i++) {
1445
+ FrtSearcher *s = msea->searchers[i];
1446
+ doc_freq += s->doc_freq(s, field, term);
1447
+ }
1448
+
1449
+ return doc_freq;
1450
+ }
1451
+
1452
+ static FrtDocument *msea_get_doc(FrtSearcher *self, int doc_num)
1453
+ {
1454
+ FrtMultiSearcher *msea = MSEA(self);
1455
+ int i = msea_get_searcher_index(self, doc_num);
1456
+ FrtSearcher *s = msea->searchers[i];
1457
+ return s->get_doc(s, doc_num - msea->starts[i]);
1458
+ }
1459
+
1460
+ static FrtLazyDoc *msea_get_lazy_doc(FrtSearcher *self, int doc_num)
1461
+ {
1462
+ FrtMultiSearcher *msea = MSEA(self);
1463
+ int i = msea_get_searcher_index(self, doc_num);
1464
+ FrtSearcher *s = msea->searchers[i];
1465
+ return s->get_lazy_doc(s, doc_num - msea->starts[i]);
1466
+ }
1467
+
1468
+ static int msea_max_doc(FrtSearcher *self)
1469
+ {
1470
+ return MSEA(self)->max_doc;
1471
+ }
1472
+
1473
+ static int *msea_get_doc_freqs(FrtSearcher *self, FrtHashSet *terms)
1474
+ {
1475
+ int i;
1476
+ FrtHashSetEntry *hse;
1477
+ int *doc_freqs = FRT_ALLOC_N(int, terms->size);
1478
+ for (i = 0, hse = terms->first; hse; ++i, hse = hse->next) {
1479
+ FrtTerm *t = (FrtTerm *)hse->elem;
1480
+ doc_freqs[i] = msea_doc_freq(self, t->field, t->text);
1481
+ }
1482
+ return doc_freqs;
1483
+ }
1484
+
1485
+ static FrtWeight *msea_create_weight(FrtSearcher *self, FrtQuery *query)
1486
+ {
1487
+ int i, *doc_freqs;
1488
+ FrtSearcher *cdfsea;
1489
+ FrtWeight *w;
1490
+ FrtHash *df_map = frt_h_new((frt_hash_ft)&frt_term_hash,
1491
+ (frt_eq_ft)&frt_term_eq,
1492
+ (frt_free_ft)frt_term_destroy,
1493
+ free);
1494
+ FrtQuery *rewritten_query = self->rewrite(self, query);
1495
+ /* terms get copied directly to df_map so no need to free here */
1496
+ FrtHashSet *terms = frt_hs_new((frt_hash_ft)&frt_term_hash,
1497
+ (frt_eq_ft)&frt_term_eq,
1498
+ (frt_free_ft)NULL);
1499
+ FrtHashSetEntry *hse;
1500
+
1501
+ rewritten_query->extract_terms(rewritten_query, terms);
1502
+ doc_freqs = msea_get_doc_freqs(self, terms);
1503
+
1504
+ for (hse = terms->first, i = 0; hse; ++i, hse = hse->next) {
1505
+ frt_h_set(df_map, hse->elem, frt_imalloc(doc_freqs[i]));
1506
+ }
1507
+ frt_hs_destroy(terms);
1508
+ free(doc_freqs);
1509
+
1510
+ cdfsea = cdfsea_new(df_map, MSEA(self)->max_doc);
1511
+
1512
+ w = frt_q_weight(rewritten_query, cdfsea);
1513
+ frt_q_deref(rewritten_query);
1514
+ cdfsea->close(cdfsea);
1515
+
1516
+ return w;
1517
+ }
1518
+
1519
+ struct MultiSearchEachArg {
1520
+ int start;
1521
+ void *arg;
1522
+ void (*fn)(FrtSearcher *, int, float, void *);
1523
+ };
1524
+
1525
+ static void msea_search_each_i(FrtSearcher *self, int doc_num, float score, void *arg)
1526
+ {
1527
+ struct MultiSearchEachArg *mse_arg = (struct MultiSearchEachArg *)arg;
1528
+
1529
+ mse_arg->fn(self, doc_num + mse_arg->start, score, mse_arg->arg);
1530
+ }
1531
+
1532
+ static void msea_search_each_w(FrtSearcher *self, FrtWeight *w, FrtFilter *filter,
1533
+ FrtPostFilter *post_filter,
1534
+ void (*fn)(FrtSearcher *, int, float, void *),
1535
+ void *arg)
1536
+ {
1537
+ int i;
1538
+ struct MultiSearchEachArg mse_arg;
1539
+ FrtMultiSearcher *msea = MSEA(self);
1540
+ FrtSearcher *s;
1541
+
1542
+ mse_arg.fn = fn;
1543
+ mse_arg.arg = arg;
1544
+ for (i = 0; i < msea->s_cnt; i++) {
1545
+ s = msea->searchers[i];
1546
+ mse_arg.start = msea->starts[i];
1547
+ s->search_each_w(s, w, filter, post_filter,
1548
+ &msea_search_each_i, &mse_arg);
1549
+ }
1550
+ }
1551
+
1552
+ static void msea_search_each(FrtSearcher *self, FrtQuery *query, FrtFilter *filter,
1553
+ FrtPostFilter *post_filter,
1554
+ void (*fn)(FrtSearcher *, int, float, void *),
1555
+ void *arg)
1556
+ {
1557
+ FrtWeight *weight = frt_q_weight(query, self);
1558
+ msea_search_each_w(self, weight, filter, post_filter, fn, arg);
1559
+ weight->destroy(weight);
1560
+ }
1561
+
1562
+ static int msea_search_unscored_w(FrtSearcher *self,
1563
+ FrtWeight *w,
1564
+ int *buf,
1565
+ int limit,
1566
+ int offset_docnum)
1567
+ {
1568
+ int i, count = 0;
1569
+ FrtMultiSearcher *msea = MSEA(self);
1570
+
1571
+ for (i = 0; count < limit && i < msea->s_cnt; i++) {
1572
+ /* if offset_docnum falls in this or previous indexes */
1573
+ if (offset_docnum < msea->starts[i+1]) {
1574
+ FrtSearcher *searcher = msea->searchers[i];
1575
+ const int index_offset = msea->starts[i];
1576
+ int current_limit = limit - count;
1577
+ /* if offset_docnum occurs in the current index then adjust,
1578
+ * otherwise set it to zero as it occurred in a previous index */
1579
+ int current_offset_docnum = offset_docnum > index_offset
1580
+ ? offset_docnum - index_offset
1581
+ : 0;
1582
+
1583
+ /* record current count as we'll need to update docnums by the
1584
+ * index's offset */
1585
+ int j = count;
1586
+ count += searcher->search_unscored_w(searcher, w, buf + count,
1587
+ current_limit,
1588
+ current_offset_docnum);
1589
+ /* update doc nums with the current index's offsets */
1590
+ for (; j < count; j++) {
1591
+ buf[j] += index_offset;
1592
+ }
1593
+ }
1594
+ }
1595
+ return count;
1596
+ }
1597
+
1598
+ static int msea_search_unscored(FrtSearcher *self,
1599
+ FrtQuery *query,
1600
+ int *buf,
1601
+ int limit,
1602
+ int offset_docnum)
1603
+ {
1604
+ int count;
1605
+ FrtWeight *weight = frt_q_weight(query, self);
1606
+ count = msea_search_unscored_w(self, weight, buf, limit, offset_docnum);
1607
+ weight->destroy(weight);
1608
+ return count;
1609
+ }
1610
+
1611
+ struct MultiSearchArg {
1612
+ int total_hits, max_size;
1613
+ FrtPriorityQueue *hq;
1614
+ void (*hq_insert)(FrtPriorityQueue *pq, FrtHit *hit);
1615
+ };
1616
+
1617
+ /*
1618
+ * FIXME Not used anywhere. Is it needed?
1619
+ static void msea_search_i(FrtSearcher *self, int doc_num, float score, void *arg)
1620
+ {
1621
+ struct MultiSearchArg *ms_arg = (struct MultiSearchArg *)arg;
1622
+ FrtHit hit;
1623
+ (void)self;
1624
+
1625
+ ms_arg->total_hits++;
1626
+ hit.doc = doc_num;
1627
+ hit.score = score;
1628
+ ms_arg->hq_insert(ms_arg->hq, &hit);
1629
+ }
1630
+ */
1631
+
1632
+ static FrtTopDocs *msea_search_w(FrtSearcher *self,
1633
+ FrtWeight *weight,
1634
+ int first_doc,
1635
+ int num_docs,
1636
+ FrtFilter *filter,
1637
+ FrtSort *sort,
1638
+ FrtPostFilter *post_filter,
1639
+ bool load_fields)
1640
+ {
1641
+ int max_size = num_docs + (num_docs == INT_MAX ? 0 : first_doc);
1642
+ int i;
1643
+ int total_hits = 0;
1644
+ FrtHit **score_docs = NULL;
1645
+
1646
+ FrtPriorityQueue *hq;
1647
+ FrtHit *(*hq_pop)(FrtPriorityQueue *pq);
1648
+ void (*hq_insert)(FrtPriorityQueue *pq, FrtHit *hit);
1649
+
1650
+ float max_score = 0.0f;
1651
+
1652
+ (void)load_fields; /* does it automatically */
1653
+
1654
+ sea_check_args(num_docs, first_doc);
1655
+
1656
+ if (sort) {
1657
+ hq = frt_pq_new(max_size, (frt_lt_ft)&frt_fdshq_lt, &free);
1658
+ hq_insert = (void (*)(FrtPriorityQueue *pq, FrtHit *hit))&frt_pq_insert;
1659
+ hq_pop = (FrtHit *(*)(FrtPriorityQueue *pq))&frt_pq_pop;
1660
+ } else {
1661
+ hq = frt_pq_new(max_size, (frt_lt_ft)&hit_lt, &free);
1662
+ hq_insert = &hit_pq_multi_insert;
1663
+ hq_pop = &hit_pq_pop;
1664
+ }
1665
+
1666
+ for (i = 0; i < MSEA(self)->s_cnt; i++) {
1667
+ FrtSearcher *s = MSEA(self)->searchers[i];
1668
+ FrtTopDocs *td = s->search_w(s, weight, 0, max_size, filter, sort, post_filter, true);
1669
+ if (td->size > 0) {
1670
+ int j;
1671
+ int start = MSEA(self)->starts[i];
1672
+ for (j = 0; j < td->size; j++) {
1673
+ FrtHit *hit = td->hits[j];
1674
+ hit->doc += start;
1675
+ hq_insert(hq, hit);
1676
+ }
1677
+ td->size = 0;
1678
+ if (td->max_score > max_score) max_score = td->max_score;
1679
+ }
1680
+ total_hits += td->total_hits;
1681
+ frt_td_destroy(td);
1682
+ }
1683
+
1684
+ if (hq->size > first_doc) {
1685
+ if ((hq->size - first_doc) < num_docs) {
1686
+ num_docs = hq->size - first_doc;
1687
+ }
1688
+ score_docs = FRT_ALLOC_N(FrtHit *, num_docs);
1689
+ for (i = num_docs - 1; i >= 0; i--) {
1690
+ score_docs[i] = hq_pop(hq);
1691
+ }
1692
+ } else {
1693
+ num_docs = 0;
1694
+ }
1695
+ frt_pq_clear(hq);
1696
+ frt_pq_destroy(hq);
1697
+
1698
+ return frt_td_new(total_hits, num_docs, score_docs, max_score);
1699
+ }
1700
+
1701
+ static FrtTopDocs *msea_search(FrtSearcher *self,
1702
+ FrtQuery *query,
1703
+ int first_doc,
1704
+ int num_docs,
1705
+ FrtFilter *filter,
1706
+ FrtSort *sort,
1707
+ FrtPostFilter *post_filter,
1708
+ bool load_fields)
1709
+ {
1710
+ FrtTopDocs *td;
1711
+ FrtWeight *weight = frt_q_weight(query, self);
1712
+ td = msea_search_w(self, weight, first_doc, num_docs, filter,
1713
+ sort, post_filter, load_fields);
1714
+ weight->destroy(weight);
1715
+ return td;
1716
+ }
1717
+
1718
+ static FrtQuery *msea_rewrite(FrtSearcher *self, FrtQuery *original)
1719
+ {
1720
+ int i;
1721
+ FrtSearcher *s;
1722
+ FrtMultiSearcher *msea = MSEA(self);
1723
+ FrtQuery **queries = FRT_ALLOC_N(FrtQuery *, msea->s_cnt), *rewritten;
1724
+
1725
+ for (i = 0; i < msea->s_cnt; i++) {
1726
+ s = msea->searchers[i];
1727
+ queries[i] = s->rewrite(s, original);
1728
+ }
1729
+ rewritten = frt_q_combine(queries, msea->s_cnt);
1730
+
1731
+ for (i = 0; i < msea->s_cnt; i++) {
1732
+ frt_q_deref(queries[i]);
1733
+ }
1734
+ free(queries);
1735
+ return rewritten;
1736
+ }
1737
+
1738
+ static FrtExplanation *msea_explain(FrtSearcher *self, FrtQuery *query, int doc_num)
1739
+ {
1740
+ FrtMultiSearcher *msea = MSEA(self);
1741
+ int i = msea_get_searcher_index(self, doc_num);
1742
+ FrtWeight *w = frt_q_weight(query, self);
1743
+ FrtSearcher *s = msea->searchers[i];
1744
+ FrtExplanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
1745
+ w->destroy(w);
1746
+ return e;
1747
+ }
1748
+
1749
+ static FrtExplanation *msea_explain_w(FrtSearcher *self, FrtWeight *w, int doc_num)
1750
+ {
1751
+ FrtMultiSearcher *msea = MSEA(self);
1752
+ int i = msea_get_searcher_index(self, doc_num);
1753
+ FrtSearcher *s = msea->searchers[i];
1754
+ FrtExplanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
1755
+ return e;
1756
+ }
1757
+
1758
+ static FrtTermVector *msea_get_term_vector(FrtSearcher *self, const int doc_num,
1759
+ FrtSymbol field)
1760
+ {
1761
+ FrtMultiSearcher *msea = MSEA(self);
1762
+ int i = msea_get_searcher_index(self, doc_num);
1763
+ FrtSearcher *s = msea->searchers[i];
1764
+ return s->get_term_vector(s, doc_num - msea->starts[i], field);
1765
+ }
1766
+
1767
+ static FrtSimilarity *msea_get_similarity(FrtSearcher *self)
1768
+ {
1769
+ return self->similarity;
1770
+ }
1771
+
1772
+ static void msea_close(FrtSearcher *self)
1773
+ {
1774
+ int i;
1775
+ FrtSearcher *s;
1776
+ FrtMultiSearcher *msea = MSEA(self);
1777
+ if (msea->close_subs) {
1778
+ for (i = 0; i < msea->s_cnt; i++) {
1779
+ s = msea->searchers[i];
1780
+ s->close(s);
1781
+ }
1782
+ }
1783
+ free(msea->searchers);
1784
+ free(msea->starts);
1785
+ free(self);
1786
+ }
1787
+
1788
+ FrtSearcher *frt_msea_new(FrtSearcher **searchers, int s_cnt, bool close_subs)
1789
+ {
1790
+ int i, max_doc = 0;
1791
+ FrtSearcher *self = (FrtSearcher *)FRT_ALLOC(FrtMultiSearcher);
1792
+ int *starts = FRT_ALLOC_N(int, s_cnt + 1);
1793
+ for (i = 0; i < s_cnt; i++) {
1794
+ starts[i] = max_doc;
1795
+ max_doc += searchers[i]->max_doc(searchers[i]);
1796
+ }
1797
+ starts[i] = max_doc;
1798
+
1799
+ MSEA(self)->s_cnt = s_cnt;
1800
+ MSEA(self)->searchers = searchers;
1801
+ MSEA(self)->starts = starts;
1802
+ MSEA(self)->max_doc = max_doc;
1803
+ MSEA(self)->close_subs = close_subs;
1804
+
1805
+ self->similarity = frt_sim_create_default();
1806
+ self->doc_freq = &msea_doc_freq;
1807
+ self->get_doc = &msea_get_doc;
1808
+ self->get_lazy_doc = &msea_get_lazy_doc;
1809
+ self->max_doc = &msea_max_doc;
1810
+ self->create_weight = &msea_create_weight;
1811
+ self->search = &msea_search;
1812
+ self->search_w = &msea_search_w;
1813
+ self->search_each = &msea_search_each;
1814
+ self->search_each_w = &msea_search_each_w;
1815
+ self->search_unscored = &msea_search_unscored;
1816
+ self->search_unscored_w = &msea_search_unscored_w;
1817
+ self->rewrite = &msea_rewrite;
1818
+ self->explain = &msea_explain;
1819
+ self->explain_w = &msea_explain_w;
1820
+ self->get_term_vector = &msea_get_term_vector;
1821
+ self->get_similarity = &msea_get_similarity;
1822
+ self->close = &msea_close;
1823
+ return self;
1824
+ }