ferret 0.11.6 → 0.11.8.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (185)
  1. data/README +10 -22
  2. data/RELEASE_CHANGES +137 -0
  3. data/RELEASE_NOTES +60 -0
  4. data/Rakefile +379 -274
  5. data/TODO +100 -8
  6. data/bin/ferret-browser +0 -0
  7. data/ext/BZLIB_blocksort.c +1094 -0
  8. data/ext/BZLIB_bzlib.c +1578 -0
  9. data/ext/BZLIB_compress.c +672 -0
  10. data/ext/BZLIB_crctable.c +104 -0
  11. data/ext/BZLIB_decompress.c +626 -0
  12. data/ext/BZLIB_huffman.c +205 -0
  13. data/ext/BZLIB_randtable.c +84 -0
  14. data/ext/{api.c → STEMMER_api.c} +7 -10
  15. data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
  16. data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
  17. data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
  18. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  19. data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
  20. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  21. data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
  22. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  25. data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
  26. data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
  27. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  29. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  30. data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
  31. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  32. data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
  33. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  34. data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
  35. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  36. data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
  37. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  38. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  39. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  40. data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
  41. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  42. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  43. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  44. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  45. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  46. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  47. data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
  48. data/ext/analysis.c +276 -121
  49. data/ext/analysis.h +190 -143
  50. data/ext/api.h +3 -4
  51. data/ext/array.c +5 -3
  52. data/ext/array.h +52 -43
  53. data/ext/bitvector.c +38 -482
  54. data/ext/bitvector.h +446 -124
  55. data/ext/bzlib.h +282 -0
  56. data/ext/bzlib_private.h +503 -0
  57. data/ext/compound_io.c +23 -22
  58. data/ext/config.h +21 -11
  59. data/ext/document.c +43 -40
  60. data/ext/document.h +31 -21
  61. data/ext/except.c +20 -38
  62. data/ext/except.h +89 -76
  63. data/ext/extconf.rb +3 -2
  64. data/ext/ferret.c +49 -35
  65. data/ext/ferret.h +14 -11
  66. data/ext/field_index.c +262 -0
  67. data/ext/field_index.h +52 -0
  68. data/ext/filter.c +11 -10
  69. data/ext/fs_store.c +65 -47
  70. data/ext/global.c +245 -165
  71. data/ext/global.h +252 -54
  72. data/ext/hash.c +200 -243
  73. data/ext/hash.h +205 -163
  74. data/ext/hashset.c +118 -96
  75. data/ext/hashset.h +110 -82
  76. data/ext/header.h +19 -19
  77. data/ext/helper.c +11 -10
  78. data/ext/helper.h +14 -6
  79. data/ext/index.c +745 -366
  80. data/ext/index.h +503 -529
  81. data/ext/internal.h +1020 -0
  82. data/ext/lang.c +10 -0
  83. data/ext/lang.h +35 -15
  84. data/ext/mempool.c +5 -4
  85. data/ext/mempool.h +30 -22
  86. data/ext/modules.h +35 -7
  87. data/ext/multimapper.c +43 -2
  88. data/ext/multimapper.h +32 -23
  89. data/ext/posh.c +0 -0
  90. data/ext/posh.h +4 -38
  91. data/ext/priorityqueue.c +10 -12
  92. data/ext/priorityqueue.h +33 -21
  93. data/ext/q_boolean.c +22 -9
  94. data/ext/q_const_score.c +3 -2
  95. data/ext/q_filtered_query.c +15 -12
  96. data/ext/q_fuzzy.c +147 -135
  97. data/ext/q_match_all.c +3 -2
  98. data/ext/q_multi_term.c +28 -32
  99. data/ext/q_parser.c +451 -173
  100. data/ext/q_phrase.c +158 -79
  101. data/ext/q_prefix.c +16 -18
  102. data/ext/q_range.c +363 -31
  103. data/ext/q_span.c +130 -141
  104. data/ext/q_term.c +21 -21
  105. data/ext/q_wildcard.c +19 -23
  106. data/ext/r_analysis.c +369 -242
  107. data/ext/r_index.c +421 -434
  108. data/ext/r_qparser.c +142 -92
  109. data/ext/r_search.c +790 -407
  110. data/ext/r_store.c +44 -44
  111. data/ext/r_utils.c +264 -96
  112. data/ext/ram_store.c +29 -23
  113. data/ext/scanner.c +895 -0
  114. data/ext/scanner.h +36 -0
  115. data/ext/scanner_mb.c +6701 -0
  116. data/ext/scanner_utf8.c +4415 -0
  117. data/ext/search.c +210 -87
  118. data/ext/search.h +556 -488
  119. data/ext/similarity.c +17 -16
  120. data/ext/similarity.h +51 -44
  121. data/ext/sort.c +157 -354
  122. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  123. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  124. data/ext/stem_UTF_8_hungarian.h +16 -0
  125. data/ext/stem_UTF_8_romanian.h +16 -0
  126. data/ext/stem_UTF_8_turkish.h +16 -0
  127. data/ext/stopwords.c +287 -278
  128. data/ext/store.c +57 -51
  129. data/ext/store.h +308 -286
  130. data/ext/symbol.c +10 -0
  131. data/ext/symbol.h +23 -0
  132. data/ext/term_vectors.c +14 -293
  133. data/ext/threading.h +22 -22
  134. data/ext/win32.h +12 -4
  135. data/lib/ferret.rb +2 -1
  136. data/lib/ferret/browser.rb +1 -1
  137. data/lib/ferret/field_symbol.rb +94 -0
  138. data/lib/ferret/index.rb +221 -34
  139. data/lib/ferret/number_tools.rb +6 -6
  140. data/lib/ferret/version.rb +3 -0
  141. data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
  142. data/test/test_helper.rb +7 -2
  143. data/test/test_installed.rb +1 -0
  144. data/test/threading/thread_safety_index_test.rb +10 -1
  145. data/test/threading/thread_safety_read_write_test.rb +4 -7
  146. data/test/threading/thread_safety_test.rb +0 -0
  147. data/test/unit/analysis/tc_analyzer.rb +29 -27
  148. data/test/unit/analysis/tc_token_stream.rb +23 -16
  149. data/test/unit/index/tc_index.rb +116 -11
  150. data/test/unit/index/tc_index_reader.rb +27 -27
  151. data/test/unit/index/tc_index_writer.rb +10 -0
  152. data/test/unit/index/th_doc.rb +38 -21
  153. data/test/unit/search/tc_filter.rb +31 -10
  154. data/test/unit/search/tc_index_searcher.rb +6 -0
  155. data/test/unit/search/tm_searcher.rb +53 -1
  156. data/test/unit/store/tc_fs_store.rb +40 -2
  157. data/test/unit/store/tc_ram_store.rb +0 -0
  158. data/test/unit/store/tm_store.rb +0 -0
  159. data/test/unit/store/tm_store_lock.rb +7 -6
  160. data/test/unit/tc_field_symbol.rb +26 -0
  161. data/test/unit/ts_analysis.rb +0 -0
  162. data/test/unit/ts_index.rb +0 -0
  163. data/test/unit/ts_store.rb +0 -0
  164. data/test/unit/ts_utils.rb +0 -0
  165. data/test/unit/utils/tc_number_tools.rb +0 -0
  166. data/test/utils/content_generator.rb +226 -0
  167. metadata +262 -221
  168. data/ext/inc/lang.h +0 -48
  169. data/ext/inc/threading.h +0 -31
  170. data/ext/stem_ISO_8859_1_english.c +0 -1156
  171. data/ext/stem_ISO_8859_1_french.c +0 -1276
  172. data/ext/stem_ISO_8859_1_italian.c +0 -1091
  173. data/ext/stem_ISO_8859_1_norwegian.c +0 -296
  174. data/ext/stem_ISO_8859_1_spanish.c +0 -1119
  175. data/ext/stem_ISO_8859_1_swedish.c +0 -307
  176. data/ext/stem_UTF_8_danish.c +0 -344
  177. data/ext/stem_UTF_8_english.c +0 -1176
  178. data/ext/stem_UTF_8_french.c +0 -1296
  179. data/ext/stem_UTF_8_italian.c +0 -1113
  180. data/ext/stem_UTF_8_norwegian.c +0 -302
  181. data/ext/stem_UTF_8_portuguese.c +0 -1055
  182. data/ext/stem_UTF_8_russian.c +0 -709
  183. data/ext/stem_UTF_8_spanish.c +0 -1137
  184. data/ext/stem_UTF_8_swedish.c +0 -313
  185. data/lib/ferret_version.rb +0 -3
@@ -2,10 +2,11 @@
2
2
  #include <limits.h>
3
3
  #include "search.h"
4
4
  #include "array.h"
5
+ #include "internal.h"
5
6
 
6
7
  /***************************************************************************
7
8
  *
8
- * Explanation
9
+ * Explanation - Used to give details for query scores
9
10
  *
10
11
  ***************************************************************************/
11
12
 
@@ -46,7 +47,8 @@ char *expl_to_s_depth(Explanation *expl, int depth)
46
47
  memset(buffer, ' ', sizeof(char) * depth * 2);
47
48
  buffer[depth*2] = 0;
48
49
 
49
- buffer = estrcat(buffer, strfmt("%f = %s\n", expl->value, expl->description));
50
+ buffer = estrcat(buffer, strfmt("%f = %s\n",
51
+ expl->value, expl->description));
50
52
  for (i = 0; i < num_details; i++) {
51
53
  buffer = estrcat(buffer, expl_to_s_depth(expl->details[i], depth + 1));
52
54
  }
@@ -300,7 +302,7 @@ static const char *QUERY_NAMES[] = {
300
302
 
301
303
  static const char *UNKNOWN_QUERY_NAME = "UnkownQuery";
302
304
 
303
- const char *q_get_query_name(enum QUERY_TYPE type) {
305
+ const char *q_get_query_name(QueryType type) {
304
306
  if (type >= NELEMS(QUERY_NAMES)) {
305
307
  return UNKNOWN_QUERY_NAME;
306
308
  }
@@ -401,13 +403,14 @@ Query *q_combine(Query **queries, int q_cnt)
401
403
  }
402
404
  }
403
405
  if (uniques->size == 1) {
404
- ret_q = (Query *)uniques->elems[0];
406
+ ret_q = (Query *)uniques->first->elem;
405
407
  REF(ret_q);
406
408
  }
407
409
  else {
410
+ HashSetEntry *hse;
408
411
  ret_q = bq_new(true);
409
- for (i = 0; i < uniques->size; i++) {
410
- q = (Query *)uniques->elems[i];
412
+ for (hse = uniques->first; hse; hse = hse->next) {
413
+ q = (Query *)hse->elem;
411
414
  bq_add_query(ret_q, q, BC_SHOULD);
412
415
  }
413
416
  }
@@ -441,8 +444,8 @@ Query *q_create(size_t size)
441
444
  Query *self = (Query *)ecalloc(size);
442
445
  #ifdef DEBUG
443
446
  if (size < sizeof(Query)) {
444
- RAISE(FERRET_ERROR, "Size of a query <%d> should never be smaller than the "
445
- "size of a Query struct <%d>", (int)size, (int)sizeof(Query));
447
+ RAISE(FERRET_ERROR, "Size of a query <%d> should never be smaller than "
448
+ "the size of a Query struct <%d>", (int)size, (int)sizeof(Query));
446
449
  }
447
450
  #endif
448
451
  self->boost = 1.0;
@@ -536,8 +539,9 @@ MatchVector *matchv_add(MatchVector *self, int start, int end)
536
539
  REALLOC_N(self->matches, MatchRange, self->capa);
537
540
  }
538
541
  self->matches[self->size].start = start;
539
- self->matches[self->size].end = end;
540
- self->matches[self->size++].score = 1.0;
542
+ self->matches[self->size].end = end;
543
+ self->matches[self->size].score = 1.0;
544
+ self->size++;
541
545
  return self;
542
546
  }
543
547
 
@@ -620,7 +624,7 @@ void matchv_destroy(MatchVector *self)
620
624
  MatchVector *searcher_get_match_vector(Searcher *self,
621
625
  Query *query,
622
626
  const int doc_num,
623
- const char *field)
627
+ Symbol field)
624
628
  {
625
629
  MatchVector *mv = matchv_new();
626
630
  bool rewrite = query->get_matchv_i == q_get_matchv_i;
@@ -832,7 +836,7 @@ static char *highlight_field(MatchVector *mv,
832
836
  char **searcher_highlight(Searcher *self,
833
837
  Query *query,
834
838
  const int doc_num,
835
- const char *field,
839
+ Symbol field,
836
840
  const int excerpt_len,
837
841
  const int num_excerpts,
838
842
  const char *pre_tag,
@@ -844,7 +848,7 @@ char **searcher_highlight(Searcher *self,
844
848
  LazyDoc *lazy_doc = self->get_lazy_doc(self, doc_num);
845
849
  LazyDocField *lazy_df = NULL;
846
850
  if (lazy_doc) {
847
- lazy_df = h_get(lazy_doc->field_dict, field);
851
+ lazy_df = lazy_doc_get(lazy_doc, field);
848
852
  }
849
853
  if (tv && lazy_df && tv->term_cnt > 0 && tv->terms[0].positions != NULL
850
854
  && tv->offsets != NULL) {
@@ -888,7 +892,7 @@ char **searcher_highlight(Searcher *self,
888
892
  }
889
893
 
890
894
  for (i = 0; i < num_excerpts && excerpt_pq->size > 0; i++) {
891
- excerpts[i] = pq_pop(excerpt_pq);
895
+ excerpts[i] = (Excerpt *)pq_pop(excerpt_pq);
892
896
  if (i < num_excerpts - 1) {
893
897
  /* set match ranges alread included to 0 */
894
898
  Excerpt *e = excerpts[i];
@@ -897,7 +901,7 @@ char **searcher_highlight(Searcher *self,
897
901
  }
898
902
  e = NULL;
899
903
  while (e != (Excerpt *)pq_top(excerpt_pq)) {
900
- e = pq_top(excerpt_pq);
904
+ e = (Excerpt *)pq_top(excerpt_pq);
901
905
  excerpt_recalc_score(e, mv);
902
906
  pq_down(excerpt_pq);
903
907
  }
@@ -990,7 +994,7 @@ static Similarity *sea_get_similarity(Searcher *self)
990
994
 
991
995
  #define ISEA(searcher) ((IndexSearcher *)(searcher))
992
996
 
993
- int isea_doc_freq(Searcher *self, const char *field, const char *term)
997
+ int isea_doc_freq(Searcher *self, Symbol field, const char *term)
994
998
  {
995
999
  return ir_doc_freq(ISEA(self)->ir, field, term);
996
1000
  }
@@ -1013,10 +1017,12 @@ static int isea_max_doc(Searcher *self)
1013
1017
  return ir->max_doc(ir);
1014
1018
  }
1015
1019
 
1016
- #define IS_FILTERED(bits, filter_func, scorer, searcher) \
1020
+ #define IS_FILTERED(bits, post_filter, scorer, searcher) \
1017
1021
  ((bits && !bv_get(bits, scorer->doc))\
1018
- || (filter_func \
1019
- && !filter_func(scorer->doc, scorer->score(scorer), searcher)))
1022
+ || (post_filter \
1023
+ && !(filter_factor = \
1024
+ post_filter->filter_func(scorer->doc, scorer->score(scorer),\
1025
+ searcher, post_filter->arg))))
1020
1026
 
1021
1027
  static TopDocs *isea_search_w(Searcher *self,
1022
1028
  Weight *weight,
@@ -1024,7 +1030,7 @@ static TopDocs *isea_search_w(Searcher *self,
1024
1030
  int num_docs,
1025
1031
  Filter *filter,
1026
1032
  Sort *sort,
1027
- filter_ft filter_func,
1033
+ PostFilter *post_filter,
1028
1034
  bool load_fields)
1029
1035
  {
1030
1036
  int max_size = num_docs + (num_docs == INT_MAX ? 0 : first_doc);
@@ -1034,6 +1040,7 @@ static TopDocs *isea_search_w(Searcher *self,
1034
1040
  Hit hit;
1035
1041
  int total_hits = 0;
1036
1042
  float score, max_score = 0.0;
1043
+ float filter_factor = 1.0;
1037
1044
  BitVector *bits = (filter
1038
1045
  ? filt_get_bv(filter, ISEA(self)->ir)
1039
1046
  : NULL);
@@ -1069,11 +1076,17 @@ static TopDocs *isea_search_w(Searcher *self,
1069
1076
  }
1070
1077
 
1071
1078
  while (scorer->next(scorer)) {
1072
- if (IS_FILTERED(bits, filter_func, scorer, self)) {
1079
+ if (bits && !bv_get(bits, scorer->doc)) continue;
1080
+ score = scorer->score(scorer);
1081
+ if (post_filter &&
1082
+ !(filter_factor = post_filter->filter_func(scorer->doc,
1083
+ score,
1084
+ self,
1085
+ post_filter->arg))) {
1073
1086
  continue;
1074
1087
  }
1075
1088
  total_hits++;
1076
- score = scorer->score(scorer);
1089
+ if (filter_factor < 1.0) score *= filter_factor;
1077
1090
  if (score > max_score) max_score = score;
1078
1091
  hit.doc = scorer->doc; hit.score = score;
1079
1092
  hq_insert(hq, &hit);
@@ -1108,23 +1121,24 @@ static TopDocs *isea_search(Searcher *self,
1108
1121
  int num_docs,
1109
1122
  Filter *filter,
1110
1123
  Sort *sort,
1111
- filter_ft filter_func,
1124
+ PostFilter *post_filter,
1112
1125
  bool load_fields)
1113
1126
  {
1114
1127
  TopDocs *td;
1115
1128
  Weight *weight = q_weight(query, self);
1116
1129
  td = isea_search_w(self, weight, first_doc, num_docs, filter,
1117
- sort, filter_func, load_fields);
1130
+ sort, post_filter, load_fields);
1118
1131
  weight->destroy(weight);
1119
1132
  return td;
1120
1133
  }
1121
1134
 
1122
1135
  static void isea_search_each_w(Searcher *self, Weight *weight, Filter *filter,
1123
- filter_ft filter_func,
1136
+ PostFilter *post_filter,
1124
1137
  void (*fn)(Searcher *, int, float, void *),
1125
1138
  void *arg)
1126
1139
  {
1127
1140
  Scorer *scorer;
1141
+ float filter_factor = 1.0;
1128
1142
  BitVector *bits = (filter
1129
1143
  ? filt_get_bv(filter, ISEA(self)->ir)
1130
1144
  : NULL);
@@ -1135,24 +1149,70 @@ static void isea_search_each_w(Searcher *self, Weight *weight, Filter *filter,
1135
1149
  }
1136
1150
 
1137
1151
  while (scorer->next(scorer)) {
1138
- if (IS_FILTERED(bits, filter_func, scorer, self)) {
1152
+ if (bits && !bv_get(bits, scorer->doc)) continue;
1153
+ float score = scorer->score(scorer);
1154
+ if (post_filter &&
1155
+ !(filter_factor = post_filter->filter_func(scorer->doc,
1156
+ score,
1157
+ self,
1158
+ post_filter->arg))) {
1139
1159
  continue;
1140
1160
  }
1141
- fn(self, scorer->doc, scorer->score(scorer), arg);
1161
+ fn(self, scorer->doc, filter_factor * score, arg);
1142
1162
  }
1143
1163
  scorer->destroy(scorer);
1144
1164
  }
1145
1165
 
1146
1166
  static void isea_search_each(Searcher *self, Query *query, Filter *filter,
1147
- filter_ft filter_func,
1167
+ PostFilter *post_filter,
1148
1168
  void (*fn)(Searcher *, int, float, void *),
1149
1169
  void *arg)
1150
1170
  {
1151
1171
  Weight *weight = q_weight(query, self);
1152
- isea_search_each_w(self, weight, filter, filter_func, fn, arg);
1172
+ isea_search_each_w(self, weight, filter, post_filter, fn, arg);
1153
1173
  weight->destroy(weight);
1154
1174
  }
1155
1175
 
1176
+ /*
1177
+ * Scan the index for all documents that match a query and write the results
1178
+ * to a buffer. It will stop scanning once the limit is reached and it starts
1179
+ * scanning from offset_docnum.
1180
+ *
1181
+ * Note: Unlike the offset_docnum in other search methods, this offset_docnum
1182
+ * refers to document number and not hit.
1183
+ */
1184
+ static int isea_search_unscored_w(Searcher *self,
1185
+ Weight *weight,
1186
+ int *buf,
1187
+ int limit,
1188
+ int offset_docnum)
1189
+ {
1190
+ int count = 0;
1191
+ Scorer *scorer = weight->scorer(weight, ISEA(self)->ir);
1192
+ if (scorer) {
1193
+ if (scorer->skip_to(scorer, offset_docnum)) {
1194
+ do {
1195
+ buf[count++] = scorer->doc;
1196
+ } while (count < limit && scorer->next(scorer));
1197
+ }
1198
+ scorer->destroy(scorer);
1199
+ }
1200
+ return count;
1201
+ }
1202
+
1203
+ static int isea_search_unscored(Searcher *self,
1204
+ Query *query,
1205
+ int *buf,
1206
+ int limit,
1207
+ int offset_docnum)
1208
+ {
1209
+ int count;
1210
+ Weight *weight = q_weight(query, self);
1211
+ count = isea_search_unscored_w(self, weight, buf, limit, offset_docnum);
1212
+ weight->destroy(weight);
1213
+ return count;
1214
+ }
1215
+
1156
1216
  static Query *isea_rewrite(Searcher *self, Query *original)
1157
1217
  {
1158
1218
  int q_is_destroyed = false;
@@ -1167,7 +1227,9 @@ static Query *isea_rewrite(Searcher *self, Query *original)
1167
1227
  return query;
1168
1228
  }
1169
1229
 
1170
- static Explanation *isea_explain(Searcher *self, Query *query, int doc_num)
1230
+ static Explanation *isea_explain(Searcher *self,
1231
+ Query *query,
1232
+ int doc_num)
1171
1233
  {
1172
1234
  Weight *weight = q_weight(query, self);
1173
1235
  Explanation *e = weight->explain(weight, ISEA(self)->ir, doc_num);
@@ -1182,7 +1244,7 @@ static Explanation *isea_explain_w(Searcher *self, Weight *w, int doc_num)
1182
1244
 
1183
1245
  static TermVector *isea_get_term_vector(Searcher *self,
1184
1246
  const int doc_num,
1185
- const char *field)
1247
+ Symbol field)
1186
1248
  {
1187
1249
  IndexReader *ir = ISEA(self)->ir;
1188
1250
  return ir->term_vector(ir, doc_num, field);
@@ -1198,7 +1260,7 @@ static void isea_close(Searcher *self)
1198
1260
 
1199
1261
  Searcher *isea_new(IndexReader *ir)
1200
1262
  {
1201
- Searcher *self = (Searcher *)ecalloc(sizeof(IndexSearcher));
1263
+ Searcher *self = (Searcher *)ALLOC(IndexSearcher);
1202
1264
 
1203
1265
  ISEA(self)->ir = ir;
1204
1266
  ISEA(self)->close_ir = true;
@@ -1213,6 +1275,8 @@ Searcher *isea_new(IndexReader *ir)
1213
1275
  self->search_w = &isea_search_w;
1214
1276
  self->search_each = &isea_search_each;
1215
1277
  self->search_each_w = &isea_search_each_w;
1278
+ self->search_unscored = &isea_search_unscored;
1279
+ self->search_unscored_w = &isea_search_unscored_w;
1216
1280
  self->rewrite = &isea_rewrite;
1217
1281
  self->explain = &isea_explain;
1218
1282
  self->explain_w = &isea_explain_w;
@@ -1232,16 +1296,16 @@ Searcher *isea_new(IndexReader *ir)
1232
1296
  #define CDFSEA(searcher) ((CachedDFSearcher *)(searcher))
1233
1297
  typedef struct CachedDFSearcher
1234
1298
  {
1235
- Searcher super;
1236
- HashTable *df_map;
1237
- int max_doc;
1299
+ Searcher super;
1300
+ Hash *df_map;
1301
+ int max_doc;
1238
1302
  } CachedDFSearcher;
1239
1303
 
1240
- static int cdfsea_doc_freq(Searcher *self, const char *field, const char *text)
1304
+ static int cdfsea_doc_freq(Searcher *self, Symbol field, const char *text)
1241
1305
  {
1242
1306
  Term term;
1243
1307
  int *df;
1244
- term.field = (char *)field;
1308
+ term.field = field;
1245
1309
  term.text = (char *)text;
1246
1310
  df = (int *)h_get(CDFSEA(self)->df_map, &term);
1247
1311
  return df ? *df : 0;
@@ -1250,7 +1314,7 @@ static int cdfsea_doc_freq(Searcher *self, const char *field, const char *text)
1250
1314
  static Document *cdfsea_get_doc(Searcher *self, int doc_num)
1251
1315
  {
1252
1316
  (void)self; (void)doc_num;
1253
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1317
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1254
1318
  return NULL;
1255
1319
  }
1256
1320
 
@@ -1263,44 +1327,44 @@ static int cdfsea_max_doc(Searcher *self)
1263
1327
  static Weight *cdfsea_create_weight(Searcher *self, Query *query)
1264
1328
  {
1265
1329
  (void)self; (void)query;
1266
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1330
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1267
1331
  return NULL;
1268
1332
  }
1269
1333
 
1270
1334
  static TopDocs *cdfsea_search_w(Searcher *self, Weight *w, int fd, int nd,
1271
- Filter *f, Sort *s, filter_ft ff, bool load)
1335
+ Filter *f, Sort *s, PostFilter *pf, bool load)
1272
1336
  {
1273
1337
  (void)self; (void)w; (void)fd; (void)nd;
1274
- (void)f; (void)s; (void)ff; (void)load;
1275
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1338
+ (void)f; (void)s; (void)pf; (void)load;
1339
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1276
1340
  return NULL;
1277
1341
  }
1278
1342
 
1279
1343
  static TopDocs *cdfsea_search(Searcher *self, Query *q, int fd, int nd,
1280
- Filter *f, Sort *s, filter_ft ff, bool load)
1344
+ Filter *f, Sort *s, PostFilter *pf, bool load)
1281
1345
  {
1282
1346
  (void)self; (void)q; (void)fd; (void)nd;
1283
- (void)f; (void)s; (void)ff; (void)load;
1284
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1347
+ (void)f; (void)s; (void)pf; (void)load;
1348
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1285
1349
  return NULL;
1286
1350
  }
1287
1351
 
1288
1352
  static void cdfsea_search_each(Searcher *self, Query *query, Filter *filter,
1289
- filter_ft ff,
1353
+ PostFilter *pf,
1290
1354
  void (*fn)(Searcher *, int, float, void *),
1291
1355
  void *arg)
1292
1356
  {
1293
- (void)self; (void)query; (void)filter; (void)ff; (void)fn; (void)arg;
1294
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1357
+ (void)self; (void)query; (void)filter; (void)pf; (void)fn; (void)arg;
1358
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1295
1359
  }
1296
1360
 
1297
1361
  static void cdfsea_search_each_w(Searcher *self, Weight *w, Filter *filter,
1298
- filter_ft ff,
1362
+ PostFilter *pf,
1299
1363
  void (*fn)(Searcher *, int, float, void *),
1300
1364
  void *arg)
1301
1365
  {
1302
- (void)self; (void)w; (void)filter; (void)ff; (void)fn; (void)arg;
1303
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1366
+ (void)self; (void)w; (void)filter; (void)pf; (void)fn; (void)arg;
1367
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1304
1368
  }
1305
1369
 
1306
1370
  static Query *cdfsea_rewrite(Searcher *self, Query *original)
@@ -1313,30 +1377,28 @@ static Query *cdfsea_rewrite(Searcher *self, Query *original)
1313
1377
  static Explanation *cdfsea_explain(Searcher *self, Query *query, int doc_num)
1314
1378
  {
1315
1379
  (void)self; (void)query; (void)doc_num;
1316
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1380
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1317
1381
  return NULL;
1318
1382
  }
1319
1383
 
1320
1384
  static Explanation *cdfsea_explain_w(Searcher *self, Weight *w, int doc_num)
1321
1385
  {
1322
1386
  (void)self; (void)w; (void)doc_num;
1323
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1387
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1324
1388
  return NULL;
1325
1389
  }
1326
1390
 
1327
1391
  static TermVector *cdfsea_get_term_vector(Searcher *self, const int doc_num,
1328
- const char *field)
1392
+ Symbol field)
1329
1393
  {
1330
1394
  (void)self; (void)doc_num; (void)field;
1331
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1395
+ RAISE(UNSUPPORTED_ERROR, "%s", UNSUPPORTED_ERROR_MSG);
1332
1396
  return NULL;
1333
1397
  }
1334
1398
 
1335
1399
  static Similarity *cdfsea_get_similarity(Searcher *self)
1336
1400
  {
1337
- (void)self;
1338
- RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
1339
- return NULL;
1401
+ return self->similarity;
1340
1402
  }
1341
1403
 
1342
1404
  static void cdfsea_close(Searcher *self)
@@ -1345,13 +1407,14 @@ static void cdfsea_close(Searcher *self)
1345
1407
  free(self);
1346
1408
  }
1347
1409
 
1348
- static Searcher *cdfsea_new(HashTable *df_map, int max_doc)
1410
+ static Searcher *cdfsea_new(Hash *df_map, int max_doc)
1349
1411
  {
1350
- Searcher *self = (Searcher *)ecalloc(sizeof(CachedDFSearcher));
1412
+ Searcher *self = (Searcher *)ALLOC(CachedDFSearcher);
1351
1413
 
1352
1414
  CDFSEA(self)->df_map = df_map;
1353
1415
  CDFSEA(self)->max_doc = max_doc;
1354
1416
 
1417
+ self->similarity = sim_create_default();
1355
1418
  self->doc_freq = &cdfsea_doc_freq;
1356
1419
  self->get_doc = &cdfsea_get_doc;
1357
1420
  self->max_doc = &cdfsea_max_doc;
@@ -1403,7 +1466,7 @@ static INLINE int msea_get_searcher_index(Searcher *self, int n)
1403
1466
  return hi;
1404
1467
  }
1405
1468
 
1406
- static int msea_doc_freq(Searcher *self, const char *field, const char *term)
1469
+ static int msea_doc_freq(Searcher *self, Symbol field, const char *term)
1407
1470
  {
1408
1471
  int i;
1409
1472
  int doc_freq = 0;
@@ -1440,10 +1503,10 @@ static int msea_max_doc(Searcher *self)
1440
1503
  static int *msea_get_doc_freqs(Searcher *self, HashSet *terms)
1441
1504
  {
1442
1505
  int i;
1443
- const int num_terms = terms->size;
1444
- int *doc_freqs = ALLOC_N(int, num_terms);
1445
- for (i = 0; i < num_terms; i++) {
1446
- Term *t = (Term *)terms->elems[i];
1506
+ HashSetEntry *hse;
1507
+ int *doc_freqs = ALLOC_N(int, terms->size);
1508
+ for (i = 0, hse = terms->first; hse; ++i, hse = hse->next) {
1509
+ Term *t = (Term *)hse->elem;
1447
1510
  doc_freqs[i] = msea_doc_freq(self, t->field, t->text);
1448
1511
  }
1449
1512
  return doc_freqs;
@@ -1454,16 +1517,22 @@ static Weight *msea_create_weight(Searcher *self, Query *query)
1454
1517
  int i, *doc_freqs;
1455
1518
  Searcher *cdfsea;
1456
1519
  Weight *w;
1457
- HashTable *df_map = h_new((hash_ft)&term_hash, (eq_ft)&term_eq,
1458
- (free_ft)NULL, free);
1520
+ Hash *df_map = h_new((hash_ft)&term_hash,
1521
+ (eq_ft)&term_eq,
1522
+ (free_ft)term_destroy,
1523
+ free);
1459
1524
  Query *rewritten_query = self->rewrite(self, query);
1460
- HashSet *terms = term_set_new();
1525
+ /* terms get copied directly to df_map so no need to free here */
1526
+ HashSet *terms = hs_new((hash_ft)&term_hash,
1527
+ (eq_ft)&term_eq,
1528
+ (free_ft)NULL);
1529
+ HashSetEntry *hse;
1461
1530
 
1462
1531
  rewritten_query->extract_terms(rewritten_query, terms);
1463
1532
  doc_freqs = msea_get_doc_freqs(self, terms);
1464
1533
 
1465
- for (i = 0; i < terms->size; i++) {
1466
- h_set(df_map, terms->elems[i], imalloc(doc_freqs[i]));
1534
+ for (hse = terms->first, i = 0; hse; ++i, hse = hse->next) {
1535
+ h_set(df_map, hse->elem, imalloc(doc_freqs[i]));
1467
1536
  }
1468
1537
  hs_destroy(terms);
1469
1538
  free(doc_freqs);
@@ -1483,7 +1552,7 @@ struct MultiSearchEachArg {
1483
1552
  void (*fn)(Searcher *, int, float, void *);
1484
1553
  };
1485
1554
 
1486
- void msea_search_each_i(Searcher *self, int doc_num, float score, void *arg)
1555
+ static void msea_search_each_i(Searcher *self, int doc_num, float score, void *arg)
1487
1556
  {
1488
1557
  struct MultiSearchEachArg *mse_arg = (struct MultiSearchEachArg *)arg;
1489
1558
 
@@ -1491,7 +1560,7 @@ void msea_search_each_i(Searcher *self, int doc_num, float score, void *arg)
1491
1560
  }
1492
1561
 
1493
1562
  static void msea_search_each_w(Searcher *self, Weight *w, Filter *filter,
1494
- filter_ft filter_func,
1563
+ PostFilter *post_filter,
1495
1564
  void (*fn)(Searcher *, int, float, void *),
1496
1565
  void *arg)
1497
1566
  {
@@ -1505,18 +1574,68 @@ static void msea_search_each_w(Searcher *self, Weight *w, Filter *filter,
1505
1574
  for (i = 0; i < msea->s_cnt; i++) {
1506
1575
  s = msea->searchers[i];
1507
1576
  mse_arg.start = msea->starts[i];
1508
- s->search_each_w(s, w, filter, filter_func,
1577
+ s->search_each_w(s, w, filter, post_filter,
1509
1578
  &msea_search_each_i, &mse_arg);
1510
1579
  }
1511
1580
  }
1512
1581
 
1513
1582
  static void msea_search_each(Searcher *self, Query *query, Filter *filter,
1514
- filter_ft filter_func,
1515
- void (*fn)(Searcher *, int, float, void *), void *arg)
1583
+ PostFilter *post_filter,
1584
+ void (*fn)(Searcher *, int, float, void *),
1585
+ void *arg)
1516
1586
  {
1517
- Weight *w = q_weight(query, self);
1518
- msea_search_each_w(self, w, filter, filter_func, fn, arg);
1519
- w->destroy(w);
1587
+ Weight *weight = q_weight(query, self);
1588
+ msea_search_each_w(self, weight, filter, post_filter, fn, arg);
1589
+ weight->destroy(weight);
1590
+ }
1591
+
1592
+ static int msea_search_unscored_w(Searcher *self,
1593
+ Weight *w,
1594
+ int *buf,
1595
+ int limit,
1596
+ int offset_docnum)
1597
+ {
1598
+ int i, count = 0;
1599
+ MultiSearcher *msea = MSEA(self);
1600
+
1601
+ for (i = 0; count < limit && i < msea->s_cnt; i++) {
1602
+ /* if offset_docnum falls in this or previous indexes */
1603
+ if (offset_docnum < msea->starts[i+1]) {
1604
+ Searcher *searcher = msea->searchers[i];
1605
+ const int index_offset = msea->starts[i];
1606
+ int current_limit = limit - count;
1607
+ /* if offset_docnum occurs in the current index then adjust,
1608
+ * otherwise set it to zero as it occurred in a previous index */
1609
+ int current_offset_docnum = offset_docnum > index_offset
1610
+ ? offset_docnum - index_offset
1611
+ : 0;
1612
+
1613
+ /* record current count as we'll need to update docnums by the
1614
+ * index's offset */
1615
+ int j = count;
1616
+ count += searcher->search_unscored_w(searcher, w, buf + count,
1617
+ current_limit,
1618
+ current_offset_docnum);
1619
+ /* update doc nums with the current index's offsets */
1620
+ for (; j < count; j++) {
1621
+ buf[j] += index_offset;
1622
+ }
1623
+ }
1624
+ }
1625
+ return count;
1626
+ }
1627
+
1628
+ static int msea_search_unscored(Searcher *self,
1629
+ Query *query,
1630
+ int *buf,
1631
+ int limit,
1632
+ int offset_docnum)
1633
+ {
1634
+ int count;
1635
+ Weight *weight = q_weight(query, self);
1636
+ count = msea_search_unscored_w(self, weight, buf, limit, offset_docnum);
1637
+ weight->destroy(weight);
1638
+ return count;
1520
1639
  }
1521
1640
 
1522
1641
  struct MultiSearchArg {
@@ -1525,7 +1644,9 @@ struct MultiSearchArg {
1525
1644
  void (*hq_insert)(PriorityQueue *pq, Hit *hit);
1526
1645
  };
1527
1646
 
1528
- void msea_search_i(Searcher *self, int doc_num, float score, void *arg)
1647
+ /*
1648
+ * FIXME Not used anywhere. Is it needed?
1649
+ static void msea_search_i(Searcher *self, int doc_num, float score, void *arg)
1529
1650
  {
1530
1651
  struct MultiSearchArg *ms_arg = (struct MultiSearchArg *)arg;
1531
1652
  Hit hit;
@@ -1536,6 +1657,7 @@ void msea_search_i(Searcher *self, int doc_num, float score, void *arg)
1536
1657
  hit.score = score;
1537
1658
  ms_arg->hq_insert(ms_arg->hq, &hit);
1538
1659
  }
1660
+ */
1539
1661
 
1540
1662
  static TopDocs *msea_search_w(Searcher *self,
1541
1663
  Weight *weight,
@@ -1543,7 +1665,7 @@ static TopDocs *msea_search_w(Searcher *self,
1543
1665
  int num_docs,
1544
1666
  Filter *filter,
1545
1667
  Sort *sort,
1546
- filter_ft filter_func,
1668
+ PostFilter *post_filter,
1547
1669
  bool load_fields)
1548
1670
  {
1549
1671
  int max_size = num_docs + (num_docs == INT_MAX ? 0 : first_doc);
@@ -1573,7 +1695,7 @@ static TopDocs *msea_search_w(Searcher *self,
1573
1695
  for (i = 0; i < MSEA(self)->s_cnt; i++) {
1574
1696
  Searcher *s = MSEA(self)->searchers[i];
1575
1697
  TopDocs *td = s->search_w(s, weight, 0, max_size,
1576
- filter, sort, filter_func, true);
1698
+ filter, sort, post_filter, true);
1577
1699
  /*if (sort) printf("sort = %s\n", sort_to_s(sort)); */
1578
1700
  if (td->size > 0) {
1579
1701
  /*printf("td->size = %d %d\n", td->size, num_docs); */
@@ -1622,13 +1744,13 @@ static TopDocs *msea_search(Searcher *self,
1622
1744
  int num_docs,
1623
1745
  Filter *filter,
1624
1746
  Sort *sort,
1625
- filter_ft filter_func,
1747
+ PostFilter *post_filter,
1626
1748
  bool load_fields)
1627
1749
  {
1628
1750
  TopDocs *td;
1629
1751
  Weight *weight = q_weight(query, self);
1630
1752
  td = msea_search_w(self, weight, first_doc, num_docs, filter,
1631
- sort, filter_func, load_fields);
1753
+ sort, post_filter, load_fields);
1632
1754
  weight->destroy(weight);
1633
1755
  return td;
1634
1756
  }
@@ -1674,13 +1796,12 @@ static Explanation *msea_explain_w(Searcher *self, Weight *w, int doc_num)
1674
1796
  }
1675
1797
 
1676
1798
  static TermVector *msea_get_term_vector(Searcher *self, const int doc_num,
1677
- const char *field)
1799
+ Symbol field)
1678
1800
  {
1679
1801
  MultiSearcher *msea = MSEA(self);
1680
1802
  int i = msea_get_searcher_index(self, doc_num);
1681
1803
  Searcher *s = msea->searchers[i];
1682
- return s->get_term_vector(s, doc_num - msea->starts[i],
1683
- field);
1804
+ return s->get_term_vector(s, doc_num - msea->starts[i], field);
1684
1805
  }
1685
1806
 
1686
1807
  static Similarity *msea_get_similarity(Searcher *self)
@@ -1707,7 +1828,7 @@ static void msea_close(Searcher *self)
1707
1828
  Searcher *msea_new(Searcher **searchers, int s_cnt, bool close_subs)
1708
1829
  {
1709
1830
  int i, max_doc = 0;
1710
- Searcher *self = (Searcher *)ecalloc(sizeof(MultiSearcher));
1831
+ Searcher *self = (Searcher *)ALLOC(MultiSearcher);
1711
1832
  int *starts = ALLOC_N(int, s_cnt + 1);
1712
1833
  for (i = 0; i < s_cnt; i++) {
1713
1834
  starts[i] = max_doc;
@@ -1731,6 +1852,8 @@ Searcher *msea_new(Searcher **searchers, int s_cnt, bool close_subs)
1731
1852
  self->search_w = &msea_search_w;
1732
1853
  self->search_each = &msea_search_each;
1733
1854
  self->search_each_w = &msea_search_each_w;
1855
+ self->search_unscored = &msea_search_unscored;
1856
+ self->search_unscored_w = &msea_search_unscored_w;
1734
1857
  self->rewrite = &msea_rewrite;
1735
1858
  self->explain = &msea_explain;
1736
1859
  self->explain_w = &msea_explain_w;