ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/q_prefix.c CHANGED
@@ -7,83 +7,94 @@
7
7
  *
8
8
  ****************************************************************************/
9
9
 
10
- char *prq_to_s(Query *self, char *field)
10
+ #define PfxQ(query) ((PrefixQuery *)(query))
11
+
12
+ static char *prq_to_s(Query *self, const char *current_field)
11
13
  {
12
- char *buffer, *bptr;
13
- Term *term = (Term *)self->data;
14
- size_t tlen = strlen(term->text);
15
- size_t flen = strlen(term->field);
16
- bptr = buffer = ALLOC_N(char, tlen + flen + 35);
17
-
18
- if (strcmp(term->field, field) != 0) {
19
- sprintf(bptr, "%s:", term->field);
20
- bptr += strlen(bptr);
21
- }
22
- sprintf(bptr, "%s*", term->text);
23
- if (self->boost != 1.0) {
24
- *bptr = '^';
25
- dbl_to_s(++bptr, self->boost);
26
- }
27
-
28
- return buffer;
14
+ char *buffer, *bptr;
15
+ const char *prefix = PfxQ(self)->prefix;
16
+ const char *field = PfxQ(self)->field;
17
+ size_t plen = strlen(prefix);
18
+ size_t flen = strlen(field);
19
+
20
+ bptr = buffer = ALLOC_N(char, plen + flen + 35);
21
+
22
+ if (strcmp(field, current_field) != 0) {
23
+ sprintf(bptr, "%s:", field);
24
+ bptr += flen + 1;
25
+ }
26
+
27
+ sprintf(bptr, "%s*", prefix);
28
+ bptr += plen + 1;
29
+ if (self->boost != 1.0) {
30
+ *bptr = '^';
31
+ dbl_to_s(++bptr, self->boost);
32
+ }
33
+
34
+ return buffer;
29
35
  }
30
36
 
31
- Query *prq_rewrite(Query *self, IndexReader *ir)
37
+ static Query *prq_rewrite(Query *self, IndexReader *ir)
32
38
  {
33
- Term *prefix = (Term *)self->data;
34
- TermEnum *te = ir->terms_from(ir, prefix);
35
- char *prefix_text = prefix->text;
36
- size_t prefix_length = strlen(prefix_text);
37
- char *prefix_field = prefix->field;
38
- Query *tq;
39
- Query *bq = bq_create(true);
40
-
41
- TRY
42
- do {
43
- TermBuffer *tb = te->tb_curr;
44
- if (!tb || strcmp(tb->field, prefix_field) != 0 ||
45
- strncmp(tb->text, prefix_text, prefix_length) != 0) {
46
- break;
47
- }
48
- tq = tq_create(term_create(tb->field, tb->text)); /* found a match */
49
- tq->boost = self->boost; /* set the boost */
50
- bq_add_query(bq, tq, BC_SHOULD); /* add to query */
51
- } while (te->next(te));
52
- XFINALLY
53
- te->close(te);
54
- XENDTRY
55
-
56
- return bq;
39
+ const char *field = PfxQ(self)->field;
40
+ const int field_num = fis_get_field_num(ir->fis, field);
41
+ Query *volatile q = multi_tq_new_conf(field, MTQMaxTerms(self), 0.0);
42
+ q->boost = self->boost; /* set the boost */
43
+
44
+ if (field_num >= 0) {
45
+ const char *prefix = PfxQ(self)->prefix;
46
+ TermEnum *te = ir->terms_from(ir, field_num, prefix);
47
+ const char *term = te->curr_term;
48
+ size_t prefix_len = strlen(prefix);
49
+
50
+ TRY
51
+ do {
52
+ if (strncmp(term, prefix, prefix_len) != 0) {
53
+ break;
54
+ }
55
+ multi_tq_add_term(q, term); /* found a match */
56
+ } while (te->next(te));
57
+ XFINALLY
58
+ te->close(te);
59
+ XENDTRY
60
+ }
61
+
62
+ return q;
57
63
  }
58
64
 
59
65
  static void prq_destroy(Query *self)
60
66
  {
61
- if (self->destroy_all) term_destroy((Term *)self->data);
62
- q_destroy_i(self);
67
+ free(PfxQ(self)->field);
68
+ free(PfxQ(self)->prefix);
69
+ q_destroy_i(self);
63
70
  }
64
71
 
65
- static uint prq_hash(Query *self)
72
+ static ulong prq_hash(Query *self)
66
73
  {
67
- return term_hash((Term *)self->data);
74
+ return str_hash(PfxQ(self)->field) ^ str_hash(PfxQ(self)->prefix);
68
75
  }
69
76
 
70
77
  static int prq_eq(Query *self, Query *o)
71
78
  {
72
- return term_eq((Term *)self->data, (Term *)o->data);
79
+ return (strcmp(PfxQ(self)->prefix, PfxQ(o)->prefix) == 0)
80
+ && (strcmp(PfxQ(self)->field, PfxQ(o)->field) == 0);
73
81
  }
74
82
 
75
- Query *prefixq_create(Term *prefix)
83
+ Query *prefixq_new(const char *field, const char *prefix)
76
84
  {
77
- Query *self = q_create();
78
- self->data = prefix;
79
-
80
- self->type = PREFIX_QUERY;
81
- self->rewrite = &prq_rewrite;
82
- self->to_s = &prq_to_s;
83
- self->hash = &prq_hash;
84
- self->eq = &prq_eq;
85
- self->destroy_i = &prq_destroy;
86
- self->create_weight_i = &q_create_weight_unsup;
87
-
88
- return self;
85
+ Query *self = q_new(PrefixQuery);
86
+
87
+ PfxQ(self)->field = estrdup(field);
88
+ PfxQ(self)->prefix = estrdup(prefix);
89
+ MTQMaxTerms(self) = PREFIX_QUERY_MAX_TERMS;
90
+
91
+ self->type = PREFIX_QUERY;
92
+ self->rewrite = &prq_rewrite;
93
+ self->to_s = &prq_to_s;
94
+ self->hash = &prq_hash;
95
+ self->eq = &prq_eq;
96
+ self->destroy_i = &prq_destroy;
97
+ self->create_weight_i = &q_create_weight_unsup;
98
+
99
+ return self;
89
100
  }
data/ext/q_range.c CHANGED
@@ -1,120 +1,134 @@
1
1
  #include <string.h>
2
2
  #include "search.h"
3
3
 
4
- static char * const NIL_BOUNDS_ERROR_MSG = "At least one value must be non-nil";
5
- static char * const LOWER_BOUND_ERROR_MSG = "The lower bound must be non-nil to be inclusive";
6
- static char * const UPPER_BOUND_ERROR_MSG = "The upper bound must be non-nil to be inclusive";
7
- static char * const BOUND_ORDER_ERROR_MSG = "The lower bound must less than the upper bound";
8
-
9
4
  /*****************************************************************************
10
5
  *
11
6
  * Range
12
7
  *
13
8
  *****************************************************************************/
14
9
 
15
- char *range_to_s(Range *range, char *field, float boost)
10
+ typedef struct Range
16
11
  {
17
- char *buffer, *b;
18
- size_t flen, llen, ulen;
19
-
20
- flen = strlen(range->field);
21
- llen = range->lower_term ? strlen(range->lower_term) : 0;
22
- ulen = range->upper_term ? strlen(range->upper_term) : 0;
23
- buffer = ALLOC_N(char, flen + llen + ulen + 40);
24
- b = buffer;
25
-
26
- if (strcmp(field, range->field)) {
27
- memcpy(buffer, range->field, flen * sizeof(char));
28
- b += flen;
29
- *b = ':';
30
- b++;
31
- }
32
-
33
- if (range->lower_term) {
34
- *b = range->include_lower ? '[' : '{';
35
- b++;
36
- memcpy(b, range->lower_term, llen);
37
- b += llen;
38
- } else {
39
- *b = '<';
40
- b++;
41
- }
42
-
43
- if (range->upper_term && range->lower_term) {
44
- *b = ' '; b++;
45
- }
46
-
47
- if (range->upper_term) {
48
- memcpy(b, range->upper_term, ulen);
49
- b += ulen;
50
- *b = range->include_upper ? ']' : '}';
51
- b++;
52
- } else {
53
- *b = '>';
54
- b++;
55
- }
56
-
57
- *b = 0;
58
- if (boost != 1.0) {
59
- *b = '^';
60
- dbl_to_s(b + 1, boost);
61
- }
62
- return buffer;
12
+ char *field;
13
+ char *lower_term;
14
+ char *upper_term;
15
+ bool include_lower : 1;
16
+ bool include_upper : 1;
17
+ } Range;
18
+
19
+ static char *range_to_s(Range *range, const char *field, float boost)
20
+ {
21
+ char *buffer, *b;
22
+ size_t flen, llen, ulen;
23
+
24
+ flen = strlen(range->field);
25
+ llen = range->lower_term ? strlen(range->lower_term) : 0;
26
+ ulen = range->upper_term ? strlen(range->upper_term) : 0;
27
+ buffer = ALLOC_N(char, flen + llen + ulen + 40);
28
+ b = buffer;
29
+
30
+ if (strcmp(field, range->field)) {
31
+ memcpy(buffer, range->field, flen * sizeof(char));
32
+ b += flen;
33
+ *b = ':';
34
+ b++;
35
+ }
36
+
37
+ if (range->lower_term) {
38
+ *b = range->include_lower ? '[' : '{';
39
+ b++;
40
+ memcpy(b, range->lower_term, llen);
41
+ b += llen;
42
+ } else {
43
+ *b = '<';
44
+ b++;
45
+ }
46
+
47
+ if (range->upper_term && range->lower_term) {
48
+ *b = ' '; b++;
49
+ }
50
+
51
+ if (range->upper_term) {
52
+ memcpy(b, range->upper_term, ulen);
53
+ b += ulen;
54
+ *b = range->include_upper ? ']' : '}';
55
+ b++;
56
+ } else {
57
+ *b = '>';
58
+ b++;
59
+ }
60
+
61
+ *b = 0;
62
+ if (boost != 1.0) {
63
+ *b = '^';
64
+ dbl_to_s(b + 1, boost);
65
+ }
66
+ return buffer;
63
67
  }
64
68
 
65
- void range_destroy(void *p)
69
+ static void range_destroy(Range *range)
66
70
  {
67
- Range *range = (Range *)p;
68
- free(range->field);
69
- if (range->lower_term) free(range->lower_term);
70
- if (range->upper_term) free(range->upper_term);
71
- free(range);
71
+ free(range->field);
72
+ free(range->lower_term);
73
+ free(range->upper_term);
74
+ free(range);
72
75
  }
73
76
 
74
- static inline uint range_hash(Range *self)
77
+ static ulong range_hash(Range *filt)
75
78
  {
76
- return self->include_lower | (self->include_upper << 1) |
77
- ((str_hash(self->field) ^
78
- (self->lower_term ? str_hash(self->lower_term) : 0) ^
79
- (self->upper_term ? str_hash(self->upper_term) : 0)) << 2);
79
+ return filt->include_lower | (filt->include_upper << 1)
80
+ | ((str_hash(filt->field)
81
+ ^ (filt->lower_term ? str_hash(filt->lower_term) : 0)
82
+ ^ (filt->upper_term ? str_hash(filt->upper_term) : 0)) << 2);
80
83
  }
81
84
 
82
- static inline int str_eq(char *s1, char *s2)
85
+ static int str_eq(char *s1, char *s2)
83
86
  {
84
- return (s1 && s2 && (strcmp(s1, s2) == 0)) || (s1 == s2);
87
+ return (s1 && s2 && (strcmp(s1, s2) == 0)) || (s1 == s2);
85
88
  }
86
89
 
87
- static inline int range_eq(Range *self, Range *o)
90
+ static int range_eq(Range *filt, Range *o)
88
91
  {
89
- return (str_eq(self->field, o->field) &&
90
- str_eq(self->lower_term, o->lower_term) &&
91
- str_eq(self->upper_term, o->upper_term) &&
92
- (self->include_lower == o->include_lower) &&
93
- (self->include_upper == o->include_upper));
92
+ return (str_eq(filt->field, o->field)
93
+ && str_eq(filt->lower_term, o->lower_term)
94
+ && str_eq(filt->upper_term, o->upper_term)
95
+ && (filt->include_lower == o->include_lower)
96
+ && (filt->include_upper == o->include_upper));
94
97
  }
95
98
 
96
- Range *range_create(const char *field, char *lower_term, char *upper_term,
97
- bool include_lower, bool include_upper)
99
+ Range *range_new(const char *field, const char *lower_term,
100
+ const char *upper_term, bool include_lower,
101
+ bool include_upper)
98
102
  {
99
- Range *range;
100
-
101
- if (!lower_term && !upper_term)
102
- RAISE(ARG_ERROR, NIL_BOUNDS_ERROR_MSG);
103
- if (include_lower && !lower_term)
104
- RAISE(ARG_ERROR, LOWER_BOUND_ERROR_MSG);
105
- if (include_upper && !upper_term)
106
- RAISE(ARG_ERROR, UPPER_BOUND_ERROR_MSG);
107
- if (upper_term && lower_term && (strcmp(upper_term, lower_term) < 0))
108
- RAISE(ARG_ERROR, BOUND_ORDER_ERROR_MSG);
109
-
110
- range = ALLOC(Range);
111
-
112
- range->field = estrdup((char *)field);
113
- range->lower_term = lower_term ? estrdup(lower_term) : NULL;
114
- range->upper_term = upper_term ? estrdup(upper_term) : NULL;
115
- range->include_lower = include_lower;
116
- range->include_upper = include_upper;
117
- return range;
103
+ Range *range;
104
+
105
+ if (!lower_term && !upper_term) {
106
+ RAISE(ARG_ERROR, "Nil bounds for range. A range must include either "
107
+ "lower bound or an upper bound");
108
+ }
109
+ if (include_lower && !lower_term) {
110
+ RAISE(ARG_ERROR, "Lower bound must be non-nil to be inclusive. That "
111
+ "is, if you specify :include_lower => true when you create a "
112
+ "range you must include a :lower_term");
113
+ }
114
+ if (include_upper && !upper_term) {
115
+ RAISE(ARG_ERROR, "Upper bound must be non-nil to be inclusive. That "
116
+ "is, if you specify :include_upper => true when you create a "
117
+ "range you must include a :upper_term");
118
+ }
119
+ if (upper_term && lower_term && (strcmp(upper_term, lower_term) < 0)) {
120
+ RAISE(ARG_ERROR, "Upper bound must be greater than lower bound. "
121
+ "\"%s\" < \"%s\"", upper_term, lower_term);
122
+ }
123
+
124
+ range = ALLOC(Range);
125
+
126
+ range->field = estrdup((char *)field);
127
+ range->lower_term = lower_term ? estrdup(lower_term) : NULL;
128
+ range->upper_term = upper_term ? estrdup(upper_term) : NULL;
129
+ range->include_lower = include_lower;
130
+ range->include_upper = include_upper;
131
+ return range;
118
132
  }
119
133
 
120
134
  /***************************************************************************
@@ -123,109 +137,112 @@ Range *range_create(const char *field, char *lower_term, char *upper_term,
123
137
  *
124
138
  ***************************************************************************/
125
139
 
126
- void rfilt_destroy(Filter *self)
140
+ typedef struct RangeFilter
141
+ {
142
+ Filter super;
143
+ Range *range;
144
+ } RangeFilter;
145
+
146
+ #define RF(filt) ((RangeFilter *)(filt))
147
+
148
+ static void rfilt_destroy_i(Filter *filt)
127
149
  {
128
- range_destroy(self->data);
129
- filt_destroy(self);
150
+ range_destroy(RF(filt)->range);
151
+ filt_destroy_i(filt);
130
152
  }
131
153
 
132
- char *rfilt_to_s(Filter *self)
154
+ static char *rfilt_to_s(Filter *filt)
133
155
  {
134
- Range *range = (Range *)self->data;
135
- char *rstr = range_to_s(range, "", 1.0);
136
- char *rfstr = epstrdup("RangeFilter< %s >", strlen(rstr), rstr);
137
- free(rstr);
138
- return rfstr;
156
+ char *rstr = range_to_s(RF(filt)->range, "", 1.0);
157
+ char *rfstr = strfmt("RangeFilter< %s >", rstr);
158
+ free(rstr);
159
+ return rfstr;
139
160
  }
140
161
 
141
- BitVector *rfilt_get_bv(Filter *self, IndexReader *ir)
162
+ static BitVector *rfilt_get_bv_i(Filter *filt, IndexReader *ir)
142
163
  {
143
- BitVector *bv = bv_create_size(ir->max_doc(ir));
144
- Range *range = (Range *)self->data;
145
- char *field = range->field;
146
- char *lower_term = range->lower_term ? range->lower_term : (char *)EMPTY_STRING;
147
- char *upper_term = range->upper_term;
148
- bool include_upper = range->include_upper;
149
-
150
- Term *term_from = term_create(range->field, lower_term);
151
- Term term;
152
- TermBuffer *tb;
153
- TermEnum* te;
154
- TermDocEnum *tde;
155
- bool check_lower;
156
-
157
- te = ir->terms_from(ir, term_from);
158
- if (te->tb_curr == NULL) {
159
- return bv;
160
- }
161
-
162
- check_lower = false;
163
- if (!range->include_lower) // make adjustments to set to exclusive
164
- check_lower = true;
165
-
166
- tde = ir->term_docs(ir);
167
- tb = te->tb_curr;
168
- term.text = tb->text;
169
- do {
170
- if (tb && strcmp(tb->field, field) == 0) {
171
- if (!check_lower || lower_term == EMPTY_STRING ||
172
- strcmp(tb->text, lower_term) > 0) {
173
- check_lower = false;
174
- if (upper_term) {
175
- int compare = strcmp(upper_term, tb->text);
176
- /* if beyond the upper term, or is exclusive and
177
- * this is equal to the upper term, break out */
178
- if ((compare < 0) ||
179
- (!include_upper && compare==0)) {
180
- break;
181
- }
182
- }
183
- /* we have a good term, find the docs */
184
- /* text is already pointing to term buffer text */
185
- term.field = tb->field;
186
- tde->seek(tde, &term);
187
- while (tde->next(tde)) {
188
- bv_set(bv, tde->doc_num(tde));
189
- //printf("Setting %d\n", tde->doc_num(tde));
164
+ BitVector *bv = bv_new_capa(ir->max_doc(ir));
165
+ Range *range = RF(filt)->range;
166
+ FieldInfo *fi = fis_get_field(ir->fis, range->field);
167
+ /* if the field info exists we need to add docs to the bit vector, otherwise
168
+ * we just return an empty bit vector */
169
+ if (fi) {
170
+ const char *lower_term =
171
+ range->lower_term ? range->lower_term : EMPTY_STRING;
172
+ const char *upper_term = range->upper_term;
173
+ const bool include_upper = range->include_upper;
174
+ const int field_num = fi->number;
175
+ char *term;
176
+ TermEnum* te;
177
+ TermDocEnum *tde;
178
+ bool check_lower;
179
+
180
+ te = ir->terms(ir, field_num);
181
+ if (te->skip_to(te, lower_term) == NULL) {
182
+ te->close(te);
183
+ return bv;
190
184
  }
191
- }
192
- } else {
193
- break;
185
+
186
+ check_lower = !(range->include_lower || (lower_term == EMPTY_STRING));
187
+
188
+ tde = ir->term_docs(ir);
189
+ term = te->curr_term;
190
+ do {
191
+ if (!check_lower
192
+ || (strcmp(term, lower_term) > 0)) {
193
+ check_lower = false;
194
+ if (upper_term) {
195
+ int compare = strcmp(upper_term, term);
196
+ /* Break if the current term is greater than or equal to the
197
+ * upper term and include_upper is false, or if the term is
198
+ * strictly greater than the upper term. This is optimized so that
199
+ * only one check is done except in the last check or two */
200
+ if ((compare <= 0)
201
+ && (!include_upper || (compare < 0))) {
202
+ break;
203
+ }
204
+ }
205
+ /* we have a good term, find the docs */
206
+ /* text is already pointing to term buffer text */
207
+ tde->seek_te(tde, te);
208
+ while (tde->next(tde)) {
209
+ bv_set(bv, tde->doc_num(tde));
210
+ /* printf("Setting %d\n", tde->doc_num(tde)); */
211
+ }
212
+ }
213
+ } while (te->next(te));
214
+
215
+ tde->close(tde);
216
+ te->close(te);
194
217
  }
195
- } while (te->next(te));
196
-
197
- tde->close(tde);
198
- te->close(te);
199
- term_destroy(term_from);
200
218
 
201
- return bv;
219
+ return bv;
202
220
  }
203
221
 
204
- uint rfilt_hash(Filter *self)
222
+ static ulong rfilt_hash(Filter *filt)
205
223
  {
206
- return range_hash((Range *)self->data);
224
+ return range_hash(RF(filt)->range);
207
225
  }
208
226
 
209
- int rfilt_eq(Filter *self, Filter *o)
227
+ static int rfilt_eq(Filter *filt, Filter *o)
210
228
  {
211
- return range_eq((Range *)self->data, (Range *)o->data);
229
+ return range_eq(RF(filt)->range, RF(o)->range);
212
230
  }
213
231
 
214
- Filter *rfilt_create(const char *field, char *lower_term, char *upper_term,
215
- bool include_lower, bool include_upper)
232
+ Filter *rfilt_new(const char *field,
233
+ const char *lower_term, const char *upper_term,
234
+ bool include_lower, bool include_upper)
216
235
  {
217
- Filter *self;
218
- Range *range = range_create(field, lower_term, upper_term,
219
- include_lower, include_upper);
220
-
221
- self = filt_create("RangeFilter");
222
- self->data = range;
223
- self->get_bv = &rfilt_get_bv;
224
- self->hash = &rfilt_hash;
225
- self->eq = &rfilt_eq;
226
- self->to_s = &rfilt_to_s;
227
- self->destroy = &rfilt_destroy;
228
- return self;
236
+ Filter *filt = filt_new(RangeFilter);
237
+ RF(filt)->range = range_new(field, lower_term, upper_term,
238
+ include_lower, include_upper);
239
+
240
+ filt->get_bv_i = &rfilt_get_bv_i;
241
+ filt->hash = &rfilt_hash;
242
+ filt->eq = &rfilt_eq;
243
+ filt->to_s = &rfilt_to_s;
244
+ filt->destroy_i = &rfilt_destroy_i;
245
+ return filt;
229
246
  }
230
247
 
231
248
  /*****************************************************************************
@@ -234,61 +251,69 @@ Filter *rfilt_create(const char *field, char *lower_term, char *upper_term,
234
251
  *
235
252
  *****************************************************************************/
236
253
 
237
- char *rq_to_s(Query *self, char *field)
254
+ #define RQ(query) ((RangeQuery *)(query))
255
+ typedef struct RangeQuery
256
+ {
257
+ Query f;
258
+ Range *range;
259
+ } RangeQuery;
260
+
261
+ static char *rq_to_s(Query *self, const char *field)
238
262
  {
239
- Range *range = (Range *)self->data;
240
- return range_to_s(range, field, self->boost);
263
+ return range_to_s(RQ(self)->range, field, self->boost);
241
264
  }
242
265
 
243
- void rq_destroy(Query *self)
266
+ static void rq_destroy(Query *self)
244
267
  {
245
- range_destroy(self->data);
246
- q_destroy_i(self);
268
+ range_destroy(RQ(self)->range);
269
+ q_destroy_i(self);
247
270
  }
248
271
 
249
- Query *rq_rewrite(Query *self, IndexReader *ir)
272
+ static Query *rq_rewrite(Query *self, IndexReader *ir)
250
273
  {
251
- Range *r = (Range *)self->data;
252
- Filter *filter = rfilt_create(r->field, r->lower_term, r->upper_term,
253
- r->include_lower, r->include_upper);
254
- return csq_create(filter);
274
+ Range *r = RQ(self)->range;
275
+ Filter *filter = rfilt_new(r->field, r->lower_term, r->upper_term,
276
+ r->include_lower, r->include_upper);
277
+ (void)ir;
278
+ return csq_new_nr(filter);
255
279
  }
256
280
 
257
- static uint rq_hash(Query *self)
281
+ static ulong rq_hash(Query *self)
258
282
  {
259
- return range_hash((Range *)self->data);
283
+ return range_hash(RQ(self)->range);
260
284
  }
261
285
 
262
286
  static int rq_eq(Query *self, Query *o)
263
287
  {
264
- return range_eq((Range *)self->data, (Range *)o->data);
288
+ return range_eq(RQ(self)->range, RQ(o)->range);
265
289
  }
266
290
 
267
- Query *rq_create_less(const char *field, char *upper_term, bool include_upper)
291
+ Query *rq_new_less(const char *field, const char *upper_term,
292
+ bool include_upper)
268
293
  {
269
- return rq_create(field, NULL, upper_term, false, include_upper);
294
+ return rq_new(field, NULL, upper_term, false, include_upper);
270
295
  }
271
296
 
272
- Query *rq_create_more(const char *field, char *lower_term, bool include_lower)
297
+ Query *rq_new_more(const char *field, const char *lower_term,
298
+ bool include_lower)
273
299
  {
274
- return rq_create(field, lower_term, NULL, include_lower, false);
300
+ return rq_new(field, lower_term, NULL, include_lower, false);
275
301
  }
276
302
 
277
- Query *rq_create(const char *field, char *lower_term, char *upper_term,
278
- bool include_lower, bool include_upper)
303
+ Query *rq_new(const char *field, const char *lower_term,
304
+ const char *upper_term, bool include_lower, bool include_upper)
279
305
  {
280
- Query *self = q_create();
281
- Range *range = range_create(field, lower_term, upper_term,
282
- include_lower, include_upper);
283
-
284
- self->data = range;
285
-
286
- self->type = RANGE_QUERY;
287
- self->rewrite = &rq_rewrite;
288
- self->to_s = &rq_to_s;
289
- self->hash = &rq_hash;
290
- self->eq = &rq_eq;
291
- self->destroy_i = &rq_destroy;
292
- self->create_weight_i = &q_create_weight_unsup;
293
- return self;
306
+ Query *self = q_new(RangeQuery);
307
+
308
+ RQ(self)->range = range_new(field, lower_term, upper_term,
309
+ include_lower, include_upper);
310
+
311
+ self->type = RANGE_QUERY;
312
+ self->rewrite = &rq_rewrite;
313
+ self->to_s = &rq_to_s;
314
+ self->hash = &rq_hash;
315
+ self->eq = &rq_eq;
316
+ self->destroy_i = &rq_destroy;
317
+ self->create_weight_i = &q_create_weight_unsup;
318
+ return self;
294
319
  }