jk-ferret 0.11.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (228) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +90 -0
  4. data/RELEASE_CHANGES +137 -0
  5. data/RELEASE_NOTES +60 -0
  6. data/Rakefile +443 -0
  7. data/TODO +109 -0
  8. data/TUTORIAL +231 -0
  9. data/bin/ferret-browser +79 -0
  10. data/ext/BZLIB_blocksort.c +1094 -0
  11. data/ext/BZLIB_bzlib.c +1578 -0
  12. data/ext/BZLIB_compress.c +672 -0
  13. data/ext/BZLIB_crctable.c +104 -0
  14. data/ext/BZLIB_decompress.c +626 -0
  15. data/ext/BZLIB_huffman.c +205 -0
  16. data/ext/BZLIB_randtable.c +84 -0
  17. data/ext/STEMMER_api.c +66 -0
  18. data/ext/STEMMER_libstemmer.c +93 -0
  19. data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
  20. data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
  21. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  22. data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
  25. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  26. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  27. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
  29. data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
  30. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  31. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  32. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  33. data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
  34. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  35. data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
  36. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  37. data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
  38. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  39. data/ext/STEMMER_stem_UTF_8_german.c +509 -0
  40. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  41. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  42. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  43. data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
  44. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  45. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  46. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  47. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  48. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  49. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  50. data/ext/STEMMER_utilities.c +478 -0
  51. data/ext/analysis.c +1710 -0
  52. data/ext/analysis.h +266 -0
  53. data/ext/api.h +26 -0
  54. data/ext/array.c +125 -0
  55. data/ext/array.h +62 -0
  56. data/ext/bitvector.c +96 -0
  57. data/ext/bitvector.h +594 -0
  58. data/ext/bzlib.h +282 -0
  59. data/ext/bzlib_private.h +503 -0
  60. data/ext/compound_io.c +384 -0
  61. data/ext/config.h +52 -0
  62. data/ext/document.c +159 -0
  63. data/ext/document.h +63 -0
  64. data/ext/except.c +102 -0
  65. data/ext/except.h +176 -0
  66. data/ext/extconf.rb +15 -0
  67. data/ext/ferret.c +416 -0
  68. data/ext/ferret.h +94 -0
  69. data/ext/field_index.c +262 -0
  70. data/ext/field_index.h +52 -0
  71. data/ext/filter.c +157 -0
  72. data/ext/fs_store.c +493 -0
  73. data/ext/global.c +458 -0
  74. data/ext/global.h +302 -0
  75. data/ext/hash.c +524 -0
  76. data/ext/hash.h +515 -0
  77. data/ext/hashset.c +192 -0
  78. data/ext/hashset.h +215 -0
  79. data/ext/header.h +58 -0
  80. data/ext/helper.c +63 -0
  81. data/ext/helper.h +21 -0
  82. data/ext/index.c +6804 -0
  83. data/ext/index.h +935 -0
  84. data/ext/internal.h +1019 -0
  85. data/ext/lang.c +10 -0
  86. data/ext/lang.h +68 -0
  87. data/ext/libstemmer.h +79 -0
  88. data/ext/mempool.c +88 -0
  89. data/ext/mempool.h +43 -0
  90. data/ext/modules.h +190 -0
  91. data/ext/multimapper.c +351 -0
  92. data/ext/multimapper.h +60 -0
  93. data/ext/posh.c +1006 -0
  94. data/ext/posh.h +973 -0
  95. data/ext/priorityqueue.c +149 -0
  96. data/ext/priorityqueue.h +155 -0
  97. data/ext/q_boolean.c +1621 -0
  98. data/ext/q_const_score.c +162 -0
  99. data/ext/q_filtered_query.c +212 -0
  100. data/ext/q_fuzzy.c +280 -0
  101. data/ext/q_match_all.c +149 -0
  102. data/ext/q_multi_term.c +673 -0
  103. data/ext/q_parser.c +3103 -0
  104. data/ext/q_phrase.c +1206 -0
  105. data/ext/q_prefix.c +98 -0
  106. data/ext/q_range.c +682 -0
  107. data/ext/q_span.c +2390 -0
  108. data/ext/q_term.c +337 -0
  109. data/ext/q_wildcard.c +167 -0
  110. data/ext/r_analysis.c +2626 -0
  111. data/ext/r_index.c +3468 -0
  112. data/ext/r_qparser.c +635 -0
  113. data/ext/r_search.c +4490 -0
  114. data/ext/r_store.c +513 -0
  115. data/ext/r_utils.c +1131 -0
  116. data/ext/ram_store.c +476 -0
  117. data/ext/scanner.c +895 -0
  118. data/ext/scanner.h +36 -0
  119. data/ext/scanner_mb.c +6701 -0
  120. data/ext/scanner_utf8.c +4415 -0
  121. data/ext/search.c +1864 -0
  122. data/ext/search.h +953 -0
  123. data/ext/similarity.c +151 -0
  124. data/ext/similarity.h +89 -0
  125. data/ext/sort.c +786 -0
  126. data/ext/stem_ISO_8859_1_danish.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  128. data/ext/stem_ISO_8859_1_english.h +16 -0
  129. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  130. data/ext/stem_ISO_8859_1_french.h +16 -0
  131. data/ext/stem_ISO_8859_1_german.h +16 -0
  132. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  133. data/ext/stem_ISO_8859_1_italian.h +16 -0
  134. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  135. data/ext/stem_ISO_8859_1_porter.h +16 -0
  136. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  137. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  138. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  139. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  140. data/ext/stem_KOI8_R_russian.h +16 -0
  141. data/ext/stem_UTF_8_danish.h +16 -0
  142. data/ext/stem_UTF_8_dutch.h +16 -0
  143. data/ext/stem_UTF_8_english.h +16 -0
  144. data/ext/stem_UTF_8_finnish.h +16 -0
  145. data/ext/stem_UTF_8_french.h +16 -0
  146. data/ext/stem_UTF_8_german.h +16 -0
  147. data/ext/stem_UTF_8_hungarian.h +16 -0
  148. data/ext/stem_UTF_8_italian.h +16 -0
  149. data/ext/stem_UTF_8_norwegian.h +16 -0
  150. data/ext/stem_UTF_8_porter.h +16 -0
  151. data/ext/stem_UTF_8_portuguese.h +16 -0
  152. data/ext/stem_UTF_8_romanian.h +16 -0
  153. data/ext/stem_UTF_8_russian.h +16 -0
  154. data/ext/stem_UTF_8_spanish.h +16 -0
  155. data/ext/stem_UTF_8_swedish.h +16 -0
  156. data/ext/stem_UTF_8_turkish.h +16 -0
  157. data/ext/stopwords.c +410 -0
  158. data/ext/store.c +698 -0
  159. data/ext/store.h +799 -0
  160. data/ext/symbol.c +10 -0
  161. data/ext/symbol.h +23 -0
  162. data/ext/term_vectors.c +73 -0
  163. data/ext/threading.h +31 -0
  164. data/ext/win32.h +62 -0
  165. data/lib/ferret.rb +30 -0
  166. data/lib/ferret/browser.rb +246 -0
  167. data/lib/ferret/browser/s/global.js +192 -0
  168. data/lib/ferret/browser/s/style.css +148 -0
  169. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  170. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  171. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  172. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  173. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  174. data/lib/ferret/browser/views/layout.rhtml +22 -0
  175. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  176. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  177. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  178. data/lib/ferret/browser/webrick.rb +14 -0
  179. data/lib/ferret/document.rb +130 -0
  180. data/lib/ferret/field_infos.rb +44 -0
  181. data/lib/ferret/field_symbol.rb +87 -0
  182. data/lib/ferret/index.rb +973 -0
  183. data/lib/ferret/number_tools.rb +157 -0
  184. data/lib/ferret/version.rb +3 -0
  185. data/setup.rb +1555 -0
  186. data/test/long_running/largefile/tc_largefile.rb +46 -0
  187. data/test/test_all.rb +5 -0
  188. data/test/test_helper.rb +29 -0
  189. data/test/test_installed.rb +1 -0
  190. data/test/threading/number_to_spoken.rb +132 -0
  191. data/test/threading/thread_safety_index_test.rb +88 -0
  192. data/test/threading/thread_safety_read_write_test.rb +73 -0
  193. data/test/threading/thread_safety_test.rb +133 -0
  194. data/test/unit/analysis/tc_analyzer.rb +550 -0
  195. data/test/unit/analysis/tc_token_stream.rb +653 -0
  196. data/test/unit/index/tc_index.rb +867 -0
  197. data/test/unit/index/tc_index_reader.rb +699 -0
  198. data/test/unit/index/tc_index_writer.rb +447 -0
  199. data/test/unit/index/th_doc.rb +332 -0
  200. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  201. data/test/unit/search/tc_filter.rb +156 -0
  202. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  203. data/test/unit/search/tc_index_searcher.rb +67 -0
  204. data/test/unit/search/tc_multi_searcher.rb +128 -0
  205. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  206. data/test/unit/search/tc_search_and_sort.rb +179 -0
  207. data/test/unit/search/tc_sort.rb +49 -0
  208. data/test/unit/search/tc_sort_field.rb +27 -0
  209. data/test/unit/search/tc_spans.rb +190 -0
  210. data/test/unit/search/tm_searcher.rb +436 -0
  211. data/test/unit/store/tc_fs_store.rb +115 -0
  212. data/test/unit/store/tc_ram_store.rb +35 -0
  213. data/test/unit/store/tm_store.rb +34 -0
  214. data/test/unit/store/tm_store_lock.rb +68 -0
  215. data/test/unit/tc_document.rb +81 -0
  216. data/test/unit/tc_field_symbol.rb +26 -0
  217. data/test/unit/ts_analysis.rb +2 -0
  218. data/test/unit/ts_index.rb +2 -0
  219. data/test/unit/ts_largefile.rb +4 -0
  220. data/test/unit/ts_query_parser.rb +2 -0
  221. data/test/unit/ts_search.rb +2 -0
  222. data/test/unit/ts_store.rb +2 -0
  223. data/test/unit/ts_utils.rb +2 -0
  224. data/test/unit/utils/tc_bit_vector.rb +295 -0
  225. data/test/unit/utils/tc_number_tools.rb +117 -0
  226. data/test/unit/utils/tc_priority_queue.rb +106 -0
  227. data/test/utils/content_generator.rb +226 -0
  228. metadata +319 -0
data/ext/q_prefix.c ADDED
@@ -0,0 +1,98 @@
1
+ #include <string.h>
2
+ #include "search.h"
3
+ #include "symbol.h"
4
+ #include "internal.h"
5
+
6
+ /****************************************************************************
7
+ *
8
+ * PrefixQuery
9
+ *
10
+ ****************************************************************************/
11
+
12
/* View a generic Query pointer as a pointer to a PrefixQuery. */
#define PfxQ(query) ((PrefixQuery *)(query))
13
+
14
+ static char *prq_to_s(Query *self, Symbol default_field)
15
+ {
16
+ char *buffer, *bptr;
17
+ const char *prefix = PfxQ(self)->prefix;
18
+ size_t plen = strlen(prefix);
19
+ size_t flen = sym_len(PfxQ(self)->field);
20
+
21
+ bptr = buffer = ALLOC_N(char, plen + flen + 35);
22
+
23
+ if (PfxQ(self)->field != default_field) {
24
+ bptr += sprintf(bptr, "%s:", S(PfxQ(self)->field));
25
+ }
26
+
27
+ bptr += sprintf(bptr, "%s*", prefix);
28
+ if (self->boost != 1.0) {
29
+ *bptr = '^';
30
+ dbl_to_s(++bptr, self->boost);
31
+ }
32
+
33
+ return buffer;
34
+ }
35
+
36
+ static Query *prq_rewrite(Query *self, IndexReader *ir)
37
+ {
38
+ const int field_num = fis_get_field_num(ir->fis, PfxQ(self)->field);
39
+ Query *volatile q = multi_tq_new_conf(PfxQ(self)->field,
40
+ MTQMaxTerms(self), 0.0);
41
+ q->boost = self->boost; /* set the boost */
42
+
43
+ if (field_num >= 0) {
44
+ const char *prefix = PfxQ(self)->prefix;
45
+ TermEnum *te = ir->terms_from(ir, field_num, prefix);
46
+ const char *term = te->curr_term;
47
+ size_t prefix_len = strlen(prefix);
48
+
49
+ TRY
50
+ do {
51
+ if (strncmp(term, prefix, prefix_len) != 0) {
52
+ break;
53
+ }
54
+ multi_tq_add_term(q, term); /* found a match */
55
+ } while (te->next(te));
56
+ XFINALLY
57
+ te->close(te);
58
+ XENDTRY
59
+ }
60
+
61
+ return q;
62
+ }
63
+
64
+ static void prq_destroy(Query *self)
65
+ {
66
+ free(PfxQ(self)->prefix);
67
+ q_destroy_i(self);
68
+ }
69
+
70
+ static unsigned long prq_hash(Query *self)
71
+ {
72
+ return sym_hash(PfxQ(self)->field) ^ str_hash(PfxQ(self)->prefix);
73
+ }
74
+
75
+ static int prq_eq(Query *self, Query *o)
76
+ {
77
+ return (strcmp(PfxQ(self)->prefix, PfxQ(o)->prefix) == 0)
78
+ && (PfxQ(self)->field == PfxQ(o)->field);
79
+ }
80
+
81
+ Query *prefixq_new(Symbol field, const char *prefix)
82
+ {
83
+ Query *self = q_new(PrefixQuery);
84
+
85
+ PfxQ(self)->field = field;
86
+ PfxQ(self)->prefix = estrdup(prefix);
87
+ MTQMaxTerms(self) = PREFIX_QUERY_MAX_TERMS;
88
+
89
+ self->type = PREFIX_QUERY;
90
+ self->rewrite = &prq_rewrite;
91
+ self->to_s = &prq_to_s;
92
+ self->hash = &prq_hash;
93
+ self->eq = &prq_eq;
94
+ self->destroy_i = &prq_destroy;
95
+ self->create_weight_i = &q_create_weight_unsup;
96
+
97
+ return self;
98
+ }
data/ext/q_range.c ADDED
@@ -0,0 +1,682 @@
1
+ #include <string.h>
2
+ #include "search.h"
3
+ #include "symbol.h"
4
+ #include "internal.h"
5
+
6
+ /*****************************************************************************
7
+ *
8
+ * Range
9
+ *
10
+ *****************************************************************************/
11
+
12
+ typedef struct Range
13
+ {
14
+ Symbol field;
15
+ char *lower_term;
16
+ char *upper_term;
17
+ bool include_lower : 1;
18
+ bool include_upper : 1;
19
+ } Range;
20
+
21
+ static char *range_to_s(Range *range, Symbol default_field, float boost)
22
+ {
23
+ char *buffer, *b;
24
+ size_t flen, llen, ulen;
25
+ const char *field = S(range->field);
26
+
27
+ flen = strlen(field);
28
+ llen = range->lower_term ? strlen(range->lower_term) : 0;
29
+ ulen = range->upper_term ? strlen(range->upper_term) : 0;
30
+ buffer = ALLOC_N(char, flen + llen + ulen + 40);
31
+ b = buffer;
32
+
33
+ if (default_field != range->field) {
34
+ memcpy(buffer, field, flen * sizeof(char));
35
+ b += flen;
36
+ *b = ':';
37
+ b++;
38
+ }
39
+
40
+ if (range->lower_term) {
41
+ *b = range->include_lower ? '[' : '{';
42
+ b++;
43
+ memcpy(b, range->lower_term, llen);
44
+ b += llen;
45
+ } else {
46
+ *b = '<';
47
+ b++;
48
+ }
49
+
50
+ if (range->upper_term && range->lower_term) {
51
+ *b = ' '; b++;
52
+ }
53
+
54
+ if (range->upper_term) {
55
+ memcpy(b, range->upper_term, ulen);
56
+ b += ulen;
57
+ *b = range->include_upper ? ']' : '}';
58
+ b++;
59
+ } else {
60
+ *b = '>';
61
+ b++;
62
+ }
63
+
64
+ *b = 0;
65
+ if (boost != 1.0) {
66
+ *b = '^';
67
+ dbl_to_s(b + 1, boost);
68
+ }
69
+ return buffer;
70
+ }
71
+
72
+ static void range_destroy(Range *range)
73
+ {
74
+ free(range->lower_term);
75
+ free(range->upper_term);
76
+ free(range);
77
+ }
78
+
79
+ static unsigned long range_hash(Range *filt)
80
+ {
81
+ return filt->include_lower | (filt->include_upper << 1)
82
+ | ((sym_hash(filt->field)
83
+ ^ (filt->lower_term ? str_hash(filt->lower_term) : 0)
84
+ ^ (filt->upper_term ? str_hash(filt->upper_term) : 0)) << 2);
85
+ }
86
+
87
/* NULL-tolerant string equality: two NULLs are equal, a NULL never equals a
 * non-NULL string, otherwise the contents are compared with strcmp(). */
static int str_eq(const char *s1, const char *s2)
{
    if (s1 == s2) {
        return 1;   /* same pointer, including both NULL */
    }
    if (!s1 || !s2) {
        return 0;   /* exactly one side is NULL */
    }
    return strcmp(s1, s2) == 0;
}
91
+
92
+ static int range_eq(Range *filt, Range *o)
93
+ {
94
+ return ((filt->field == o->field)
95
+ && str_eq(filt->lower_term, o->lower_term)
96
+ && str_eq(filt->upper_term, o->upper_term)
97
+ && (filt->include_lower == o->include_lower)
98
+ && (filt->include_upper == o->include_upper));
99
+ }
100
+
101
+ static Range *range_new(Symbol field, const char *lower_term,
102
+ const char *upper_term, bool include_lower,
103
+ bool include_upper)
104
+ {
105
+ Range *range;
106
+
107
+ if (!lower_term && !upper_term) {
108
+ RAISE(ARG_ERROR, "Nil bounds for range. A range must include either "
109
+ "lower bound or an upper bound");
110
+ }
111
+ if (include_lower && !lower_term) {
112
+ RAISE(ARG_ERROR, "Lower bound must be non-nil to be inclusive. That "
113
+ "is, if you specify :include_lower => true when you create a "
114
+ "range you must include a :lower_term");
115
+ }
116
+ if (include_upper && !upper_term) {
117
+ RAISE(ARG_ERROR, "Upper bound must be non-nil to be inclusive. That "
118
+ "is, if you specify :include_upper => true when you create a "
119
+ "range you must include a :upper_term");
120
+ }
121
+ if (upper_term && lower_term && (strcmp(upper_term, lower_term) < 0)) {
122
+ RAISE(ARG_ERROR, "Upper bound must be greater than lower bound. "
123
+ "\"%s\" < \"%s\"", upper_term, lower_term);
124
+ }
125
+
126
+ range = ALLOC(Range);
127
+
128
+ range->field = field;
129
+ range->lower_term = lower_term ? estrdup(lower_term) : NULL;
130
+ range->upper_term = upper_term ? estrdup(upper_term) : NULL;
131
+ range->include_lower = include_lower;
132
+ range->include_upper = include_upper;
133
+ return range;
134
+ }
135
+
136
+ static Range *trange_new(Symbol field, const char *lower_term,
137
+ const char *upper_term, bool include_lower,
138
+ bool include_upper)
139
+ {
140
+ Range *range;
141
+ int len;
142
+ double upper_num, lower_num;
143
+
144
+ if (!lower_term && !upper_term) {
145
+ RAISE(ARG_ERROR, "Nil bounds for range. A range must include either "
146
+ "lower bound or an upper bound");
147
+ }
148
+ if (include_lower && !lower_term) {
149
+ RAISE(ARG_ERROR, "Lower bound must be non-nil to be inclusive. That "
150
+ "is, if you specify :include_lower => true when you create a "
151
+ "range you must include a :lower_term");
152
+ }
153
+ if (include_upper && !upper_term) {
154
+ RAISE(ARG_ERROR, "Upper bound must be non-nil to be inclusive. That "
155
+ "is, if you specify :include_upper => true when you create a "
156
+ "range you must include a :upper_term");
157
+ }
158
+ if (upper_term && lower_term) {
159
+ if ((!lower_term ||
160
+ (sscanf(lower_term, "%lg%n", &lower_num, &len) &&
161
+ (int)strlen(lower_term) == len)) &&
162
+ (!upper_term ||
163
+ (sscanf(upper_term, "%lg%n", &upper_num, &len) &&
164
+ (int)strlen(upper_term) == len)))
165
+ {
166
+ if (upper_num < lower_num) {
167
+ RAISE(ARG_ERROR, "Upper bound must be greater than lower bound."
168
+ " numbers \"%lg\" < \"%lg\"", upper_num, lower_num);
169
+ }
170
+ }
171
+ else {
172
+ if (upper_term && lower_term &&
173
+ (strcmp(upper_term, lower_term) < 0)) {
174
+ RAISE(ARG_ERROR, "Upper bound must be greater than lower bound."
175
+ " \"%s\" < \"%s\"", upper_term, lower_term);
176
+ }
177
+ }
178
+ }
179
+
180
+ range = ALLOC(Range);
181
+
182
+ range->field = field;
183
+ range->lower_term = lower_term ? estrdup(lower_term) : NULL;
184
+ range->upper_term = upper_term ? estrdup(upper_term) : NULL;
185
+ range->include_lower = include_lower;
186
+ range->include_upper = include_upper;
187
+ return range;
188
+ }
189
+
190
+ /***************************************************************************
191
+ *
192
+ * RangeFilter
193
+ *
194
+ ***************************************************************************/
195
+
196
+ typedef struct RangeFilter
197
+ {
198
+ Filter super;
199
+ Range *range;
200
+ } RangeFilter;
201
+
202
/* View a generic Filter pointer as a pointer to a RangeFilter. */
#define RF(filt) ((RangeFilter *)(filt))
203
+
204
+ static void rfilt_destroy_i(Filter *filt)
205
+ {
206
+ range_destroy(RF(filt)->range);
207
+ filt_destroy_i(filt);
208
+ }
209
+
210
+ static char *rfilt_to_s(Filter *filt)
211
+ {
212
+ char *rstr = range_to_s(RF(filt)->range, NULL, 1.0);
213
+ char *rfstr = strfmt("RangeFilter< %s >", rstr);
214
+ free(rstr);
215
+ return rfstr;
216
+ }
217
+
218
+ static BitVector *rfilt_get_bv_i(Filter *filt, IndexReader *ir)
219
+ {
220
+ BitVector *bv = bv_new_capa(ir->max_doc(ir));
221
+ Range *range = RF(filt)->range;
222
+ FieldInfo *fi = fis_get_field(ir->fis, range->field);
223
+ /* the field info exists we need to add docs to the bit vector, otherwise
224
+ * we just return an empty bit vector */
225
+ if (fi) {
226
+ const char *lower_term =
227
+ range->lower_term ? range->lower_term : EMPTY_STRING;
228
+ const char *upper_term = range->upper_term;
229
+ const bool include_upper = range->include_upper;
230
+ const int field_num = fi->number;
231
+ char *term;
232
+ TermEnum* te;
233
+ TermDocEnum *tde;
234
+ bool check_lower;
235
+
236
+ te = ir->terms(ir, field_num);
237
+ if (te->skip_to(te, lower_term) == NULL) {
238
+ te->close(te);
239
+ return bv;
240
+ }
241
+
242
+ check_lower = !(range->include_lower || (lower_term == EMPTY_STRING));
243
+
244
+ tde = ir->term_docs(ir);
245
+ term = te->curr_term;
246
+ do {
247
+ if (!check_lower
248
+ || (strcmp(term, lower_term) > 0)) {
249
+ check_lower = false;
250
+ if (upper_term) {
251
+ int compare = strcmp(upper_term, term);
252
+ /* Break if upper term is greater than or equal to upper
253
+ * term and include_upper is false or ther term is fully
254
+ * greater than upper term. This is optimized so that only
255
+ * one check is done except in last check or two */
256
+ if ((compare <= 0)
257
+ && (!include_upper || (compare < 0))) {
258
+ break;
259
+ }
260
+ }
261
+ /* we have a good term, find the docs */
262
+ /* text is already pointing to term buffer text */
263
+ tde->seek_te(tde, te);
264
+ while (tde->next(tde)) {
265
+ bv_set(bv, tde->doc_num(tde));
266
+ /* printf("Setting %d\n", tde->doc_num(tde)); */
267
+ }
268
+ }
269
+ } while (te->next(te));
270
+
271
+ tde->close(tde);
272
+ te->close(te);
273
+ }
274
+
275
+ return bv;
276
+ }
277
+
278
+ static unsigned long rfilt_hash(Filter *filt)
279
+ {
280
+ return range_hash(RF(filt)->range);
281
+ }
282
+
283
+ static int rfilt_eq(Filter *filt, Filter *o)
284
+ {
285
+ return range_eq(RF(filt)->range, RF(o)->range);
286
+ }
287
+
288
+ Filter *rfilt_new(Symbol field,
289
+ const char *lower_term, const char *upper_term,
290
+ bool include_lower, bool include_upper)
291
+ {
292
+ Filter *filt = filt_new(RangeFilter);
293
+ RF(filt)->range = range_new(field, lower_term, upper_term,
294
+ include_lower, include_upper);
295
+
296
+ filt->get_bv_i = &rfilt_get_bv_i;
297
+ filt->hash = &rfilt_hash;
298
+ filt->eq = &rfilt_eq;
299
+ filt->to_s = &rfilt_to_s;
300
+ filt->destroy_i = &rfilt_destroy_i;
301
+ return filt;
302
+ }
303
+
304
+ /***************************************************************************
305
+ *
306
+ * TypedRangeFilter
307
+ *
308
+ ***************************************************************************/
309
+
310
+ static char *trfilt_to_s(Filter *filt)
311
+ {
312
+ char *rstr = range_to_s(RF(filt)->range, NULL, 1.0);
313
+ char *rfstr = strfmt("TypedRangeFilter< %s >", rstr);
314
+ free(rstr);
315
+ return rfstr;
316
+ }
317
+
318
/*
 * Which numeric comparisons a typed range has to perform. The value is a bit
 * set: one optional upper-bound bit (LE or LT) ORed with one optional
 * lower-bound bit (GE or GT). The combined names spell out every legal
 * pairing so that they can be used as switch labels.
 */
typedef enum {
    TRC_NONE  = 0x00,
    /* upper bound only */
    TRC_LE    = 0x01,
    TRC_LT    = 0x02,
    /* inclusive lower bound, alone or with an upper bound */
    TRC_GE    = 0x04,
    TRC_GE_LE = TRC_GE | TRC_LE,
    TRC_GE_LT = TRC_GE | TRC_LT,
    /* exclusive lower bound, alone or with an upper bound */
    TRC_GT    = 0x08,
    TRC_GT_LE = TRC_GT | TRC_LE,
    TRC_GT_LT = TRC_GT | TRC_LT
} TypedRangeCheck;
329
+
330
/*
 * Walk the term enumerator `te` from its current term and set, in `bv`, the
 * documents of every term that is entirely a number and satisfies `cond`.
 * `cond` is an expression over `num`, the parsed value of the current term.
 *
 * Expects these names in the enclosing scope: term, num, len, te, tde, bv.
 * The walk stops at the first term whose first character sorts after '9'.
 *
 * The sscanf() result is checked so that a term which fails to parse is
 * skipped; without the check `len` and `num` would keep the values left by
 * the previous term and could let a non-numeric term through.
 */
#define SET_DOCS(cond)\
do {\
    if (term[0] > '9') break; /* done */\
    if (sscanf(term, "%lg%n", &num, &len) == 1\
        && len == te->curr_term_len) { /* We have a number */\
        if (cond) {\
            tde->seek_te(tde, te);\
            while (tde->next(tde)) {\
                bv_set(bv, tde->doc_num(tde));\
            }\
        }\
    }\
} while (te->next(te))
343
+
344
+
345
+ static BitVector *trfilt_get_bv_i(Filter *filt, IndexReader *ir)
346
+ {
347
+ Range *range = RF(filt)->range;
348
+ double lnum = 0.0, unum = 0.0;
349
+ int len = 0;
350
+ const char *lt = range->lower_term;
351
+ const char *ut = range->upper_term;
352
+ if ((!lt || (sscanf(lt, "%lg%n", &lnum, &len) && (int)strlen(lt) == len)) &&
353
+ (!ut || (sscanf(ut, "%lg%n", &unum, &len) && (int)strlen(ut) == len)))
354
+ {
355
+ BitVector *bv = bv_new_capa(ir->max_doc(ir));
356
+ FieldInfo *fi = fis_get_field(ir->fis, range->field);
357
+ /* the field info exists we need to add docs to the bit vector,
358
+ * otherwise we just return an empty bit vector */
359
+ if (fi) {
360
+ const int field_num = fi->number;
361
+ char *term;
362
+ double num;
363
+ TermEnum* te;
364
+ TermDocEnum *tde;
365
+ TypedRangeCheck check = TRC_NONE;
366
+
367
+ te = ir->terms(ir, field_num);
368
+ if (te->skip_to(te, "+.") == NULL) {
369
+ te->close(te);
370
+ return bv;
371
+ }
372
+
373
+ tde = ir->term_docs(ir);
374
+ term = te->curr_term;
375
+
376
+ if (lt) {
377
+ check = range->include_lower ? TRC_GE : TRC_GT;
378
+ }
379
+ if (ut) {
380
+ check = (TypedRangeCheck)(check | (range->include_upper
381
+ ? TRC_LE
382
+ : TRC_LT));
383
+ }
384
+
385
+ switch(check) {
386
+ case TRC_LE:
387
+ SET_DOCS(num <= unum);
388
+ break;
389
+ case TRC_LT:
390
+ SET_DOCS(num < unum);
391
+ break;
392
+ case TRC_GE:
393
+ SET_DOCS(num >= lnum);
394
+ break;
395
+ case TRC_GE_LE:
396
+ SET_DOCS(num >= lnum && num <= unum);
397
+ break;
398
+ case TRC_GE_LT:
399
+ SET_DOCS(num >= lnum && num < unum);
400
+ break;
401
+ case TRC_GT:
402
+ SET_DOCS(num > lnum);
403
+ break;
404
+ case TRC_GT_LE:
405
+ SET_DOCS(num > lnum && num <= unum);
406
+ break;
407
+ case TRC_GT_LT:
408
+ SET_DOCS(num > lnum && num < unum);
409
+ break;
410
+ case TRC_NONE:
411
+ /* should never happen. Error should have been raised */
412
+ assert(false);
413
+ }
414
+ tde->close(tde);
415
+ te->close(te);
416
+ }
417
+
418
+ return bv;
419
+ }
420
+ else {
421
+ return rfilt_get_bv_i(filt, ir);
422
+ }
423
+ }
424
+
425
+ Filter *trfilt_new(Symbol field,
426
+ const char *lower_term, const char *upper_term,
427
+ bool include_lower, bool include_upper)
428
+ {
429
+ Filter *filt = filt_new(RangeFilter);
430
+ RF(filt)->range = trange_new(field, lower_term, upper_term,
431
+ include_lower, include_upper);
432
+
433
+ filt->get_bv_i = &trfilt_get_bv_i;
434
+ filt->hash = &rfilt_hash;
435
+ filt->eq = &rfilt_eq;
436
+ filt->to_s = &trfilt_to_s;
437
+ filt->destroy_i = &rfilt_destroy_i;
438
+ return filt;
439
+ }
440
+
441
+ /*****************************************************************************
442
+ *
443
+ * RangeQuery
444
+ *
445
+ *****************************************************************************/
446
+
447
+ #define RQ(query) ((RangeQuery *)(query))
448
+ typedef struct RangeQuery
449
+ {
450
+ Query f;
451
+ Range *range;
452
+ } RangeQuery;
453
+
454
+ static char *rq_to_s(Query *self, Symbol field)
455
+ {
456
+ return range_to_s(RQ(self)->range, field, self->boost);
457
+ }
458
+
459
+ static void rq_destroy(Query *self)
460
+ {
461
+ range_destroy(RQ(self)->range);
462
+ q_destroy_i(self);
463
+ }
464
+
465
+ static MatchVector *rq_get_matchv_i(Query *self, MatchVector *mv,
466
+ TermVector *tv)
467
+ {
468
+ Range *range = RQ(((ConstantScoreQuery *)self)->original)->range;
469
+ if (tv->field == range->field) {
470
+ const int term_cnt = tv->term_cnt;
471
+ int i, j;
472
+ char *upper_text = range->upper_term;
473
+ char *lower_text = range->lower_term;
474
+ int upper_limit = range->include_upper ? 1 : 0;
475
+
476
+ i = lower_text ? tv_scan_to_term_index(tv, lower_text) : 0;
477
+ if (i < term_cnt && !range->include_lower && lower_text
478
+ && 0 == strcmp(lower_text, tv->terms[i].text)) {
479
+ i++;
480
+ }
481
+
482
+ for (; i < term_cnt; i++) {
483
+ TVTerm *tv_term = &(tv->terms[i]);
484
+ char *text = tv_term->text;
485
+ const int tv_term_freq = tv_term->freq;
486
+ if (upper_text && strcmp(text, upper_text) >= upper_limit) {
487
+ break;
488
+ }
489
+ for (j = 0; j < tv_term_freq; j++) {
490
+ int pos = tv_term->positions[j];
491
+ matchv_add(mv, pos, pos);
492
+ }
493
+ }
494
+ }
495
+ return mv;
496
+ }
497
+
498
+ static Query *rq_rewrite(Query *self, IndexReader *ir)
499
+ {
500
+ Query *csq;
501
+ Range *r = RQ(self)->range;
502
+ Filter *filter = rfilt_new(r->field, r->lower_term, r->upper_term,
503
+ r->include_lower, r->include_upper);
504
+ (void)ir;
505
+ csq = csq_new_nr(filter);
506
+ ((ConstantScoreQuery *)csq)->original = self;
507
+ csq->get_matchv_i = &rq_get_matchv_i;
508
+ return (Query *)csq;
509
+ }
510
+
511
+ static unsigned long rq_hash(Query *self)
512
+ {
513
+ return range_hash(RQ(self)->range);
514
+ }
515
+
516
+ static int rq_eq(Query *self, Query *o)
517
+ {
518
+ return range_eq(RQ(self)->range, RQ(o)->range);
519
+ }
520
+
521
+ Query *rq_new_less(Symbol field, const char *upper_term,
522
+ bool include_upper)
523
+ {
524
+ return rq_new(field, NULL, upper_term, false, include_upper);
525
+ }
526
+
527
+ Query *rq_new_more(Symbol field, const char *lower_term,
528
+ bool include_lower)
529
+ {
530
+ return rq_new(field, lower_term, NULL, include_lower, false);
531
+ }
532
+
533
+ Query *rq_new(Symbol field, const char *lower_term,
534
+ const char *upper_term, bool include_lower, bool include_upper)
535
+ {
536
+ Query *self;
537
+ Range *range = range_new(field, lower_term, upper_term,
538
+ include_lower, include_upper);
539
+ self = q_new(RangeQuery);
540
+ RQ(self)->range = range;
541
+
542
+ self->type = RANGE_QUERY;
543
+ self->rewrite = &rq_rewrite;
544
+ self->to_s = &rq_to_s;
545
+ self->hash = &rq_hash;
546
+ self->eq = &rq_eq;
547
+ self->destroy_i = &rq_destroy;
548
+ self->create_weight_i = &q_create_weight_unsup;
549
+ return self;
550
+ }
551
+
552
+ /*****************************************************************************
553
+ *
554
+ * TypedRangeQuery
555
+ *
556
+ *****************************************************************************/
557
+
558
/*
 * Walk the terms of the term vector `tv` from last to first and add to `mv`
 * the positions of every term that is entirely a number and satisfies
 * `cond`. `cond` is an expression over `num`, the parsed value of the term.
 *
 * Expects these names in the enclosing scope: i, j, len, tv, mv.
 *
 * The sscanf() result is checked so that a term which fails to parse is
 * skipped; without the check `len` would keep the value left by an earlier
 * term and `num` would be read uninitialised.
 */
#define SET_TERMS(cond)\
for (i = tv->term_cnt - 1; i >= 0; i--) {\
    TVTerm *tv_term = &(tv->terms[i]);\
    char *text = tv_term->text;\
    double num = 0.0;\
    if (sscanf(text, "%lg%n", &num, &len) == 1\
        && (int)strlen(text) == len) { /* We have a number */\
        if (cond) {\
            const int tv_term_freq = tv_term->freq;\
            for (j = 0; j < tv_term_freq; j++) {\
                int pos = tv_term->positions[j];\
                matchv_add(mv, pos, pos);\
            }\
        }\
    }\
}
574
+
575
+ static MatchVector *trq_get_matchv_i(Query *self, MatchVector *mv,
576
+ TermVector *tv)
577
+ {
578
+ Range *range = RQ(((ConstantScoreQuery *)self)->original)->range;
579
+ if (tv->field == range->field) {
580
+ double lnum = 0.0, unum = 0.0;
581
+ int len = 0;
582
+ const char *lt = range->lower_term;
583
+ const char *ut = range->upper_term;
584
+ if ((!lt
585
+ || (sscanf(lt,"%lg%n",&lnum,&len) && (int)strlen(lt) == len))
586
+ &&
587
+ (!ut
588
+ || (sscanf(ut,"%lg%n",&unum,&len) && (int)strlen(ut) == len)))
589
+ {
590
+ TypedRangeCheck check = TRC_NONE;
591
+ int i = 0, j = 0;
592
+
593
+ if (lt) {
594
+ check = range->include_lower ? TRC_GE : TRC_GT;
595
+ }
596
+ if (ut) {
597
+ check = (TypedRangeCheck)(check | (range->include_upper
598
+ ? TRC_LE
599
+ : TRC_LT));
600
+ }
601
+
602
+ switch(check) {
603
+ case TRC_LE:
604
+ SET_TERMS(num <= unum);
605
+ break;
606
+ case TRC_LT:
607
+ SET_TERMS(num < unum);
608
+ break;
609
+ case TRC_GE:
610
+ SET_TERMS(num >= lnum);
611
+ break;
612
+ case TRC_GE_LE:
613
+ SET_TERMS(num >= lnum && num <= unum);
614
+ break;
615
+ case TRC_GE_LT:
616
+ SET_TERMS(num >= lnum && num < unum);
617
+ break;
618
+ case TRC_GT:
619
+ SET_TERMS(num > lnum);
620
+ break;
621
+ case TRC_GT_LE:
622
+ SET_TERMS(num > lnum && num <= unum);
623
+ break;
624
+ case TRC_GT_LT:
625
+ SET_TERMS(num > lnum && num < unum);
626
+ break;
627
+ case TRC_NONE:
628
+ /* should never happen. Error should have been raised */
629
+ assert(false);
630
+ }
631
+
632
+ }
633
+ else {
634
+ return rq_get_matchv_i(self, mv, tv);
635
+ }
636
+ }
637
+ return mv;
638
+ }
639
+
640
+ static Query *trq_rewrite(Query *self, IndexReader *ir)
641
+ {
642
+ Query *csq;
643
+ Range *r = RQ(self)->range;
644
+ Filter *filter = trfilt_new(r->field, r->lower_term, r->upper_term,
645
+ r->include_lower, r->include_upper);
646
+ (void)ir;
647
+ csq = csq_new_nr(filter);
648
+ ((ConstantScoreQuery *)csq)->original = self;
649
+ csq->get_matchv_i = &trq_get_matchv_i;
650
+ return (Query *)csq;
651
+ }
652
+
653
+ Query *trq_new_less(Symbol field, const char *upper_term,
654
+ bool include_upper)
655
+ {
656
+ return trq_new(field, NULL, upper_term, false, include_upper);
657
+ }
658
+
659
+ Query *trq_new_more(Symbol field, const char *lower_term,
660
+ bool include_lower)
661
+ {
662
+ return trq_new(field, lower_term, NULL, include_lower, false);
663
+ }
664
+
665
+ Query *trq_new(Symbol field, const char *lower_term,
666
+ const char *upper_term, bool include_lower, bool include_upper)
667
+ {
668
+ Query *self;
669
+ Range *range = trange_new(field, lower_term, upper_term,
670
+ include_lower, include_upper);
671
+ self = q_new(RangeQuery);
672
+ RQ(self)->range = range;
673
+
674
+ self->type = TYPED_RANGE_QUERY;
675
+ self->rewrite = &trq_rewrite;
676
+ self->to_s = &rq_to_s;
677
+ self->hash = &rq_hash;
678
+ self->eq = &rq_eq;
679
+ self->destroy_i = &rq_destroy;
680
+ self->create_weight_i = &q_create_weight_unsup;
681
+ return self;
682
+ }