sdsykes-ferret 0.11.6.19

Sign up to get free protection for your applications and to get access to all the features.
Files changed (195) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +102 -0
  4. data/Rakefile +338 -0
  5. data/TODO +17 -0
  6. data/TUTORIAL +231 -0
  7. data/bin/ferret-browser +79 -0
  8. data/ext/analysis.c +1555 -0
  9. data/ext/analysis.h +219 -0
  10. data/ext/api.c +69 -0
  11. data/ext/api.h +27 -0
  12. data/ext/array.c +123 -0
  13. data/ext/array.h +53 -0
  14. data/ext/bitvector.c +540 -0
  15. data/ext/bitvector.h +272 -0
  16. data/ext/compound_io.c +383 -0
  17. data/ext/config.h +42 -0
  18. data/ext/document.c +156 -0
  19. data/ext/document.h +53 -0
  20. data/ext/except.c +120 -0
  21. data/ext/except.h +168 -0
  22. data/ext/extconf.rb +14 -0
  23. data/ext/ferret.c +402 -0
  24. data/ext/ferret.h +91 -0
  25. data/ext/filter.c +156 -0
  26. data/ext/fs_store.c +483 -0
  27. data/ext/global.c +418 -0
  28. data/ext/global.h +117 -0
  29. data/ext/hash.c +567 -0
  30. data/ext/hash.h +473 -0
  31. data/ext/hashset.c +170 -0
  32. data/ext/hashset.h +187 -0
  33. data/ext/header.h +58 -0
  34. data/ext/helper.c +62 -0
  35. data/ext/helper.h +13 -0
  36. data/ext/inc/lang.h +48 -0
  37. data/ext/inc/threading.h +31 -0
  38. data/ext/index.c +6425 -0
  39. data/ext/index.h +961 -0
  40. data/ext/lang.h +66 -0
  41. data/ext/libstemmer.c +92 -0
  42. data/ext/libstemmer.h +79 -0
  43. data/ext/mempool.c +87 -0
  44. data/ext/mempool.h +35 -0
  45. data/ext/modules.h +162 -0
  46. data/ext/multimapper.c +310 -0
  47. data/ext/multimapper.h +51 -0
  48. data/ext/posh.c +1006 -0
  49. data/ext/posh.h +1007 -0
  50. data/ext/priorityqueue.c +151 -0
  51. data/ext/priorityqueue.h +143 -0
  52. data/ext/q_boolean.c +1608 -0
  53. data/ext/q_const_score.c +161 -0
  54. data/ext/q_filtered_query.c +209 -0
  55. data/ext/q_fuzzy.c +268 -0
  56. data/ext/q_match_all.c +148 -0
  57. data/ext/q_multi_term.c +677 -0
  58. data/ext/q_parser.c +2825 -0
  59. data/ext/q_phrase.c +1126 -0
  60. data/ext/q_prefix.c +100 -0
  61. data/ext/q_range.c +350 -0
  62. data/ext/q_span.c +2402 -0
  63. data/ext/q_term.c +337 -0
  64. data/ext/q_wildcard.c +171 -0
  65. data/ext/r_analysis.c +2575 -0
  66. data/ext/r_index.c +3472 -0
  67. data/ext/r_qparser.c +585 -0
  68. data/ext/r_search.c +4105 -0
  69. data/ext/r_store.c +513 -0
  70. data/ext/r_utils.c +963 -0
  71. data/ext/ram_store.c +471 -0
  72. data/ext/search.c +1741 -0
  73. data/ext/search.h +885 -0
  74. data/ext/similarity.c +150 -0
  75. data/ext/similarity.h +82 -0
  76. data/ext/sort.c +983 -0
  77. data/ext/stem_ISO_8859_1_danish.c +338 -0
  78. data/ext/stem_ISO_8859_1_danish.h +16 -0
  79. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  80. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  81. data/ext/stem_ISO_8859_1_english.c +1156 -0
  82. data/ext/stem_ISO_8859_1_english.h +16 -0
  83. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  84. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  85. data/ext/stem_ISO_8859_1_french.c +1276 -0
  86. data/ext/stem_ISO_8859_1_french.h +16 -0
  87. data/ext/stem_ISO_8859_1_german.c +512 -0
  88. data/ext/stem_ISO_8859_1_german.h +16 -0
  89. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  90. data/ext/stem_ISO_8859_1_italian.h +16 -0
  91. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  92. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  93. data/ext/stem_ISO_8859_1_porter.c +776 -0
  94. data/ext/stem_ISO_8859_1_porter.h +16 -0
  95. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  96. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  97. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  98. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  99. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  100. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  101. data/ext/stem_KOI8_R_russian.c +701 -0
  102. data/ext/stem_KOI8_R_russian.h +16 -0
  103. data/ext/stem_UTF_8_danish.c +344 -0
  104. data/ext/stem_UTF_8_danish.h +16 -0
  105. data/ext/stem_UTF_8_dutch.c +653 -0
  106. data/ext/stem_UTF_8_dutch.h +16 -0
  107. data/ext/stem_UTF_8_english.c +1176 -0
  108. data/ext/stem_UTF_8_english.h +16 -0
  109. data/ext/stem_UTF_8_finnish.c +808 -0
  110. data/ext/stem_UTF_8_finnish.h +16 -0
  111. data/ext/stem_UTF_8_french.c +1296 -0
  112. data/ext/stem_UTF_8_french.h +16 -0
  113. data/ext/stem_UTF_8_german.c +526 -0
  114. data/ext/stem_UTF_8_german.h +16 -0
  115. data/ext/stem_UTF_8_italian.c +1113 -0
  116. data/ext/stem_UTF_8_italian.h +16 -0
  117. data/ext/stem_UTF_8_norwegian.c +302 -0
  118. data/ext/stem_UTF_8_norwegian.h +16 -0
  119. data/ext/stem_UTF_8_porter.c +794 -0
  120. data/ext/stem_UTF_8_porter.h +16 -0
  121. data/ext/stem_UTF_8_portuguese.c +1055 -0
  122. data/ext/stem_UTF_8_portuguese.h +16 -0
  123. data/ext/stem_UTF_8_russian.c +709 -0
  124. data/ext/stem_UTF_8_russian.h +16 -0
  125. data/ext/stem_UTF_8_spanish.c +1137 -0
  126. data/ext/stem_UTF_8_spanish.h +16 -0
  127. data/ext/stem_UTF_8_swedish.c +313 -0
  128. data/ext/stem_UTF_8_swedish.h +16 -0
  129. data/ext/stopwords.c +401 -0
  130. data/ext/store.c +692 -0
  131. data/ext/store.h +777 -0
  132. data/ext/term_vectors.c +352 -0
  133. data/ext/threading.h +31 -0
  134. data/ext/utilities.c +446 -0
  135. data/ext/win32.h +54 -0
  136. data/lib/ferret.rb +29 -0
  137. data/lib/ferret/browser.rb +246 -0
  138. data/lib/ferret/browser/s/global.js +192 -0
  139. data/lib/ferret/browser/s/style.css +148 -0
  140. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  141. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  142. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  143. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  144. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  145. data/lib/ferret/browser/views/layout.rhtml +22 -0
  146. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  147. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  148. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  149. data/lib/ferret/browser/webrick.rb +14 -0
  150. data/lib/ferret/document.rb +130 -0
  151. data/lib/ferret/field_infos.rb +44 -0
  152. data/lib/ferret/index.rb +786 -0
  153. data/lib/ferret/number_tools.rb +157 -0
  154. data/lib/ferret_version.rb +3 -0
  155. data/setup.rb +1555 -0
  156. data/test/test_all.rb +5 -0
  157. data/test/test_helper.rb +24 -0
  158. data/test/threading/number_to_spoken.rb +132 -0
  159. data/test/threading/thread_safety_index_test.rb +79 -0
  160. data/test/threading/thread_safety_read_write_test.rb +76 -0
  161. data/test/threading/thread_safety_test.rb +133 -0
  162. data/test/unit/analysis/tc_analyzer.rb +548 -0
  163. data/test/unit/analysis/tc_token_stream.rb +646 -0
  164. data/test/unit/index/tc_index.rb +762 -0
  165. data/test/unit/index/tc_index_reader.rb +699 -0
  166. data/test/unit/index/tc_index_writer.rb +437 -0
  167. data/test/unit/index/th_doc.rb +315 -0
  168. data/test/unit/largefile/tc_largefile.rb +46 -0
  169. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  170. data/test/unit/search/tc_filter.rb +135 -0
  171. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  172. data/test/unit/search/tc_index_searcher.rb +61 -0
  173. data/test/unit/search/tc_multi_searcher.rb +128 -0
  174. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  175. data/test/unit/search/tc_search_and_sort.rb +179 -0
  176. data/test/unit/search/tc_sort.rb +49 -0
  177. data/test/unit/search/tc_sort_field.rb +27 -0
  178. data/test/unit/search/tc_spans.rb +190 -0
  179. data/test/unit/search/tm_searcher.rb +384 -0
  180. data/test/unit/store/tc_fs_store.rb +77 -0
  181. data/test/unit/store/tc_ram_store.rb +35 -0
  182. data/test/unit/store/tm_store.rb +34 -0
  183. data/test/unit/store/tm_store_lock.rb +68 -0
  184. data/test/unit/tc_document.rb +81 -0
  185. data/test/unit/ts_analysis.rb +2 -0
  186. data/test/unit/ts_index.rb +2 -0
  187. data/test/unit/ts_largefile.rb +4 -0
  188. data/test/unit/ts_query_parser.rb +2 -0
  189. data/test/unit/ts_search.rb +2 -0
  190. data/test/unit/ts_store.rb +2 -0
  191. data/test/unit/ts_utils.rb +2 -0
  192. data/test/unit/utils/tc_bit_vector.rb +295 -0
  193. data/test/unit/utils/tc_number_tools.rb +117 -0
  194. data/test/unit/utils/tc_priority_queue.rb +106 -0
  195. metadata +285 -0
@@ -0,0 +1,963 @@
1
+ #include "ferret.h"
2
+ #include "bitvector.h"
3
+
4
+ /*****************
5
+ *** BitVector ***
6
+ *****************/
7
+ static VALUE cBitVector;
8
+
9
+ static void
10
+ frt_bv_free(void *p)
11
+ {
12
+ object_del(p);
13
+ bv_destroy((BitVector *)p);
14
+ }
15
+
16
+ static VALUE
17
+ frt_bv_alloc(VALUE klass)
18
+ {
19
+ BitVector *bv = bv_new();
20
+ VALUE rbv = Data_Wrap_Struct(klass, NULL, &frt_bv_free, bv);
21
+ object_add(bv, rbv);
22
+ return rbv;
23
+ }
24
+
25
+ #define GET_BV(bv, self) Data_Get_Struct(self, BitVector, bv)
26
+
27
+ VALUE
28
+ frt_get_bv(BitVector *bv)
29
+ {
30
+ VALUE rbv;
31
+ if ((rbv = object_get(bv)) == Qnil) {
32
+ rbv = Data_Wrap_Struct(cBitVector, NULL, &frt_bv_free, bv);
33
+ REF(bv);
34
+ object_add(bv, rbv);
35
+ }
36
+ return rbv;
37
+ }
38
+
39
+ /*
40
+ * call-seq:
41
+ * BitVector.new() -> new_bv
42
+ *
43
+ * Returns a new empty bit vector object
44
+ */
45
+ static VALUE
46
+ frt_bv_init(VALUE self)
47
+ {
48
+ return self;
49
+ }
50
+
51
+ /*
52
+ * call-seq:
53
+ * bv[i] = bool -> bool
54
+ *
55
+ * Set the bit at _i_ to *val* (+true+ or
56
+ * +false+).
57
+ */
58
+ VALUE
59
+ frt_bv_set(VALUE self, VALUE rindex, VALUE rstate)
60
+ {
61
+ BitVector *bv;
62
+ int index = FIX2INT(rindex);
63
+ GET_BV(bv, self);
64
+ if (index < 0) {
65
+ rb_raise(rb_eIndexError, "%d < 0", index);
66
+ }
67
+ if (RTEST(rstate)) {
68
+ bv_set(bv, index);
69
+ }
70
+ else {
71
+ bv_unset(bv, index);
72
+ }
73
+
74
+ return rstate;
75
+ }
76
+
77
+ /*
78
+ * call-seq:
79
+ * bv.set(i) -> self
80
+ *
81
+ * Set the bit at _i_ to *on* (+true+)
82
+ */
83
+ VALUE
84
+ frt_bv_set_on(VALUE self, VALUE rindex)
85
+ {
86
+ frt_bv_set(self, rindex, Qtrue);
87
+ return self;
88
+ }
89
+
90
+ /*
91
+ * call-seq:
92
+ * bv.unset(i) -> self
93
+ *
94
+ * Set the bit at _i_ to *off* (+false+)
95
+ */
96
+ VALUE
97
+ frt_bv_set_off(VALUE self, VALUE rindex)
98
+ {
99
+ frt_bv_set(self, rindex, Qfalse);
100
+ return self;
101
+ }
102
+
103
+ /*
104
+ * call-seq:
105
+ * bv.get(i) -> bool
106
+ * bv[i] -> bool
107
+ *
108
+ * Get the bit value at _i_
109
+ */
110
+ VALUE
111
+ frt_bv_get(VALUE self, VALUE rindex)
112
+ {
113
+ BitVector *bv;
114
+ int index = FIX2INT(rindex);
115
+ GET_BV(bv, self);
116
+ if (index < 0) {
117
+ rb_raise(rb_eIndexError, "%d < 0", index);
118
+ }
119
+
120
+ return bv_get(bv, index) ? Qtrue : Qfalse;
121
+ }
122
+
123
+ /*
124
+ * call-seq:
125
+ * bv.count -> bit_count
126
+ *
127
+ * Count the number of bits set in the bit vector. If the bit vector has been
128
+ * negated using +#not+ then count the number of unset bits
129
+ * instead.
130
+ */
131
+ VALUE
132
+ frt_bv_count(VALUE self)
133
+ {
134
+ BitVector *bv;
135
+ GET_BV(bv, self);
136
+ return INT2FIX(bv->count);
137
+ }
138
+
139
+ /*
140
+ * call-seq:
141
+ * bv.clear -> self
142
+ *
143
+ * Clears all set bits in the bit vector. Negated bit vectors will still have
144
+ * all bits set to *off*.
145
+ */
146
+ VALUE
147
+ frt_bv_clear(VALUE self)
148
+ {
149
+ BitVector *bv;
150
+ GET_BV(bv, self);
151
+ bv_clear(bv);
152
+ bv_scan_reset(bv);
153
+ return self;
154
+ }
155
+
156
+ /*
157
+ * call-seq:
158
+ * bv1 == bv2 -> bool
159
+ * bv1 != bv2 -> bool
160
+ * bv1.eql?(bv2) -> bool
161
+ *
162
+ * Compares two bit vectors and returns true if both bit vectors have the same
163
+ * bits set.
164
+ */
165
+ VALUE
166
+ frt_bv_eql(VALUE self, VALUE other)
167
+ {
168
+ BitVector *bv1, *bv2;
169
+ GET_BV(bv1, self);
170
+ GET_BV(bv2, other);
171
+ return bv_eq(bv1, bv2) ? Qtrue : Qfalse;
172
+ }
173
+
174
+ /*
175
+ * call-seq:
176
+ * bv.hash -> int
177
+ *
178
+ * Used to store bit vectors in Hashes. Especially useful if you want to
179
+ * cache them.
180
+ */
181
+ VALUE
182
+ frt_bv_hash(VALUE self)
183
+ {
184
+ BitVector *bv;
185
+ GET_BV(bv, self);
186
+ return LONG2NUM(bv_hash(bv));
187
+ }
188
+
189
+ /*
190
+ * call-seq:
191
+ * bv1 & bv2 -> anded_bv
192
+ * bv1.and(bv2) -> anded_bv
193
+ *
194
+ * Perform a boolean _and_ operation on +bv1+ and
195
+ * +bv2+
196
+ */
197
+ VALUE
198
+ frt_bv_and(VALUE self, VALUE other)
199
+ {
200
+ BitVector *bv1, *bv2;
201
+ GET_BV(bv1, self);
202
+ GET_BV(bv2, other);
203
+ return Data_Wrap_Struct(cBitVector, NULL, &bv_destroy, bv_and(bv1, bv2));
204
+ }
205
+
206
+ /*
207
+ * call-seq:
208
+ * bv1.and!(bv2) -> self
209
+ *
210
+ * Perform a boolean _and_ operation on +bv1+ and
211
+ * +bv2+ in place on +bv1+
212
+ */
213
+ VALUE
214
+ frt_bv_and_x(VALUE self, VALUE other)
215
+ {
216
+ BitVector *bv1, *bv2;
217
+ GET_BV(bv1, self);
218
+ GET_BV(bv2, other);
219
+ bv_and_x(bv1, bv2);
220
+ return self;
221
+ }
222
+
223
+ /*
224
+ * call-seq:
225
+ * bv1 | bv2 -> ored_bv
226
+ * bv1.or(bv2) -> ored_bv
227
+ *
228
+ * Perform a boolean _or_ operation on +bv1+ and
229
+ * +bv2+
230
+ */
231
+ VALUE
232
+ frt_bv_or(VALUE self, VALUE other)
233
+ {
234
+ BitVector *bv1, *bv2;
235
+ GET_BV(bv1, self);
236
+ GET_BV(bv2, other);
237
+ return Data_Wrap_Struct(cBitVector, NULL, &bv_destroy, bv_or(bv1, bv2));
238
+ }
239
+
240
+ /*
241
+ * call-seq:
242
+ * bv1.or!(bv2) -> self
243
+ *
244
+ * Perform a boolean _or_ operation on +bv1+ and
245
+ * +bv2+ in place on +bv1+
246
+ */
247
+ VALUE
248
+ frt_bv_or_x(VALUE self, VALUE other)
249
+ {
250
+ BitVector *bv1, *bv2;
251
+ GET_BV(bv1, self);
252
+ GET_BV(bv2, other);
253
+ bv_or_x(bv1, bv2);
254
+ return self;
255
+ }
256
+
257
+ /*
258
+ * call-seq:
259
+ * bv1 ^ bv2 -> xored_bv
260
+ * bv1.xor(bv2) -> xored_bv
261
+ *
262
+ * Perform a boolean _xor_ operation on +bv1+ and
263
+ * +bv2+
264
+ */
265
+ VALUE
266
+ frt_bv_xor(VALUE self, VALUE other)
267
+ {
268
+ BitVector *bv1, *bv2;
269
+ GET_BV(bv1, self);
270
+ GET_BV(bv2, other);
271
+ return Data_Wrap_Struct(cBitVector, NULL, &bv_destroy, bv_xor(bv1, bv2));
272
+ }
273
+
274
+ /*
275
+ * call-seq:
276
+ * bv1.xor!(bv2) -> self
277
+ *
278
+ * Perform a boolean _xor_ operation on +bv1+ and
279
+ * +bv2+ in place on +bv1+
280
+ */
281
+ VALUE
282
+ frt_bv_xor_x(VALUE self, VALUE other)
283
+ {
284
+ BitVector *bv1, *bv2;
285
+ GET_BV(bv1, self);
286
+ GET_BV(bv2, other);
287
+ bv_xor_x(bv1, bv2);
288
+ return self;
289
+ }
290
+
291
+ /*
292
+ * call-seq:
293
+ * ~bv -> bv
294
+ * bv.not -> bv
295
+ *
296
+ * Perform a boolean _not_ operation on +bv+
297
+ * */
298
+ VALUE
299
+ frt_bv_not(VALUE self)
300
+ {
301
+ BitVector *bv;
302
+ GET_BV(bv, self);
303
+ return Data_Wrap_Struct(cBitVector, NULL, &bv_destroy, bv_not(bv));
304
+ }
305
+
306
+ /*
307
+ * call-seq:
308
+ * bv.not! -> self
309
+ *
310
+ * Perform a boolean _not_ operation on +bv+ in-place
311
+ */
312
+ VALUE
313
+ frt_bv_not_x(VALUE self)
314
+ {
315
+ BitVector *bv;
316
+ GET_BV(bv, self);
317
+ bv_not_x(bv);
318
+ return self;
319
+ }
320
+
321
+ /*
322
+ * call-seq:
323
+ * bv.reset_scan -> self
324
+ *
325
+ * Resets the BitVector ready for scanning. You should call this method
326
+ * before calling +#next+ or +#next_unset+. It isn't
327
+ * necessary for the other scan methods or for the +#each+ method.
328
+ */
329
+ VALUE
330
+ frt_bv_reset_scan(VALUE self)
331
+ {
332
+ BitVector *bv;
333
+ GET_BV(bv, self);
334
+ bv_scan_reset(bv);
335
+ return self;
336
+ }
337
+
338
+ /*
339
+ * call-seq:
340
+ * bv.next -> bit_num
341
+ *
342
+ * Returns the next set bit in the bit vector scanning from low order to high
343
+ * order. You should call +#reset_scan+ before calling this method
344
+ * if you want to scan from the beginning. It is automatically reset when you
345
+ * first create the bit vector.
346
+ */
347
+ VALUE
348
+ frt_bv_next(VALUE self)
349
+ {
350
+ BitVector *bv;
351
+ GET_BV(bv, self);
352
+ return INT2FIX(bv_scan_next(bv));
353
+ }
354
+
355
+ /*
356
+ * call-seq:
357
+ * bv.next_unset -> bit_num
358
+ *
359
+ * Returns the next unset bit in the bit vector scanning from low order to
360
+ * high order. This method should only be called on bit vectors which have
361
+ * been flipped (negated). You should call +#reset_scan+ before
362
+ * calling this method if you want to scan from the beginning. It is
363
+ * automatically reset when you first create the bit vector.
364
+ */
365
+ VALUE
366
+ frt_bv_next_unset(VALUE self)
367
+ {
368
+ BitVector *bv;
369
+ GET_BV(bv, self);
370
+ return INT2FIX(bv_scan_next_unset(bv));
371
+ }
372
+
373
+ /*
374
+ * call-seq:
375
+ * bv.next_from(from) -> bit_num
376
+ *
377
+ * Returns the next set bit in the bit vector scanning from low order to
378
+ * high order and starting at +from+. The scan is inclusive so if
379
+ * +from+ is equal to 10 and +bv[10]+ is set it will
380
+ * return the number 10. If the bit vector has been negated then you should
381
+ * use the +#next_unset_from+ method.
382
+ */
383
+ VALUE
384
+ frt_bv_next_from(VALUE self, VALUE rfrom)
385
+ {
386
+ BitVector *bv;
387
+ int from = FIX2INT(rfrom);
388
+ GET_BV(bv, self);
389
+ if (from < 0) {
390
+ from = 0;
391
+ }
392
+ return INT2FIX(bv_scan_next_from(bv, from));
393
+ }
394
+
395
+ /*
396
+ * call-seq:
397
+ * bv.next_unset_from(from) -> bit_num
398
+ *
399
+ * Returns the next unset bit in the bit vector scanning from low order to
400
+ * high order and starting at +from+. The scan is inclusive so if
401
+ * +from+ is equal to 10 and +bv[10]+ is unset it will
402
+ * return the number 10. If the bit vector has not been negated then you
403
+ * should use the +#next_from+ method.
404
+ */
405
+ VALUE
406
+ frt_bv_next_unset_from(VALUE self, VALUE rfrom)
407
+ {
408
+ BitVector *bv;
409
+ int from = FIX2INT(rfrom);
410
+ GET_BV(bv, self);
411
+ if (from < 0) {
412
+ from = 0;
413
+ }
414
+ return INT2FIX(bv_scan_next_unset_from(bv, from));
415
+ }
416
+
417
+ /*
418
+ * call-seq:
419
+ * bv.each { |bit_num| }
420
+ *
421
+ * Iterate through all the set bits in the bit vector yielding each one in
422
+ * order
423
+ */
424
+ VALUE
425
+ frt_bv_each(VALUE self)
426
+ {
427
+ BitVector *bv;
428
+ int bit;
429
+ GET_BV(bv, self);
430
+ bv_scan_reset(bv);
431
+ if (bv->extends_as_ones) {
432
+ while ((bit = bv_scan_next_unset(bv)) >= 0) {
433
+ rb_yield(INT2FIX(bit));
434
+ }
435
+ }
436
+ else {
437
+ while ((bit = bv_scan_next(bv)) >= 0) {
438
+ rb_yield(INT2FIX(bit));
439
+ }
440
+ }
441
+ return self;
442
+ }
443
+
444
+ /*
445
+ * call-seq:
446
+ * bv.to_a
447
+ *
448
+ * Iterate through all the set bits in the bit vector adding the index of
449
+ * each set bit to an array. This is useful if you want to perform array
450
+ * methods on the bit vector. If you want to convert an array to a bit_vector
451
+ * simply do this;
452
+ *
453
+ * bv = [1, 12, 45, 367, 455].inject(BitVector.new) {|bv, i| bv.set(i)}
454
+ */
455
+ VALUE
456
+ frt_bv_to_a(VALUE self)
457
+ {
458
+ BitVector *bv;
459
+ int bit;
460
+ VALUE ary;
461
+ GET_BV(bv, self);
462
+ ary = rb_ary_new();
463
+ bv_scan_reset(bv);
464
+ if (bv->extends_as_ones) {
465
+ while ((bit = bv_scan_next_unset(bv)) >= 0) {
466
+ rb_ary_push(ary, INT2FIX(bit));
467
+ }
468
+ }
469
+ else {
470
+ while ((bit = bv_scan_next(bv)) >= 0) {
471
+ rb_ary_push(ary, INT2FIX(bit));
472
+ }
473
+ }
474
+ return ary;
475
+ }
476
+
477
+ static VALUE mUtils;
478
+
479
+ /*
480
+ * Document-class: Ferret::Utils::BitVector
481
+ *
482
+ * == Summary
483
+ *
484
+ * A BitVector is pretty easy to implement in Ruby using Ruby's BigNum class.
485
+ * This BitVector however allows you to count the set bits with the
486
+ * +#count+ method (or unset bits of flipped bit vectors) and also
487
+ * to quickly scan the set bits.
488
+ *
489
+ * == Boolean Operations
490
+ *
491
+ * BitVector handles four boolean operations;
492
+ *
493
+ * * +&+
494
+ * * +|+
495
+ * * +^+
496
+ * * +~+
497
+ *
498
+ * bv1 = BitVector.new
499
+ * bv2 = BitVector.new
500
+ * bv3 = BitVector.new
501
+ *
502
+ * bv4 = (bv1 & bv2) | ~bv3
503
+ *
504
+ * You can also do the operations in-place;
505
+ *
506
+ * * +and!+
507
+ * * +or!+
508
+ * * +xor!+
509
+ * * +not!+
510
+ *
511
+ * bv4.and!(bv5).not!
512
+ *
513
+ * == Set Bit Scanning
514
+ *
515
+ * Perhaps the most useful functionality in BitVector is the ability to
516
+ * quickly scan for set bits. To print all set bits;
517
+ *
518
+ * bv.each {|bit| puts bit }
519
+ *
520
+ * Alternatively you could use the lower level +next+ or
521
+ * +next_unset+ methods. Note that the +each+ method will
522
+ * automatically scan unset bits if the BitVector has been flipped (using
523
+ * +not+).
524
+ */
525
+ static void
526
+ Init_BitVector(void)
527
+ {
528
+ /* BitVector */
529
+ cBitVector = rb_define_class_under(mUtils, "BitVector", rb_cObject);
530
+ rb_define_alloc_func(cBitVector, frt_bv_alloc);
531
+
532
+ rb_define_method(cBitVector, "initialize", frt_bv_init, 0);
533
+ rb_define_method(cBitVector, "set", frt_bv_set_on, 1);
534
+ rb_define_method(cBitVector, "unset", frt_bv_set_off, 1);
535
+ rb_define_method(cBitVector, "[]=", frt_bv_set, 2);
536
+ rb_define_method(cBitVector, "get", frt_bv_get, 1);
537
+ rb_define_method(cBitVector, "[]", frt_bv_get, 1);
538
+ rb_define_method(cBitVector, "count", frt_bv_count, 0);
539
+ rb_define_method(cBitVector, "clear", frt_bv_clear, 0);
540
+ rb_define_method(cBitVector, "eql?", frt_bv_eql, 1);
541
+ rb_define_method(cBitVector, "==", frt_bv_eql, 1);
542
+ rb_define_method(cBitVector, "hash", frt_bv_hash, 0);
543
+ rb_define_method(cBitVector, "and!", frt_bv_and_x, 1);
544
+ rb_define_method(cBitVector, "and", frt_bv_and, 1);
545
+ rb_define_method(cBitVector, "&", frt_bv_and, 1);
546
+ rb_define_method(cBitVector, "or!", frt_bv_or_x, 1);
547
+ rb_define_method(cBitVector, "or", frt_bv_or, 1);
548
+ rb_define_method(cBitVector, "|", frt_bv_or, 1);
549
+ rb_define_method(cBitVector, "xor!", frt_bv_xor_x, 1);
550
+ rb_define_method(cBitVector, "xor", frt_bv_xor, 1);
551
+ rb_define_method(cBitVector, "^", frt_bv_xor, 1);
552
+ rb_define_method(cBitVector, "not!", frt_bv_not_x, 0);
553
+ rb_define_method(cBitVector, "not", frt_bv_not, 0);
554
+ rb_define_method(cBitVector, "~", frt_bv_not, 0);
555
+ rb_define_method(cBitVector, "reset_scan", frt_bv_reset_scan, 0);
556
+ rb_define_method(cBitVector, "next", frt_bv_next, 0);
557
+ rb_define_method(cBitVector, "next_unset", frt_bv_next_unset, 0);
558
+ rb_define_method(cBitVector, "next_from", frt_bv_next_from, 1);
559
+ rb_define_method(cBitVector, "next_unset_from", frt_bv_next_unset_from, 1);
560
+ rb_define_method(cBitVector, "each", frt_bv_each, 0);
561
+ rb_define_method(cBitVector, "to_a", frt_bv_to_a, 0);
562
+ }
563
+
564
+ /*********************
565
+ *** PriorityQueue ***
566
+ *********************/
567
+ typedef struct PriQ
568
+ {
569
+ int size;
570
+ int capa;
571
+ int mem_capa;
572
+ VALUE *heap;
573
+ VALUE proc;
574
+ } PriQ;
575
+
576
+ #define PQ_START_CAPA 32
577
+
578
+ static bool frt_pq_lt(VALUE proc, VALUE v1, VALUE v2)
579
+ {
580
+ if (proc == Qnil) {
581
+ return RTEST(rb_funcall(v1, id_lt, 1, v2));
582
+ }
583
+ else {
584
+ return RTEST(rb_funcall(proc, id_call, 2, v1, v2));
585
+ }
586
+ }
587
+
588
+ static void pq_up(PriQ *pq)
589
+ {
590
+ VALUE *heap = pq->heap;
591
+ VALUE node;
592
+ int i = pq->size;
593
+ int j = i >> 1;
594
+
595
+ node = heap[i];
596
+
597
+ while ((j > 0) && frt_pq_lt(pq->proc, node, heap[j])) {
598
+ heap[i] = heap[j];
599
+ i = j;
600
+ j = j >> 1;
601
+ }
602
+ heap[i] = node;
603
+ }
604
+
605
+ static void pq_down(PriQ *pq)
606
+ {
607
+ register int i = 1;
608
+ register int j = 2; /* i << 1; */
609
+ register int k = 3; /* j + 1; */
610
+ register int size = pq->size;
611
+ VALUE *heap = pq->heap;
612
+ VALUE node = heap[i]; /* save top node */
613
+
614
+ if ((k <= size) && (frt_pq_lt(pq->proc, heap[k], heap[j]))) {
615
+ j = k;
616
+ }
617
+
618
+ while ((j <= size) && frt_pq_lt(pq->proc, heap[j], node)) {
619
+ heap[i] = heap[j]; /* shift up child */
620
+ i = j;
621
+ j = i << 1;
622
+ k = j + 1;
623
+ if ((k <= size) && frt_pq_lt(pq->proc, heap[k], heap[j])) {
624
+ j = k;
625
+ }
626
+ }
627
+ heap[i] = node;
628
+ }
629
+
630
+ static void pq_push(PriQ *pq, VALUE elem)
631
+ {
632
+ pq->size++;
633
+ if (pq->size >= pq->mem_capa) {
634
+ pq->mem_capa <<= 1;
635
+ REALLOC_N(pq->heap, VALUE, pq->mem_capa);
636
+ }
637
+ pq->heap[pq->size] = elem;
638
+ pq_up(pq);
639
+ }
640
+
641
+ static VALUE cPriorityQueue;
642
+
643
+ static void
644
+ frt_pq_mark(void *p)
645
+ {
646
+ PriQ *pq = (PriQ *)p;
647
+ int i;
648
+ for (i = pq->size; i > 0; i--) {
649
+ rb_gc_mark_maybe(pq->heap[i]);
650
+ }
651
+ }
652
+
653
+ static void frt_pq_free(PriQ *pq)
654
+ {
655
+ free(pq->heap);
656
+ free(pq);
657
+ }
658
+
659
+ static VALUE
660
+ frt_pq_alloc(VALUE klass)
661
+ {
662
+ PriQ *pq = ALLOC_AND_ZERO(PriQ);
663
+ pq->capa = PQ_START_CAPA;
664
+ pq->mem_capa = PQ_START_CAPA;
665
+ pq->heap = ALLOC_N(VALUE, PQ_START_CAPA);
666
+ pq->proc = Qnil;
667
+ return Data_Wrap_Struct(klass, &frt_pq_mark, &frt_pq_free, pq);
668
+ }
669
+
670
+ #define GET_PQ(pq, self) Data_Get_Struct(self, PriQ, pq)
671
+ /*
672
+ * call-seq:
673
+ * PriorityQueue.new(capacity = 32) -> new_pq
674
+ * PriorityQueue.new({:capacity => 32,
675
+ * :less_than_proc => lambda{|a, b| a < b}) -> new_pq
676
+ * PriorityQueue.new({:capacity => 32}) {|a, b| a < b} -> new_pq
677
+ *
678
+ * Returns a new empty priority queue object with an optional capacity.
679
+ * Once the capacity is filled, the lowest valued elements will be
680
+ * automatically popped off the top of the queue as more elements are
681
+ * inserted into the queue.
682
+ */
683
+ static VALUE
684
+ frt_pq_init(int argc, VALUE *argv, VALUE self)
685
+ {
686
+ if (argc >= 1) {
687
+ PriQ *pq;
688
+ VALUE options = argv[0];
689
+ VALUE param;
690
+ int capa = PQ_START_CAPA;
691
+ GET_PQ(pq, self);
692
+ switch (TYPE(options)) {
693
+ case T_FIXNUM:
694
+ capa = FIX2INT(options);
695
+ break;
696
+ case T_HASH:
697
+ if (!NIL_P(param = rb_hash_aref(options,
698
+ ID2SYM(id_capacity)))) {
699
+ capa = FIX2INT(param);
700
+ }
701
+ if (!NIL_P(param = rb_hash_aref(options,
702
+ ID2SYM(id_less_than)))) {
703
+ pq->proc = param;
704
+ }
705
+ break;
706
+ default:
707
+ rb_raise(rb_eArgError,
708
+ "PriorityQueue#initialize only takes a Hash or "
709
+ "an integer");
710
+
711
+ break;
712
+ }
713
+ if (capa < 0) {
714
+ rb_raise(rb_eIndexError,
715
+ "PriorityQueue must have a capacity > 0. %d < 0",
716
+ capa);
717
+ }
718
+ pq->capa = capa;
719
+ if (rb_block_given_p()) {
720
+ pq->proc = rb_block_proc();
721
+ }
722
+ if (argc > 1) {
723
+ rb_raise(rb_eArgError,
724
+ "PriorityQueue#initialize only takes one parameter");
725
+ }
726
+ }
727
+
728
+ return self;
729
+ }
730
+
731
+ /*
732
+ * call-seq:
733
+ * pq.clone -> pq_clone
734
+ *
735
+ * Returns a shallow clone of the priority queue. That is only the priority
736
+ * queue is cloned, its contents are not cloned.
737
+ */
738
+ static VALUE
739
+ frt_pq_clone(VALUE self)
740
+ {
741
+ PriQ *pq, *new_pq = ALLOC(PriQ);
742
+ GET_PQ(pq, self);
743
+ memcpy(new_pq, pq, sizeof(PriQ));
744
+ new_pq->heap = ALLOC_N(VALUE, new_pq->mem_capa);
745
+ memcpy(new_pq->heap, pq->heap, sizeof(VALUE) * (new_pq->size + 1));
746
+
747
+ return Data_Wrap_Struct(cPriorityQueue, &frt_pq_mark, &frt_pq_free, new_pq);
748
+ }
749
+
750
+ /*
751
+ * call-seq:
752
+ * pq.clear -> self
753
+ *
754
+ * Clears all elements from the priority queue. The size will be reset to 0.
755
+ */
756
+ static VALUE
757
+ frt_pq_clear(VALUE self)
758
+ {
759
+ PriQ *pq;
760
+ GET_PQ(pq, self);
761
+ pq->size = 0;
762
+ return self;
763
+ }
764
+
765
+ /*
766
+ * call-seq:
767
+ * pq.insert(elem) -> self
768
+ * pq << elem -> self
769
+ *
770
+ * Insert an element into a queue. It will be inserted into the correct
771
+ * position in the queue according to its priority.
772
+ */
773
+ static VALUE
774
+ frt_pq_insert(VALUE self, VALUE elem)
775
+ {
776
+ PriQ *pq;
777
+ GET_PQ(pq, self);
778
+ if (pq->size < pq->capa) {
779
+ pq_push(pq, elem);
780
+ }
781
+ else if (pq->size > 0 && frt_pq_lt(pq->proc, pq->heap[1], elem)) {
782
+ pq->heap[1] = elem;
783
+ pq_down(pq);
784
+ }
785
+ /* else ignore the element */
786
+ return self;
787
+ }
788
+
789
+ /*
790
+ * call-seq:
791
+ * pq.adjust -> self
792
+ *
793
+ * Sometimes you modify the top element in the priority queue so that its
794
+ * priority changes. When you do this you need to reorder the queue and you
795
+ * do this by calling the adjust method.
796
+ */
797
+ static VALUE
798
+ frt_pq_adjust(VALUE self)
799
+ {
800
+ PriQ *pq;
801
+ GET_PQ(pq, self);
802
+ pq_down(pq);
803
+ return self;
804
+ }
805
+
806
+ /*
807
+ * call-seq:
808
+ * pq.top -> elem
809
+ *
810
+ * Returns the top element in the queue but does not remove it from the
811
+ * queue.
812
+ */
813
+ static VALUE
814
+ frt_pq_top(VALUE self)
815
+ {
816
+ PriQ *pq;
817
+ GET_PQ(pq, self);
818
+ return (pq->size > 0) ? pq->heap[1] : Qnil;
819
+ }
820
+
821
+ /*
822
+ * call-seq:
823
+ * pq.pop -> elem
824
+ *
825
+ * Returns the top element in the queue removing it from the queue.
826
+ */
827
+ static VALUE
828
+ frt_pq_pop(VALUE self)
829
+ {
830
+ PriQ *pq;
831
+ GET_PQ(pq, self);
832
+ if (pq->size > 0) {
833
+ VALUE result = pq->heap[1]; /* save first value */
834
+ pq->heap[1] = pq->heap[pq->size]; /* move last to first */
835
+ pq->heap[pq->size] = Qnil;
836
+ pq->size--;
837
+ pq_down(pq); /* adjust heap */
838
+ return result;
839
+ }
840
+ else {
841
+ return Qnil;
842
+ }
843
+ }
844
+
845
+ /*
846
+ * call-seq:
847
+ * pq.size -> integer
848
+ *
849
+ * Returns the size of the queue, ie. the number of elements currently stored
850
+ * in the queue. The _size_ of a PriorityQueue can never be greater than
851
+ * its _capacity_
852
+ */
853
+ static VALUE
854
+ frt_pq_size(VALUE self)
855
+ {
856
+ PriQ *pq;
857
+ GET_PQ(pq, self);
858
+ return INT2FIX(pq->size);
859
+ }
860
+
861
+ /*
862
+ * call-seq:
863
+ * pq.capacity -> integer
864
+ *
865
+ * Returns the capacity of the queue, ie. the number of elements that can be
866
+ * stored in a Priority queue before they start to drop off the end. The
867
+ * _size_ of a PriorityQueue can never be greater than its
868
+ * _capacity_
869
+ */
870
+ static VALUE
871
+ frt_pq_capa(VALUE self)
872
+ {
873
+ PriQ *pq;
874
+ GET_PQ(pq, self);
875
+ return INT2FIX(pq->capa);
876
+ }
877
+
878
+ /*
879
+ * Document-class: Ferret::Utils::PriorityQueue
880
+ *
881
+ * == Summary
882
+ *
883
+ * A PriorityQueue is a very useful data structure and one that needs a fast
884
+ * implementation. Hence this priority queue is implemented in C. It is
885
+ * pretty easy to use; basically you just insert elements into the queue and
886
+ * pop them off.
887
+ *
888
+ * The elements are sorted with the lowest valued elements on the top of
889
+ * the heap, ie the first to be popped off. Elements are ordered using the
890
+ * less_than '<' method. To change the order of the queue you can either
891
+ * reimplement the '<' method or pass a block when you initialize the queue.
892
+ *
893
+ * You can also set the capacity of the PriorityQueue. Once you hit the
894
+ * capacity, the lowest valued elements are automatically popped off the top
895
+ * of the queue as more elements are added.
896
+ *
897
+ * == Example
898
+ *
899
+ * Here is a toy example that sorts strings by their length and has a capacity
900
+ * of 5;
901
+ *
902
+ * q = PriorityQueue.new(5) {|a, b| a.size < b.size}
903
+ * q << "x"
904
+ * q << "xxxxx"
905
+ * q << "xxx"
906
+ * q << "xxxx"
907
+ * q << "xxxxxx"
908
+ * q << "xx" # hit capacity so "x" will be popped off the top
909
+ *
910
+ * puts q.size #=> 5
911
+ * word = q.pop #=> "xx"
912
+ * q.top << "yyyy" # "xxxyyyy" will still be at the top of the queue
913
+ * q.adjust # move "xxxyyyy" to its correct location in queue
914
+ * word = q.pop #=> "xxxx"
915
+ * word = q.pop #=> "xxxxx"
916
+ * word = q.pop #=> "xxxxxx"
917
+ * word = q.pop #=> "xxxyyyy"
918
+ * word = q.pop #=> nil
919
+ */
920
+ static void
921
+ Init_PriorityQueue(void)
922
+ {
923
+ /* PriorityQueue */
924
+ cPriorityQueue = rb_define_class_under(mUtils, "PriorityQueue", rb_cObject);
925
+ rb_define_alloc_func(cPriorityQueue, frt_pq_alloc);
926
+
927
+ rb_define_method(cPriorityQueue, "initialize", frt_pq_init, -1);
928
+ rb_define_method(cPriorityQueue, "clone", frt_pq_clone, 0);
929
+ rb_define_method(cPriorityQueue, "clear", frt_pq_clear, 0);
930
+ rb_define_method(cPriorityQueue, "insert", frt_pq_insert, 1);
931
+ rb_define_method(cPriorityQueue, "<<", frt_pq_insert, 1);
932
+ rb_define_method(cPriorityQueue, "top", frt_pq_top, 0);
933
+ rb_define_method(cPriorityQueue, "pop", frt_pq_pop, 0);
934
+ rb_define_method(cPriorityQueue, "size", frt_pq_size, 0);
935
+ rb_define_method(cPriorityQueue, "capacity", frt_pq_capa, 0);
936
+ rb_define_method(cPriorityQueue, "adjust", frt_pq_adjust, 0);
937
+ }
938
+
939
+ /* rdoc hack
940
+ extern VALUE mFerret = rb_define_module("Ferret");
941
+ */
942
+
943
+ /*
944
+ * Document-module: Ferret::Utils
945
+ *
946
+ * The Utils module contains a number of helper classes and modules that are
947
+ * useful when indexing with Ferret. They are;
948
+ *
949
+ * * BitVector
950
+ * * PriorityQueue
951
+ * * => more to come
952
+ *
953
+ * These helper classes could also be quite useful outside of Ferret and may
954
+ * one day find themselves in their own separate library.
955
+ */
956
+ void
957
+ Init_Utils(void)
958
+ {
959
+ mUtils = rb_define_module_under(mFerret, "Utils");
960
+
961
+ Init_BitVector();
962
+ Init_PriorityQueue();
963
+ }