sa-ferret 0.11.6.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (193) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +102 -0
  4. data/Rakefile +338 -0
  5. data/TODO +17 -0
  6. data/TUTORIAL +231 -0
  7. data/bin/ferret-browser +79 -0
  8. data/ext/analysis.c +1588 -0
  9. data/ext/analysis.h +219 -0
  10. data/ext/api.c +69 -0
  11. data/ext/api.h +27 -0
  12. data/ext/array.c +123 -0
  13. data/ext/array.h +53 -0
  14. data/ext/bitvector.c +540 -0
  15. data/ext/bitvector.h +272 -0
  16. data/ext/compound_io.c +383 -0
  17. data/ext/config.h +42 -0
  18. data/ext/document.c +156 -0
  19. data/ext/document.h +53 -0
  20. data/ext/except.c +120 -0
  21. data/ext/except.h +168 -0
  22. data/ext/extconf.rb +14 -0
  23. data/ext/ferret.c +402 -0
  24. data/ext/ferret.h +91 -0
  25. data/ext/filter.c +156 -0
  26. data/ext/fs_store.c +483 -0
  27. data/ext/global.c +418 -0
  28. data/ext/global.h +117 -0
  29. data/ext/hash.c +567 -0
  30. data/ext/hash.h +473 -0
  31. data/ext/hashset.c +170 -0
  32. data/ext/hashset.h +187 -0
  33. data/ext/header.h +58 -0
  34. data/ext/helper.c +62 -0
  35. data/ext/helper.h +13 -0
  36. data/ext/index.c +6425 -0
  37. data/ext/index.h +961 -0
  38. data/ext/lang.h +48 -0
  39. data/ext/libstemmer.c +92 -0
  40. data/ext/libstemmer.h +79 -0
  41. data/ext/mempool.c +87 -0
  42. data/ext/mempool.h +35 -0
  43. data/ext/modules.h +162 -0
  44. data/ext/multimapper.c +310 -0
  45. data/ext/multimapper.h +51 -0
  46. data/ext/posh.c +1006 -0
  47. data/ext/posh.h +1007 -0
  48. data/ext/priorityqueue.c +151 -0
  49. data/ext/priorityqueue.h +143 -0
  50. data/ext/q_boolean.c +1608 -0
  51. data/ext/q_const_score.c +161 -0
  52. data/ext/q_filtered_query.c +209 -0
  53. data/ext/q_fuzzy.c +268 -0
  54. data/ext/q_match_all.c +148 -0
  55. data/ext/q_multi_term.c +677 -0
  56. data/ext/q_parser.c +2825 -0
  57. data/ext/q_phrase.c +1126 -0
  58. data/ext/q_prefix.c +100 -0
  59. data/ext/q_range.c +350 -0
  60. data/ext/q_span.c +2402 -0
  61. data/ext/q_term.c +337 -0
  62. data/ext/q_wildcard.c +171 -0
  63. data/ext/r_analysis.c +2499 -0
  64. data/ext/r_index.c +3485 -0
  65. data/ext/r_qparser.c +585 -0
  66. data/ext/r_search.c +4107 -0
  67. data/ext/r_store.c +513 -0
  68. data/ext/r_utils.c +963 -0
  69. data/ext/ram_store.c +471 -0
  70. data/ext/search.c +1741 -0
  71. data/ext/search.h +885 -0
  72. data/ext/similarity.c +150 -0
  73. data/ext/similarity.h +82 -0
  74. data/ext/sort.c +983 -0
  75. data/ext/stem_ISO_8859_1_danish.c +338 -0
  76. data/ext/stem_ISO_8859_1_danish.h +16 -0
  77. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  78. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  79. data/ext/stem_ISO_8859_1_english.c +1156 -0
  80. data/ext/stem_ISO_8859_1_english.h +16 -0
  81. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  82. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  83. data/ext/stem_ISO_8859_1_french.c +1276 -0
  84. data/ext/stem_ISO_8859_1_french.h +16 -0
  85. data/ext/stem_ISO_8859_1_german.c +512 -0
  86. data/ext/stem_ISO_8859_1_german.h +16 -0
  87. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  88. data/ext/stem_ISO_8859_1_italian.h +16 -0
  89. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  90. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  91. data/ext/stem_ISO_8859_1_porter.c +776 -0
  92. data/ext/stem_ISO_8859_1_porter.h +16 -0
  93. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  94. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  95. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  96. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  97. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  98. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  99. data/ext/stem_KOI8_R_russian.c +701 -0
  100. data/ext/stem_KOI8_R_russian.h +16 -0
  101. data/ext/stem_UTF_8_danish.c +344 -0
  102. data/ext/stem_UTF_8_danish.h +16 -0
  103. data/ext/stem_UTF_8_dutch.c +653 -0
  104. data/ext/stem_UTF_8_dutch.h +16 -0
  105. data/ext/stem_UTF_8_english.c +1176 -0
  106. data/ext/stem_UTF_8_english.h +16 -0
  107. data/ext/stem_UTF_8_finnish.c +808 -0
  108. data/ext/stem_UTF_8_finnish.h +16 -0
  109. data/ext/stem_UTF_8_french.c +1296 -0
  110. data/ext/stem_UTF_8_french.h +16 -0
  111. data/ext/stem_UTF_8_german.c +526 -0
  112. data/ext/stem_UTF_8_german.h +16 -0
  113. data/ext/stem_UTF_8_italian.c +1113 -0
  114. data/ext/stem_UTF_8_italian.h +16 -0
  115. data/ext/stem_UTF_8_norwegian.c +302 -0
  116. data/ext/stem_UTF_8_norwegian.h +16 -0
  117. data/ext/stem_UTF_8_porter.c +794 -0
  118. data/ext/stem_UTF_8_porter.h +16 -0
  119. data/ext/stem_UTF_8_portuguese.c +1055 -0
  120. data/ext/stem_UTF_8_portuguese.h +16 -0
  121. data/ext/stem_UTF_8_russian.c +709 -0
  122. data/ext/stem_UTF_8_russian.h +16 -0
  123. data/ext/stem_UTF_8_spanish.c +1137 -0
  124. data/ext/stem_UTF_8_spanish.h +16 -0
  125. data/ext/stem_UTF_8_swedish.c +313 -0
  126. data/ext/stem_UTF_8_swedish.h +16 -0
  127. data/ext/stopwords.c +401 -0
  128. data/ext/store.c +692 -0
  129. data/ext/store.h +777 -0
  130. data/ext/term_vectors.c +352 -0
  131. data/ext/threading.h +31 -0
  132. data/ext/utilities.c +446 -0
  133. data/ext/win32.h +54 -0
  134. data/lib/ferret.rb +29 -0
  135. data/lib/ferret/browser.rb +246 -0
  136. data/lib/ferret/browser/s/global.js +192 -0
  137. data/lib/ferret/browser/s/style.css +148 -0
  138. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  139. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  140. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  141. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  142. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  143. data/lib/ferret/browser/views/layout.rhtml +22 -0
  144. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  145. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  146. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  147. data/lib/ferret/browser/webrick.rb +14 -0
  148. data/lib/ferret/document.rb +130 -0
  149. data/lib/ferret/field_infos.rb +44 -0
  150. data/lib/ferret/index.rb +786 -0
  151. data/lib/ferret/number_tools.rb +157 -0
  152. data/lib/ferret_version.rb +3 -0
  153. data/setup.rb +1555 -0
  154. data/test/test_all.rb +5 -0
  155. data/test/test_helper.rb +24 -0
  156. data/test/threading/number_to_spoken.rb +132 -0
  157. data/test/threading/thread_safety_index_test.rb +79 -0
  158. data/test/threading/thread_safety_read_write_test.rb +76 -0
  159. data/test/threading/thread_safety_test.rb +133 -0
  160. data/test/unit/analysis/tc_analyzer.rb +548 -0
  161. data/test/unit/analysis/tc_token_stream.rb +646 -0
  162. data/test/unit/index/tc_index.rb +762 -0
  163. data/test/unit/index/tc_index_reader.rb +699 -0
  164. data/test/unit/index/tc_index_writer.rb +437 -0
  165. data/test/unit/index/th_doc.rb +315 -0
  166. data/test/unit/largefile/tc_largefile.rb +46 -0
  167. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  168. data/test/unit/search/tc_filter.rb +135 -0
  169. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  170. data/test/unit/search/tc_index_searcher.rb +61 -0
  171. data/test/unit/search/tc_multi_searcher.rb +128 -0
  172. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  173. data/test/unit/search/tc_search_and_sort.rb +179 -0
  174. data/test/unit/search/tc_sort.rb +49 -0
  175. data/test/unit/search/tc_sort_field.rb +27 -0
  176. data/test/unit/search/tc_spans.rb +190 -0
  177. data/test/unit/search/tm_searcher.rb +384 -0
  178. data/test/unit/store/tc_fs_store.rb +77 -0
  179. data/test/unit/store/tc_ram_store.rb +35 -0
  180. data/test/unit/store/tm_store.rb +34 -0
  181. data/test/unit/store/tm_store_lock.rb +68 -0
  182. data/test/unit/tc_document.rb +81 -0
  183. data/test/unit/ts_analysis.rb +2 -0
  184. data/test/unit/ts_index.rb +2 -0
  185. data/test/unit/ts_largefile.rb +4 -0
  186. data/test/unit/ts_query_parser.rb +2 -0
  187. data/test/unit/ts_search.rb +2 -0
  188. data/test/unit/ts_store.rb +2 -0
  189. data/test/unit/ts_utils.rb +2 -0
  190. data/test/unit/utils/tc_bit_vector.rb +295 -0
  191. data/test/unit/utils/tc_number_tools.rb +117 -0
  192. data/test/unit/utils/tc_priority_queue.rb +106 -0
  193. metadata +269 -0
data/ext/r_index.c ADDED
@@ -0,0 +1,3485 @@
1
+ #include "ferret.h"
2
+ #include "index.h"
3
+ #include <st.h>
4
+
5
+ VALUE mIndex;
6
+
7
+ VALUE cFieldInfo;
8
+ VALUE cFieldInfos;
9
+
10
+ VALUE cTVOffsets;
11
+ VALUE cTVTerm;
12
+ VALUE cTermVector;
13
+
14
+ VALUE cTermEnum;
15
+ VALUE cTermDocEnum;
16
+
17
+ VALUE cLazyDoc;
18
+ VALUE cLazyDocData;
19
+ VALUE cIndexWriter;
20
+ VALUE cIndexReader;
21
+
22
+ VALUE sym_analyzer;
23
+ static VALUE sym_close_dir;
24
+ static VALUE sym_create;
25
+ static VALUE sym_create_if_missing;
26
+
27
+ static VALUE sym_chunk_size;
28
+ static VALUE sym_max_buffer_memory;
29
+ static VALUE sym_index_interval;
30
+ static VALUE sym_skip_interval;
31
+ static VALUE sym_merge_factor;
32
+ static VALUE sym_max_buffered_docs;
33
+ static VALUE sym_max_merge_docs;
34
+ static VALUE sym_max_field_length;
35
+ static VALUE sym_use_compound_file;
36
+
37
+ static VALUE sym_boost;
38
+ static VALUE sym_field_infos;
39
+
40
+ static VALUE sym_store;
41
+ static VALUE sym_index;
42
+ static VALUE sym_term_vector;
43
+
44
+ static VALUE sym_compress;
45
+ static VALUE sym_compressed;
46
+
47
+ static VALUE sym_untokenized;
48
+ static VALUE sym_omit_norms;
49
+ static VALUE sym_untokenized_omit_norms;
50
+
51
+ static VALUE sym_with_positions;
52
+ static VALUE sym_with_offsets;
53
+ static VALUE sym_with_positions_offsets;
54
+
55
+ static ID id_term;
56
+ static ID id_fields;
57
+ static ID id_fld_num_map;
58
+ static ID id_field_num;
59
+ static ID id_boost;
60
+
61
+ extern void frt_set_term(VALUE rterm, Term *t);
62
+ extern Analyzer *frt_get_cwrapped_analyzer(VALUE ranalyzer);
63
+ extern VALUE frt_get_analyzer(Analyzer *a);
64
+
65
+ /****************************************************************************
66
+ *
67
+ * FieldInfo Methods
68
+ *
69
+ ****************************************************************************/
70
+
71
+ static void
72
+ frt_fi_free(void *p)
73
+ {
74
+ object_del(p);
75
+ fi_deref((FieldInfo *)p);
76
+ }
77
+
78
+ static void
79
+ frt_fi_get_params(VALUE roptions,
80
+ enum StoreValues *store,
81
+ enum IndexValues *index,
82
+ enum TermVectorValues *term_vector,
83
+ float *boost)
84
+ {
85
+ VALUE v;
86
+ Check_Type(roptions, T_HASH);
87
+ v = rb_hash_aref(roptions, sym_boost);
88
+ if (Qnil != v) {
89
+ *boost = (float)NUM2DBL(v);
90
+ } else {
91
+ *boost = 1.0f;
92
+ }
93
+ v = rb_hash_aref(roptions, sym_store);
94
+ if (Qnil != v) Check_Type(v, T_SYMBOL);
95
+ if (v == sym_no || v == sym_false || v == Qfalse) {
96
+ *store = STORE_NO;
97
+ } else if (v == sym_yes || v == sym_true || v == Qtrue) {
98
+ *store = STORE_YES;
99
+ } else if (v == sym_compress || v == sym_compressed) {
100
+ *store = STORE_COMPRESS;
101
+ } else if (v == Qnil) {
102
+ /* leave as default */
103
+ } else {
104
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for :store."
105
+ " Please choose from [:yes, :no, :compressed]",
106
+ rb_id2name(SYM2ID(v)));
107
+ }
108
+
109
+ v = rb_hash_aref(roptions, sym_index);
110
+ if (Qnil != v) Check_Type(v, T_SYMBOL);
111
+ if (v == sym_no || v == sym_false || v == Qfalse) {
112
+ *index = INDEX_NO;
113
+ } else if (v == sym_yes || v == sym_true || v == Qtrue) {
114
+ *index = INDEX_YES;
115
+ } else if (v == sym_untokenized) {
116
+ *index = INDEX_UNTOKENIZED;
117
+ } else if (v == sym_omit_norms) {
118
+ *index = INDEX_YES_OMIT_NORMS;
119
+ } else if (v == sym_untokenized_omit_norms) {
120
+ *index = INDEX_UNTOKENIZED_OMIT_NORMS;
121
+ } else if (v == Qnil) {
122
+ /* leave as default */
123
+ } else {
124
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for :index."
125
+ " Please choose from [:no, :yes, :untokenized, "
126
+ ":omit_norms, :untokenized_omit_norms]",
127
+ rb_id2name(SYM2ID(v)));
128
+ }
129
+
130
+ v = rb_hash_aref(roptions, sym_term_vector);
131
+ if (Qnil != v) Check_Type(v, T_SYMBOL);
132
+ if (v == sym_no || v == sym_false || v == Qfalse) {
133
+ *term_vector = TERM_VECTOR_NO;
134
+ } else if (v == sym_yes || v == sym_true || v == Qtrue) {
135
+ *term_vector = TERM_VECTOR_YES;
136
+ } else if (v == sym_with_positions) {
137
+ *term_vector = TERM_VECTOR_WITH_POSITIONS;
138
+ } else if (v == sym_with_offsets) {
139
+ *term_vector = TERM_VECTOR_WITH_OFFSETS;
140
+ } else if (v == sym_with_positions_offsets) {
141
+ *term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
142
+ } else if (v == Qnil) {
143
+ /* leave as default */
144
+ } else {
145
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for "
146
+ ":term_vector. Please choose from [:no, :yes, "
147
+ ":with_positions, :with_offsets, "
148
+ ":with_positions_offsets]",
149
+ rb_id2name(SYM2ID(v)));
150
+ }
151
+ }
152
+
153
+ static VALUE
154
+ frt_get_field_info(FieldInfo *fi)
155
+ {
156
+
157
+ VALUE rfi = Qnil;
158
+ if (fi) {
159
+ rfi = object_get(fi);
160
+ if (rfi == Qnil) {
161
+ rfi = Data_Wrap_Struct(cFieldInfo, NULL, &frt_fi_free, fi);
162
+ REF(fi);
163
+ object_add(fi, rfi);
164
+ }
165
+ }
166
+ return rfi;
167
+ }
168
+
169
+ /*
170
+ * call-seq:
171
+ * FieldInfo.new(name, options = {}) -> field_info
172
+ *
173
+ * Create a new FieldInfo object with the name +name+ and the properties
174
+ * specified in +options+. The available options are [:store, :index,
175
+ * :term_vector, :boost]. See the description of FieldInfo for more
176
+ * information on these properties.
177
+ */
178
+ static VALUE
179
+ frt_fi_init(int argc, VALUE *argv, VALUE self)
180
+ {
181
+ VALUE roptions, rname;
182
+ FieldInfo *fi;
183
+ enum StoreValues store = STORE_YES;
184
+ enum IndexValues index = INDEX_YES;
185
+ enum TermVectorValues term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
186
+ float boost = 1.0f;
187
+
188
+ rb_scan_args(argc, argv, "11", &rname, &roptions);
189
+ if (argc > 1) {
190
+ frt_fi_get_params(roptions, &store, &index, &term_vector, &boost);
191
+ }
192
+ fi = fi_new(frt_field(rname), store, index, term_vector);
193
+ fi->boost = boost;
194
+ Frt_Wrap_Struct(self, NULL, &frt_fi_free, fi);
195
+ object_add(fi, self);
196
+ return self;
197
+ }
198
+
199
+ /*
200
+ * call-seq:
201
+ * fi.name -> symbol
202
+ *
203
+ * Return the name of the field
204
+ */
205
+ static VALUE
206
+ frt_fi_name(VALUE self)
207
+ {
208
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
209
+ return ID2SYM(rb_intern(fi->name));
210
+ }
211
+
212
+ /*
213
+ * call-seq:
214
+ * fi.stored? -> bool
215
+ *
216
+ * Return true if the field is stored in the index.
217
+ */
218
+ static VALUE
219
+ frt_fi_is_stored(VALUE self)
220
+ {
221
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
222
+ return fi_is_stored(fi) ? Qtrue : Qfalse;
223
+ }
224
+
225
+ /*
226
+ * call-seq:
227
+ * fi.compressed? -> bool
228
+ *
229
+ * Return true if the field is stored in the index in compressed format.
230
+ */
231
+ static VALUE
232
+ frt_fi_is_compressed(VALUE self)
233
+ {
234
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
235
+ return fi_is_compressed(fi) ? Qtrue : Qfalse;
236
+ }
237
+
238
+ /*
239
+ * call-seq:
240
+ * fi.indexed? -> bool
241
+ *
242
+ * Return true if the field is indexed, ie searchable in the index.
243
+ */
244
+ static VALUE
245
+ frt_fi_is_indexed(VALUE self)
246
+ {
247
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
248
+ return fi_is_indexed(fi) ? Qtrue : Qfalse;
249
+ }
250
+
251
+ /*
252
+ * call-seq:
253
+ * fi.tokenized? -> bool
254
+ *
255
+ * Return true if the field is tokenized. Tokenizing is the process of
256
+ * breaking the field up into tokens. That is "the quick brown fox" becomes:
257
+ *
258
+ * ["the", "quick", "brown", "fox"]
259
+ *
260
+ * A field can only be tokenized if it is indexed.
261
+ */
262
+ static VALUE
263
+ frt_fi_is_tokenized(VALUE self)
264
+ {
265
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
266
+ return fi_is_tokenized(fi) ? Qtrue : Qfalse;
267
+ }
268
+
269
+ /*
270
+ * call-seq:
271
+ * fi.omit_norms? -> bool
272
+ *
273
+ * Return true if the field omits the norm file. The norm file is the file
274
+ * used to store the field boosts for an indexed field. If you do not boost
275
+ * any fields, and you can live without scoring based on field length then
276
+ * you can omit the norms file. This will give the index a slight performance
277
+ * boost and it will use less memory, especially for indexes which have a
278
+ * large number of documents.
279
+ */
280
+ static VALUE
281
+ frt_fi_omit_norms(VALUE self)
282
+ {
283
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
284
+ return fi_omit_norms(fi) ? Qtrue : Qfalse;
285
+ }
286
+
287
+ /*
288
+ * call-seq:
289
+ * fi.store_term_vector? -> bool
290
+ *
291
+ * Return true if the term-vectors are stored for this field.
292
+ */
293
+ static VALUE
294
+ frt_fi_store_term_vector(VALUE self)
295
+ {
296
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
297
+ return fi_store_term_vector(fi) ? Qtrue : Qfalse;
298
+ }
299
+
300
+ /*
301
+ * call-seq:
302
+ * fi.store_positions? -> bool
303
+ *
304
+ * Return true if positions are stored with the term-vectors for this field.
305
+ */
306
+ static VALUE
307
+ frt_fi_store_positions(VALUE self)
308
+ {
309
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
310
+ return fi_store_positions(fi) ? Qtrue : Qfalse;
311
+ }
312
+
313
+ /*
314
+ * call-seq:
315
+ * fi.store_offsets? -> bool
316
+ *
317
+ * Return true if offsets are stored with the term-vectors for this field.
318
+ */
319
+ static VALUE
320
+ frt_fi_store_offsets(VALUE self)
321
+ {
322
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
323
+ return fi_store_offsets(fi) ? Qtrue : Qfalse;
324
+ }
325
+
326
+ /*
327
+ * call-seq:
328
+ * fi.has_norms? -> bool
329
+ *
330
+ * Return true if this field has a norms file. This is the same as calling;
331
+ *
332
+ * fi.indexed? and not fi.omit_norms?
333
+ */
334
+ static VALUE
335
+ frt_fi_has_norms(VALUE self)
336
+ {
337
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
338
+ return fi_has_norms(fi) ? Qtrue : Qfalse;
339
+ }
340
+
341
+ /*
342
+ * call-seq:
343
+ * fi.boost -> boost
344
+ *
345
+ * Return the default boost for this field
346
+ */
347
+ static VALUE
348
+ frt_fi_boost(VALUE self)
349
+ {
350
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
351
+ return rb_float_new((double)fi->boost);
352
+ }
353
+
354
+ /*
355
+ * call-seq:
356
+ * fi.to_s -> string
357
+ *
358
+ * Return a string representation of the FieldInfo object.
359
+ */
360
+ static VALUE
361
+ frt_fi_to_s(VALUE self)
362
+ {
363
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
364
+ char *fi_s = fi_to_s(fi);
365
+ VALUE rfi_s = rb_str_new2(fi_s);
366
+ free(fi_s);
367
+ return rfi_s;
368
+ }
369
+
370
+ /****************************************************************************
371
+ *
372
+ * FieldInfos Methods
373
+ *
374
+ ****************************************************************************/
375
+
376
+ static void
377
+ frt_fis_free(void *p)
378
+ {
379
+ object_del(p);
380
+ fis_deref((FieldInfos *)p);
381
+ }
382
+
383
+ static void
384
+ frt_fis_mark(void *p)
385
+ {
386
+ int i;
387
+ FieldInfos *fis = (FieldInfos *)p;
388
+
389
+ for (i = 0; i < fis->size; i++) {
390
+ frt_gc_mark(fis->fields[i]);
391
+ }
392
+ }
393
+
394
+ static VALUE
395
+ frt_get_field_infos(FieldInfos *fis)
396
+ {
397
+
398
+ VALUE rfis = Qnil;
399
+ if (fis) {
400
+ rfis = object_get(fis);
401
+ if (rfis == Qnil) {
402
+ rfis = Data_Wrap_Struct(cFieldInfos, &frt_fis_mark, &frt_fis_free,
403
+ fis);
404
+ REF(fis);
405
+ object_add(fis, rfis);
406
+ }
407
+ }
408
+ return rfis;
409
+ }
410
+
411
+ /*
412
+ * call-seq:
413
+ * FieldInfos.new(defaults = {}) -> field_infos
414
+ *
415
+ * Create a new FieldInfos object which uses the default values for fields
416
+ * specified in the +default+ hash parameter. See FieldInfo for available
417
+ * property values.
418
+ */
419
+ static VALUE
420
+ frt_fis_init(int argc, VALUE *argv, VALUE self)
421
+ {
422
+ VALUE roptions;
423
+ FieldInfos *fis;
424
+ enum StoreValues store = STORE_YES;
425
+ enum IndexValues index = INDEX_YES;
426
+ enum TermVectorValues term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
427
+ float boost;
428
+
429
+ rb_scan_args(argc, argv, "01", &roptions);
430
+ if (argc > 0) {
431
+ frt_fi_get_params(roptions, &store, &index, &term_vector, &boost);
432
+ }
433
+ fis = fis_new(store, index, term_vector);
434
+ Frt_Wrap_Struct(self, &frt_fis_mark, &frt_fis_free, fis);
435
+ object_add(fis, self);
436
+ return self;
437
+ }
438
+
439
+ /*
440
+ * call-seq:
441
+ * fis.to_a -> array
442
+ *
443
+ * Return an array of the FieldInfo objects contained but this FieldInfos
444
+ * object.
445
+ */
446
+ static VALUE
447
+ frt_fis_to_a(VALUE self)
448
+ {
449
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
450
+ VALUE rary = rb_ary_new();
451
+ int i;
452
+
453
+ for (i = 0; i < fis->size; i++) {
454
+ rb_ary_push(rary, frt_get_field_info(fis->fields[i]));
455
+ }
456
+ return rary;
457
+ }
458
+
459
+ /*
460
+ * call-seq:
461
+ * fis[name] -> field_info
462
+ * fis[number] -> field_info
463
+ *
464
+ * Get the FieldInfo object. FieldInfo objects can be referenced by either
465
+ * their field-number of the field-name (which must be a symbol). For
466
+ * example;
467
+ *
468
+ * fi = fis[:name]
469
+ * fi = fis[2]
470
+ */
471
+ static VALUE
472
+ frt_fis_get(VALUE self, VALUE ridx)
473
+ {
474
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
475
+ VALUE rfi = Qnil;
476
+ switch (TYPE(ridx)) {
477
+ case T_FIXNUM: {
478
+ int index = FIX2INT(ridx);
479
+ if (index < 0) index += fis->size;
480
+ if (index < 0 || index >= fis->size) {
481
+ rb_raise(rb_eArgError, "index of %d is out of range (0..%d)\n",
482
+ index, fis->size);
483
+ }
484
+ rfi = frt_get_field_info(fis->fields[index]);
485
+ break;
486
+ }
487
+ case T_SYMBOL:
488
+ rfi = frt_get_field_info(fis_get_field(fis, frt_field(ridx)));
489
+ break;
490
+ case T_STRING:
491
+ rfi = frt_get_field_info(fis_get_field(fis, StringValuePtr(ridx)));
492
+ break;
493
+ default:
494
+ rb_raise(rb_eArgError, "Can't index FieldInfos with %s",
495
+ rs2s(rb_obj_as_string(ridx)));
496
+ break;
497
+ }
498
+ return rfi;
499
+ }
500
+
501
+ /*
502
+ * call-seq:
503
+ * fis << fi -> fis
504
+ * fis.add(fi) -> fis
505
+ *
506
+ * Add a FieldInfo object. Use the FieldInfos#add_field method where
507
+ * possible.
508
+ */
509
+ static VALUE
510
+ frt_fis_add(VALUE self, VALUE rfi)
511
+ {
512
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
513
+ FieldInfo *fi = (FieldInfo *)frt_rb_data_ptr(rfi);
514
+ fis_add_field(fis, fi);
515
+ REF(fi);
516
+ return self;
517
+ }
518
+
519
+ /*
520
+ * call-seq:
521
+ * fis.add_field(name, properties = {} -> fis
522
+ *
523
+ * Add a new field to the FieldInfos object. See FieldInfo for a description
524
+ * of the available properties.
525
+ */
526
+ static VALUE
527
+ frt_fis_add_field(int argc, VALUE *argv, VALUE self)
528
+ {
529
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
530
+ FieldInfo *fi;
531
+ enum StoreValues store = fis->store;
532
+ enum IndexValues index = fis->index;
533
+ enum TermVectorValues term_vector = fis->term_vector;
534
+ float boost = 1.0f;
535
+ VALUE rname, roptions;
536
+
537
+ rb_scan_args(argc, argv, "11", &rname, &roptions);
538
+ if (argc > 1) {
539
+ frt_fi_get_params(roptions, &store, &index, &term_vector, &boost);
540
+ }
541
+ fi = fi_new(frt_field(rname), store, index, term_vector);
542
+ fi->boost = boost;
543
+ fis_add_field(fis, fi);
544
+ return self;
545
+ }
546
+
547
+ /*
548
+ * call-seq:
549
+ * fis.each {|fi| do_something } -> fis
550
+ *
551
+ * Iterate through the FieldInfo objects.
552
+ */
553
+ static VALUE
554
+ frt_fis_each(VALUE self)
555
+ {
556
+ int i;
557
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
558
+
559
+ for (i = 0; i < fis->size; i++) {
560
+ rb_yield(frt_get_field_info(fis->fields[i]));
561
+ }
562
+ return self;
563
+ }
564
+
565
+ /*
566
+ * call-seq:
567
+ * fis.to_s -> string
568
+ *
569
+ * Return a string representation of the FieldInfos object.
570
+ */
571
+ static VALUE
572
+ frt_fis_to_s(VALUE self)
573
+ {
574
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
575
+ char *fis_s = fis_to_s(fis);
576
+ VALUE rfis_s = rb_str_new2(fis_s);
577
+ free(fis_s);
578
+ return rfis_s;
579
+ }
580
+
581
+ /*
582
+ * call-seq:
583
+ * fis.size -> int
584
+ *
585
+ * Return the number of fields in the FieldInfos object.
586
+ */
587
+ static VALUE
588
+ frt_fis_size(VALUE self)
589
+ {
590
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
591
+ return INT2FIX(fis->size);
592
+ }
593
+
594
+ /*
595
+ * call-seq:
596
+ * fis.create_index(dir) -> self
597
+ *
598
+ * Create a new index in the directory specified. The directory +dir+ can
599
+ * either be a string path representing a directory on the file-system or an
600
+ * actual directory object. Care should be taken when using this method. Any
601
+ * existing index (or other files for that matter) will be deleted from the
602
+ * directory and overwritten by the new index.
603
+ */
604
+ static VALUE
605
+ frt_fis_create_index(VALUE self, VALUE rdir)
606
+ {
607
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
608
+ Store *store = NULL;
609
+ if (TYPE(rdir) == T_DATA) {
610
+ store = DATA_PTR(rdir);
611
+ REF(store);
612
+ } else {
613
+ StringValue(rdir);
614
+ frt_create_dir(rdir);
615
+ store = open_fs_store(rs2s(rdir));
616
+ }
617
+ index_create(store, fis);
618
+ store_deref(store);
619
+ return self;
620
+ }
621
+
622
+ /*
623
+ * call-seq:
624
+ * fis.fields -> symbol array
625
+ *
626
+ * Return a list of the field names (as symbols) of all the fields in the
627
+ * index.
628
+ */
629
+ static VALUE
630
+ frt_fis_get_fields(VALUE self)
631
+ {
632
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
633
+ VALUE rfield_names = rb_ary_new();
634
+ int i;
635
+ for (i = 0; i < fis->size; i++) {
636
+ rb_ary_push(rfield_names, ID2SYM(rb_intern(fis->fields[i]->name)));
637
+ }
638
+ return rfield_names;
639
+ }
640
+
641
+ /*
642
+ * call-seq:
643
+ * fis.tokenized_fields -> symbol array
644
+ *
645
+ * Return a list of the field names (as symbols) of all the tokenized fields
646
+ * in the index.
647
+ */
648
+ static VALUE
649
+ frt_fis_get_tk_fields(VALUE self)
650
+ {
651
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
652
+ VALUE rfield_names = rb_ary_new();
653
+ int i;
654
+ for (i = 0; i < fis->size; i++) {
655
+ if (!fi_is_tokenized(fis->fields[i])) continue;
656
+ rb_ary_push(rfield_names, ID2SYM(rb_intern(fis->fields[i]->name)));
657
+ }
658
+ return rfield_names;
659
+ }
660
+
661
+ /****************************************************************************
662
+ *
663
+ * TermEnum Methods
664
+ *
665
+ ****************************************************************************/
666
+
667
+ static void
668
+ frt_te_free(void *p)
669
+ {
670
+ TermEnum *te = (TermEnum *)p;
671
+ te->close(te);
672
+ }
673
+
674
+ static VALUE
675
+ frt_te_get_set_term(VALUE self, const char *term)
676
+ {
677
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
678
+ VALUE str = term ? rb_str_new(term, te->curr_term_len) : Qnil;
679
+ rb_ivar_set(self, id_term, str);
680
+ return str;
681
+ }
682
+
683
+ static VALUE
684
+ frt_get_te(VALUE rir, TermEnum *te)
685
+ {
686
+ VALUE self = Qnil;
687
+ if (te != NULL) {
688
+ self = Data_Wrap_Struct(cTermEnum, NULL, &frt_te_free, te);
689
+ frt_te_get_set_term(self, te->curr_term);
690
+ rb_ivar_set(self, id_fld_num_map, rb_ivar_get(rir, id_fld_num_map));
691
+ }
692
+ return self;
693
+ }
694
+
695
+ /*
696
+ * call-seq:
697
+ * term_enum.next -> term_string
698
+ *
699
+ * Returns the next term in the enumeration or nil otherwise.
700
+ */
701
+ static VALUE
702
+ frt_te_next(VALUE self)
703
+ {
704
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
705
+ return frt_te_get_set_term(self, te->next(te));
706
+ }
707
+
708
+ /*
709
+ * call-seq:
710
+ * term_enum.term -> term_string
711
+ *
712
+ * Returns the current term pointed to by the enum. This method should only
713
+ * be called after a successful call to TermEnum#next.
714
+ */
715
+ static VALUE
716
+ frt_te_term(VALUE self)
717
+ {
718
+ return rb_ivar_get(self, id_term);
719
+ }
720
+
721
+ /*
722
+ * call-seq:
723
+ * term_enum.doc_freq -> integer
724
+ *
725
+ * Returns the document frequency of the current term pointed to by the enum.
726
+ * That is the number of documents that this term appears in. The method
727
+ * should only be called after a successful call to TermEnum#next.
728
+ */
729
+ static VALUE
730
+ frt_te_doc_freq(VALUE self)
731
+ {
732
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
733
+ return INT2FIX(te->curr_ti.doc_freq);
734
+ }
735
+
736
+ /*
737
+ * call-seq:
738
+ * term_enum.skip_to(target) -> term
739
+ *
740
+ * Skip to term +target+. This method can skip forwards or backwards. If you
741
+ * want to skip back to the start, pass the empty string "". That is;
742
+ *
743
+ * term_enum.skip_to("")
744
+ *
745
+ * Returns the first term greater than or equal to +target+
746
+ */
747
+ static VALUE
748
+ frt_te_skip_to(VALUE self, VALUE rterm)
749
+ {
750
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
751
+ return frt_te_get_set_term(self, te->skip_to(te, frt_field(rterm)));
752
+ }
753
+
754
+ /*
755
+ * call-seq:
756
+ * term_enum.each {|term, doc_freq| do_something() } -> term_count
757
+ *
758
+ * Iterates through all the terms in the field, yielding the term and the
759
+ * document frequency.
760
+ */
761
+ static VALUE
762
+ frt_te_each(VALUE self)
763
+ {
764
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
765
+ char *term;
766
+ int term_cnt = 0;
767
+ VALUE vals = rb_ary_new2(2);
768
+ RARRAY(vals)->len = 2;
769
+ rb_mem_clear(RARRAY(vals)->ptr, 2);
770
+
771
+
772
+ /* each is being called so there will be no current term */
773
+ rb_ivar_set(self, id_term, Qnil);
774
+
775
+
776
+ while (NULL != (term = te->next(te))) {
777
+ term_cnt++;
778
+ RARRAY(vals)->ptr[0] = rb_str_new(term, te->curr_term_len);
779
+ RARRAY(vals)->ptr[1] = INT2FIX(te->curr_ti.doc_freq);
780
+ rb_yield(vals);
781
+ }
782
+ return INT2FIX(term_cnt);
783
+ }
784
+
785
+ /*
786
+ * call-seq:
787
+ * term_enum.set_field(field) -> self
788
+ *
789
+ * Set the field for the term_enum. The field value should be a symbol as
790
+ * usual. For example, to scan all title terms you'd do this;
791
+ *
792
+ * term_enum.set_field(:title).each do |term, doc_freq|
793
+ * do_something()
794
+ * end
795
+ */
796
+ static VALUE
797
+ frt_te_set_field(VALUE self, VALUE rfield)
798
+ {
799
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
800
+ int field_num = 0;
801
+ VALUE rfnum_map = rb_ivar_get(self, id_fld_num_map);
802
+ VALUE rfnum = rb_hash_aref(rfnum_map, rfield);
803
+ if (rfnum != Qnil) {
804
+ field_num = FIX2INT(rfnum);
805
+ rb_ivar_set(self, id_field_num, rfnum);
806
+ } else {
807
+ Check_Type(rfield, T_SYMBOL);
808
+ rb_raise(rb_eArgError, "field %s doesn't exist in the index",
809
+ frt_field(rfield));
810
+ }
811
+ te->set_field(te, field_num);
812
+
813
+ return self;
814
+ }
815
+
816
+ /*
817
+ * call-seq:
818
+ * term_enum.to_json() -> string
819
+ *
820
+ * Returns a JSON representation of the term enum. You can speed this up by
821
+ * having the method return arrays instead of objects, simply by passing an
822
+ * argument to the to_json method. For example;
823
+ *
824
+ * term_enum.to_json() #=>
825
+ * # [
826
+ * # {"term":"apple","frequency":12},
827
+ * # {"term":"banana","frequency":2},
828
+ * # {"term":"cantaloupe","frequency":12}
829
+ * # ]
830
+ *
831
+ * term_enum.to_json(:fast) #=>
832
+ * # [
833
+ * # ["apple",12],
834
+ * # ["banana",2],
835
+ * # ["cantaloupe",12]
836
+ * # ]
837
+ */
838
+ static VALUE
839
+ frt_te_to_json(int argc, VALUE *argv, VALUE self)
840
+ {
841
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
842
+ VALUE rjson;
843
+ char *json, *jp;
844
+ char *term;
845
+ int capa = 65536;
846
+ jp = json = ALLOC_N(char, capa);
847
+ *(jp++) = '[';
848
+
849
+ if (argc > 0) {
850
+ while (NULL != (term = te->next(te))) {
851
+ /* enough room for for term after converting " to '"' and frequency
852
+ * plus some extra for good measure */
853
+ *(jp++) = '[';
854
+ if (te->curr_term_len * 3 + (jp - json) + 100 > capa) {
855
+ capa <<= 1;
856
+ REALLOC_N(json, char, capa);
857
+ }
858
+ jp = json_concat_string(jp, term);
859
+ *(jp++) = ',';
860
+ sprintf(jp, "%d", te->curr_ti.doc_freq);
861
+ jp += strlen(jp);
862
+ *(jp++) = ']';
863
+ *(jp++) = ',';
864
+ }
865
+ }
866
+ else {
867
+ while (NULL != (term = te->next(te))) {
868
+ /* enough room for for term after converting " to '"' and frequency
869
+ * plus some extra for good measure */
870
+ if (te->curr_term_len * 3 + (jp - json) + 100 > capa) {
871
+ capa <<= 1;
872
+ REALLOC_N(json, char, capa);
873
+ }
874
+ *(jp++) = '{';
875
+ memcpy(jp, "\"term\":", 7);
876
+ jp += 7;
877
+ jp = json_concat_string(jp, term);
878
+ *(jp++) = ',';
879
+ memcpy(jp, "\"frequency\":", 12);
880
+ jp += 12;
881
+ sprintf(jp, "%d", te->curr_ti.doc_freq);
882
+ jp += strlen(jp);
883
+ *(jp++) = '}';
884
+ *(jp++) = ',';
885
+ }
886
+ }
887
+ if (*(jp-1) == ',') jp--;
888
+ *(jp++) = ']';
889
+ *jp = '\0';
890
+
891
+ rjson = rb_str_new2(json);
892
+ free(json);
893
+ return rjson;
894
+ }
895
+
896
+ /****************************************************************************
897
+ *
898
+ * TermDocEnum Methods
899
+ *
900
+ ****************************************************************************/
901
+
902
+ static void
903
+ frt_tde_free(void *p)
904
+ {
905
+ TermDocEnum *tde = (TermDocEnum *)p;
906
+ tde->close(tde);
907
+ }
908
+
909
+ static VALUE
910
+ frt_get_tde(VALUE rir, TermDocEnum *tde)
911
+ {
912
+ VALUE self = Data_Wrap_Struct(cTermDocEnum, NULL, &frt_tde_free, tde);
913
+ rb_ivar_set(self, id_fld_num_map, rb_ivar_get(rir, id_fld_num_map));
914
+ return self;
915
+ }
916
+
917
+ /*
918
+ * call-seq:
919
+ * term_doc_enum.seek(field, term) -> self
920
+ *
921
+ * Seek the term +term+ in the index for +field+. After you call this method
922
+ * you can call next or each to skip through the documents and positions of
923
+ * this particular term.
924
+ */
925
+ static VALUE
926
+ frt_tde_seek(VALUE self, VALUE rfield, VALUE rterm)
927
+ {
928
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
929
+ char *term;
930
+ VALUE rfnum_map = rb_ivar_get(self, id_fld_num_map);
931
+ VALUE rfnum = rb_hash_aref(rfnum_map, rfield);
932
+ int field_num = -1;
933
+ term = StringValuePtr(rterm);
934
+ if (rfnum != Qnil) {
935
+ field_num = FIX2INT(rfnum);
936
+ } else {
937
+ rb_raise(rb_eArgError, "field %s doesn't exist in the index",
938
+ frt_field(rfield));
939
+ }
940
+ tde->seek(tde, field_num, term);
941
+ return self;
942
+ }
943
+
944
+ /*
945
+ * call-seq:
946
+ * term_doc_enum.seek_term_enum(term_enum) -> self
947
+ *
948
+ * Seek the current term in +term_enum+. You could just use the standard seek
949
+ * method like this;
950
+ *
951
+ * term_doc_enum.seek(term_enum.term)
952
+ *
953
+ * However the +seek_term_enum+ method saves an index lookup so should offer
954
+ * a large performance improvement.
955
+ */
956
+ static VALUE
957
+ frt_tde_seek_te(VALUE self, VALUE rterm_enum)
958
+ {
959
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
960
+ TermEnum *te = (TermEnum *)frt_rb_data_ptr(rterm_enum);
961
+ tde->seek_te(tde, te);
962
+ return self;
963
+ }
964
+
965
+ /*
966
+ * call-seq:
967
+ * term_doc_enum.doc -> doc_id
968
+ *
969
+ * Returns the current document number pointed to by the +term_doc_enum+.
970
+ */
971
+ static VALUE
972
+ frt_tde_doc(VALUE self)
973
+ {
974
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
975
+ return INT2FIX(tde->doc_num(tde));
976
+ }
977
+
978
+ /*
979
+ * call-seq:
980
+ * term_doc_enum.doc -> doc_id
981
+ *
982
+ * Returns the frequency of the current document pointed to by the
983
+ * +term_doc_enum+.
984
+ */
985
+ static VALUE
986
+ frt_tde_freq(VALUE self)
987
+ {
988
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
989
+ return INT2FIX(tde->freq(tde));
990
+ }
991
+
992
+ /*
993
+ * call-seq:
994
+ * term_doc_enum.doc -> doc_id
995
+ *
996
+ * Move forward to the next document in the enumeration. Returns +true+ if
997
+ * there is another document or +false+ otherwise.
998
+ */
999
+ static VALUE
1000
+ frt_tde_next(VALUE self)
1001
+ {
1002
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1003
+ return tde->next(tde) ? Qtrue : Qfalse;
1004
+ }
1005
+
1006
+ /*
1007
+ * call-seq:
1008
+ * term_doc_enum.doc -> doc_id
1009
+ *
1010
+ * Move forward to the next document in the enumeration. Returns +true+ if
1011
+ * there is another document or +false+ otherwise.
1012
+ */
1013
+ static VALUE
1014
+ frt_tde_next_position(VALUE self)
1015
+ {
1016
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1017
+ int pos;
1018
+ if (tde->next_position == NULL) {
1019
+ rb_raise(rb_eNotImpError, "to scan through positions you must create "
1020
+ "the TermDocEnum with Index#term_positions method rather "
1021
+ "than the Index#term_docs method");
1022
+ }
1023
+ pos = tde->next_position(tde);
1024
+ return pos >= 0 ? INT2FIX(pos) : Qnil;
1025
+ }
1026
+
1027
+ /*
1028
+ * call-seq:
1029
+ * term_doc_enum.each {|doc_id, freq| do_something() } -> doc_count
1030
+ *
1031
+ * Iterate through the documents and document frequencies in the
1032
+ * +term_doc_enum+.
1033
+ *
1034
+ * NOTE: this method can only be called once after each seek. If you need to
1035
+ * call +#each+ again then you should call +#seek+ again too.
1036
+ */
1037
+ static VALUE
1038
+ frt_tde_each(VALUE self)
1039
+ {
1040
+ int doc_cnt = 0;
1041
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1042
+ VALUE vals = rb_ary_new2(2);
1043
+ RARRAY(vals)->len = 2;
1044
+ rb_mem_clear(RARRAY(vals)->ptr, 2);
1045
+
1046
+ while (tde->next(tde)) {
1047
+ doc_cnt++;
1048
+ RARRAY(vals)->ptr[0] = INT2FIX(tde->doc_num(tde));
1049
+ RARRAY(vals)->ptr[1] = INT2FIX(tde->freq(tde));
1050
+ rb_yield(vals);
1051
+
1052
+ }
1053
+ return INT2FIX(doc_cnt);
1054
+ }
1055
+
1056
+ /*
1057
+ * call-seq:
1058
+ * term_doc_enum.to_json() -> string
1059
+ *
1060
+ * Returns a json representation of the term doc enum. It will also add the
1061
+ * term positions if they are available. You can speed this up by having the
1062
+ * method return arrays instead of objects, simply by passing an argument to
1063
+ * the to_json method. For example;
1064
+ *
1065
+ * term_doc_enum.to_json() #=>
1066
+ * # [
1067
+ * # {"document":1,"frequency":12},
1068
+ * # {"document":11,"frequency":1},
1069
+ * # {"document":29,"frequency":120},
1070
+ * # {"document":30,"frequency":3}
1071
+ * # ]
1072
+ *
1073
+ * term_doc_enum.to_json(:fast) #=>
1074
+ * # [
1075
+ * # [1,12],
1076
+ * # [11,1],
1077
+ * # [29,120],
1078
+ * # [30,3]
1079
+ * # ]
1080
+ */
1081
+ static VALUE
1082
+ frt_tde_to_json(int argc, VALUE *argv, VALUE self)
1083
+ {
1084
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1085
+ VALUE rjson;
1086
+ char *json, *jp;
1087
+ int capa = 65536;
1088
+ char *format;
1089
+ char close = (argc > 0) ? ']' : '}';
1090
+ bool do_positions = tde->next_position != NULL;
1091
+ jp = json = ALLOC_N(char, capa);
1092
+ *(jp++) = '[';
1093
+
1094
+ if (do_positions) {
1095
+ if (argc == 0) {
1096
+ format = "{\"document\":%d,\"frequency\":%d,\"positions\":[";
1097
+ }
1098
+ else {
1099
+ format = "[%d,%d,[";
1100
+ }
1101
+ }
1102
+ else {
1103
+ if (argc == 0) {
1104
+ format = "{\"document\":%d,\"frequency\":%d},";
1105
+ }
1106
+ else {
1107
+ format = "[%d,%d],";
1108
+ }
1109
+ }
1110
+ while (tde->next(tde)) {
1111
+ /* 100 chars should be enough room for an extra entry */
1112
+ if ((jp - json) + 100 + tde->freq(tde) * 20 > capa) {
1113
+ capa <<= 1;
1114
+ REALLOC_N(json, char, capa);
1115
+ }
1116
+ sprintf(jp, format, tde->doc_num(tde), tde->freq(tde));
1117
+ jp += strlen(jp);
1118
+ if (do_positions) {
1119
+ int pos;
1120
+ while (0 <= (pos = tde->next_position(tde))) {
1121
+ sprintf(jp, "%d,", pos);
1122
+ jp += strlen(jp);
1123
+ }
1124
+ if (*(jp - 1) == ',') jp--;
1125
+ *(jp++) = ']';
1126
+ *(jp++) = close;
1127
+ *(jp++) = ',';
1128
+ }
1129
+ }
1130
+ if (*(jp - 1) == ',') jp--;
1131
+ *(jp++) = ']';
1132
+ *jp = '\0';
1133
+
1134
+ rjson = rb_str_new2(json);
1135
+ free(json);
1136
+ return rjson;
1137
+ }
1138
+
1139
+ /*
1140
+ * call-seq:
1141
+ * term_doc_enum.each_position {|pos| do_something } -> term_doc_enum
1142
+ *
1143
+ * Iterate through each of the positions occupied by the current term in the
1144
+ * current document. This can only be called once per document. It can be
1145
+ * used within the each method. For example, to print the terms documents and
1146
+ * positions;
1147
+ *
1148
+ * tde.each do |doc_id, freq|
1149
+ * puts "term appeared #{freq} times in document #{doc_id}:"
1150
+ * positions = []
1151
+ * tde.each_position {|pos| positions << pos}
1152
+ * puts " #{positions.join(', ')}"
1153
+ * end
1154
+ */
1155
+ static VALUE
1156
+ frt_tde_each_position(VALUE self)
1157
+ {
1158
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1159
+ int pos;
1160
+ if (tde->next_position == NULL) {
1161
+ rb_raise(rb_eNotImpError, "to scan through positions you must create "
1162
+ "the TermDocEnum with Index#term_positions method rather "
1163
+ "than the Index#term_docs method");
1164
+ }
1165
+ while (0 <= (pos = tde->next_position(tde))) {
1166
+ rb_yield(INT2FIX(pos));
1167
+ }
1168
+ return self;
1169
+ }
1170
+
1171
+ /*
1172
+ * call-seq:
1173
+ * term_doc_enum.skip_to(target) -> bool
1174
+ *
1175
+ * Skip to the required document number +target+ and return true if there is
1176
+ * a document >= +target+.
1177
+ */
1178
+ static VALUE
1179
+ frt_tde_skip_to(VALUE self, VALUE rtarget)
1180
+ {
1181
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1182
+ return tde->skip_to(tde, FIX2INT(rtarget)) ? Qtrue : Qfalse;
1183
+ }
1184
+
1185
+ /****************************************************************************
1186
+ *
1187
+ * TVOffsets Methods
1188
+ *
1189
+ ****************************************************************************/
1190
+
1191
+ static VALUE
1192
+ frt_get_tv_offsets(Offset *offset)
1193
+ {
1194
+ return rb_struct_new(cTVOffsets,
1195
+ ULL2NUM((unsigned long long)offset->start),
1196
+ ULL2NUM((unsigned long long)offset->end),
1197
+ NULL);
1198
+ }
1199
+
1200
+ /****************************************************************************
1201
+ *
1202
+ * TVTerm Methods
1203
+ *
1204
+ ****************************************************************************/
1205
+
1206
+ static VALUE
1207
+ frt_get_tv_term(TVTerm *tv_term)
1208
+ {
1209
+ int i;
1210
+ const int freq = tv_term->freq;
1211
+ VALUE rtext;
1212
+ VALUE rpositions = Qnil;
1213
+ rtext = rb_str_new2(tv_term->text);
1214
+ if (tv_term->positions) {
1215
+ VALUE *rpos;
1216
+ int *positions = tv_term->positions;
1217
+ rpositions = rb_ary_new2(freq);
1218
+ rpos = RARRAY(rpositions)->ptr;
1219
+ for (i = 0; i < freq; i++) {
1220
+ rpos[i] = INT2FIX(positions[i]);
1221
+ }
1222
+ RARRAY(rpositions)->len = freq;
1223
+ }
1224
+ return rb_struct_new(cTVTerm, rtext, rpositions, NULL);
1225
+ }
1226
+
1227
+ /****************************************************************************
1228
+ *
1229
+ * TermVector Methods
1230
+ *
1231
+ ****************************************************************************/
1232
+
1233
+ static VALUE
1234
+ frt_get_tv(TermVector *tv)
1235
+ {
1236
+ int i;
1237
+ TVTerm *terms = tv->terms;
1238
+ const int t_cnt = tv->term_cnt;
1239
+ const int o_cnt = tv->offset_cnt;
1240
+ VALUE rfield, rterms, *rts;
1241
+ VALUE roffsets = Qnil;
1242
+ rfield = ID2SYM(rb_intern(tv->field));
1243
+
1244
+ rterms = rb_ary_new2(t_cnt);
1245
+ rts = RARRAY(rterms)->ptr;
1246
+ for (i = 0; i < t_cnt; i++) {
1247
+ rts[i] = frt_get_tv_term(&terms[i]);
1248
+ RARRAY(rterms)->len++;
1249
+ }
1250
+
1251
+ if (tv->offsets) {
1252
+ VALUE *ros;
1253
+ Offset *offsets = tv->offsets;
1254
+ roffsets = rb_ary_new2(o_cnt);
1255
+ ros = RARRAY(roffsets)->ptr;
1256
+ for (i = 0; i < o_cnt; i++) {
1257
+ ros[i] = frt_get_tv_offsets(&offsets[i]);
1258
+ RARRAY(roffsets)->len++;
1259
+ }
1260
+ }
1261
+
1262
+ return rb_struct_new(cTermVector, rfield, rterms, roffsets, NULL);
1263
+ }
1264
+
1265
+ /****************************************************************************
1266
+ *
1267
+ * IndexWriter Methods
1268
+ *
1269
+ ****************************************************************************/
1270
+
1271
+ void
1272
+ frt_iw_free(void *p)
1273
+ {
1274
+ iw_close((IndexWriter *)p);
1275
+ }
1276
+
1277
+ void
1278
+ frt_iw_mark(void *p)
1279
+ {
1280
+ IndexWriter *iw = (IndexWriter *)p;
1281
+ frt_gc_mark(iw->analyzer);
1282
+ frt_gc_mark(iw->store);
1283
+ frt_gc_mark(iw->fis);
1284
+ }
1285
+
1286
+ /*
1287
+ * call-seq:
1288
+ * index_writer.close -> nil
1289
+ *
1290
+ * Close the IndexWriter. This will close and free all resources used
1291
+ * exclusively by the index writer. The garbage collector will do this
1292
+ * automatically if not called explicitly.
1293
+ */
1294
+ static VALUE
1295
+ frt_iw_close(VALUE self)
1296
+ {
1297
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1298
+ Frt_Unwrap_Struct(self);
1299
+ iw_close(iw);
1300
+ return Qnil;
1301
+ }
1302
+
1303
/* Copy the integer option sym_<attr> from the roptions hash into
 * config.<attr> when it is present and truthy. Relies on the locals rval,
 * roptions and config being in scope at the point of use. */
#define SET_INT_ATTR(attr) \
    do {\
        rval = rb_hash_aref(roptions, sym_##attr);\
        if (RTEST(rval)) {\
            config.attr = FIX2INT(rval);\
        }\
    } while (0)
1308
+
1309
+ /*
1310
+ * call-seq:
1311
+ * IndexWriter.new(options = {}) -> index_writer
1312
+ *
1313
+ * Create a new IndexWriter. You should either pass a path or a directory to
1314
+ * this constructor. For example, here are three ways you can create an
1315
+ * IndexWriter;
1316
+ *
1317
+ * dir = RAMDirectory.new()
1318
+ * iw = IndexWriter.new(:dir => dir)
1319
+ *
1320
+ * dir = FSDirectory.new("/path/to/index")
1321
+ * iw = IndexWriter.new(:dir => dir)
1322
+ *
1323
+ * iw = IndexWriter.new(:path => "/path/to/index")
1324
+ *
1325
+ * See IndexWriter for more options.
1326
+ */
1327
+ static VALUE
1328
+ frt_iw_init(int argc, VALUE *argv, VALUE self)
1329
+ {
1330
+ VALUE roptions, rval;
1331
+ bool create = false;
1332
+ bool create_if_missing = true;
1333
+ Store *store = NULL;
1334
+ Analyzer *analyzer = NULL;
1335
+ IndexWriter *volatile iw = NULL;
1336
+ Config config = default_config;
1337
+
1338
+ rb_scan_args(argc, argv, "01", &roptions);
1339
+ if (argc > 0) {
1340
+ Check_Type(roptions, T_HASH);
1341
+
1342
+ if ((rval = rb_hash_aref(roptions, sym_dir)) != Qnil) {
1343
+ Check_Type(rval, T_DATA);
1344
+ store = DATA_PTR(rval);
1345
+ } else if ((rval = rb_hash_aref(roptions, sym_path)) != Qnil) {
1346
+ StringValue(rval);
1347
+ frt_create_dir(rval);
1348
+ store = open_fs_store(rs2s(rval));
1349
+ DEREF(store);
1350
+ }
1351
+
1352
+ /* Let ruby's garbage collector handle the closing of the store
1353
+ if (!close_dir) {
1354
+ close_dir = RTEST(rb_hash_aref(roptions, sym_close_dir));
1355
+ }
1356
+ */
1357
+ /* use_compound_file defaults to true */
1358
+ config.use_compound_file =
1359
+ (rb_hash_aref(roptions, sym_use_compound_file) == Qfalse)
1360
+ ? false
1361
+ : true;
1362
+
1363
+ if ((rval = rb_hash_aref(roptions, sym_analyzer)) != Qnil) {
1364
+ analyzer = frt_get_cwrapped_analyzer(rval);
1365
+ }
1366
+
1367
+ create = RTEST(rb_hash_aref(roptions, sym_create));
1368
+ if ((rval = rb_hash_aref(roptions, sym_create_if_missing)) != Qnil) {
1369
+ create_if_missing = RTEST(rval);
1370
+ }
1371
+ SET_INT_ATTR(chunk_size);
1372
+ SET_INT_ATTR(max_buffer_memory);
1373
+ SET_INT_ATTR(index_interval);
1374
+ SET_INT_ATTR(skip_interval);
1375
+ SET_INT_ATTR(merge_factor);
1376
+ SET_INT_ATTR(max_buffered_docs);
1377
+ SET_INT_ATTR(max_merge_docs);
1378
+ SET_INT_ATTR(max_field_length);
1379
+ }
1380
+ if (NULL == store) {
1381
+ store = open_ram_store();
1382
+ DEREF(store);
1383
+ }
1384
+ if (!create && create_if_missing && !store->exists(store, "segments")) {
1385
+ create = true;
1386
+ }
1387
+ if (create) {
1388
+ FieldInfos *fis;
1389
+ if ((rval = rb_hash_aref(roptions, sym_field_infos)) != Qnil) {
1390
+ Data_Get_Struct(rval, FieldInfos, fis);
1391
+ index_create(store, fis);
1392
+ } else {
1393
+ fis = fis_new(STORE_YES, INDEX_YES,
1394
+ TERM_VECTOR_WITH_POSITIONS_OFFSETS);
1395
+ index_create(store, fis);
1396
+ fis_deref(fis);
1397
+ }
1398
+ }
1399
+
1400
+ iw = iw_open(store, analyzer, &config);
1401
+
1402
+ Frt_Wrap_Struct(self, &frt_iw_mark, &frt_iw_free, iw);
1403
+
1404
+ if (rb_block_given_p()) {
1405
+ rb_yield(self);
1406
+ frt_iw_close(self);
1407
+ return Qnil;
1408
+ } else {
1409
+ return self;
1410
+ }
1411
+ }
1412
+
1413
+ /*
1414
+ * call-seq:
1415
+ * iw.doc_count -> number
1416
+ *
1417
+ * Returns the number of documents in the Index. Note that deletions won't be
1418
+ * taken into account until the IndexWriter has been committed.
1419
+ */
1420
+ static VALUE
1421
+ frt_iw_get_doc_count(VALUE self)
1422
+ {
1423
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1424
+ return INT2FIX(iw_doc_count(iw));
1425
+ }
1426
+
1427
+ static int
1428
+ frt_hash_to_doc_i(VALUE key, VALUE value, VALUE arg)
1429
+ {
1430
+ if (key == Qundef) {
1431
+ return ST_CONTINUE;
1432
+ } else {
1433
+ Document *doc = (Document *)arg;
1434
+ char *field;
1435
+ VALUE val;
1436
+ DocField *df;
1437
+ switch (TYPE(key)) {
1438
+ case T_STRING:
1439
+ field = rs2s(key);
1440
+ break;
1441
+ case T_SYMBOL:
1442
+ field = rb_id2name(SYM2ID(key));
1443
+ break;
1444
+ default:
1445
+ rb_raise(rb_eArgError,
1446
+ "%s cannot be a key to a field. Field keys must "
1447
+ " be symbols.", rs2s(rb_obj_as_string(key)));
1448
+ break;
1449
+ }
1450
+ if (NULL == (df = doc_get_field(doc, field))) {
1451
+ df = df_new(field);
1452
+ }
1453
+ if (rb_respond_to(value, id_boost)) {
1454
+ df->boost = (float)NUM2DBL(rb_funcall(value, id_boost, 0));
1455
+ }
1456
+ switch (TYPE(value)) {
1457
+ case T_ARRAY:
1458
+ {
1459
+ int i;
1460
+ df->destroy_data = true;
1461
+ for (i = 0; i < RARRAY(value)->len; i++) {
1462
+ val = rb_obj_as_string(RARRAY(value)->ptr[i]);
1463
+ df_add_data_len(df, nstrdup(val), RSTRING(val)->len);
1464
+ }
1465
+ }
1466
+ break;
1467
+ case T_STRING:
1468
+ df_add_data_len(df, rs2s(value), RSTRING(value)->len);
1469
+ break;
1470
+ default:
1471
+ val = rb_obj_as_string(value);
1472
+ df->destroy_data = true;
1473
+ df_add_data_len(df, nstrdup(val), RSTRING(val)->len);
1474
+ break;
1475
+ }
1476
+ doc_add_field(doc, df);
1477
+ }
1478
+ return ST_CONTINUE;
1479
+ }
1480
+
1481
+ static Document *
1482
+ frt_get_doc(VALUE rdoc)
1483
+ {
1484
+ VALUE val;
1485
+ Document *doc = doc_new();
1486
+ DocField *df;
1487
+
1488
+ if (rb_respond_to(rdoc, id_boost)) {
1489
+ doc->boost = (float)NUM2DBL(rb_funcall(rdoc, id_boost, 0));
1490
+ }
1491
+
1492
+ switch (TYPE(rdoc)) {
1493
+ case T_HASH:
1494
+ rb_hash_foreach(rdoc, frt_hash_to_doc_i, (VALUE)doc);
1495
+ break;
1496
+ case T_ARRAY:
1497
+ {
1498
+ int i;
1499
+ df = df_new("content");
1500
+ df->destroy_data = true;
1501
+ for (i = 0; i < RARRAY(rdoc)->len; i++) {
1502
+ val = rb_obj_as_string(RARRAY(rdoc)->ptr[i]);
1503
+ df_add_data_len(df, nstrdup(val), RSTRING(val)->len);
1504
+ }
1505
+ doc_add_field(doc, df);
1506
+ }
1507
+ break;
1508
+ case T_SYMBOL:
1509
+ df = df_add_data(df_new("content"), rb_id2name(SYM2ID(rdoc)));
1510
+ doc_add_field(doc, df);
1511
+ break;
1512
+ case T_STRING:
1513
+ df = df_add_data_len(df_new("content"), rs2s(rdoc),
1514
+ RSTRING(rdoc)->len);
1515
+ doc_add_field(doc, df);
1516
+ break;
1517
+ default:
1518
+ val = rb_obj_as_string(rdoc);
1519
+ df = df_add_data_len(df_new("content"), nstrdup(val),
1520
+ RSTRING(val)->len);
1521
+ df->destroy_data = true;
1522
+ doc_add_field(doc, df);
1523
+ break;
1524
+ }
1525
+ return doc;
1526
+ }
1527
+
1528
+ /*
1529
+ * call-seq:
1530
+ * iw << document -> iw
1531
+ * iw.add_document(document) -> iw
1532
+ *
1533
+ * Add a document to the index. See Document. A document can also be a simple
1534
+ * hash object.
1535
+ */
1536
+ static VALUE
1537
+ frt_iw_add_doc(VALUE self, VALUE rdoc)
1538
+ {
1539
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1540
+ Document *doc = frt_get_doc(rdoc);
1541
+ iw_add_doc(iw, doc);
1542
+ doc_destroy(doc);
1543
+ return self;
1544
+ }
1545
+
1546
+ /*
1547
+ * call-seq:
1548
+ * iw.optimize -> iw
1549
+ *
1550
+ * Optimize the index for searching. This commits any unwritten data to the
1551
+ * index and optimizes the index into a single segment to improve search
1552
+ * performance. This is an expensive operation and should not be called too
1553
+ * often. The best time to call this is at the end of a long batch indexing
1554
+ * process. Note that calling the optimize method do not in any way effect
1555
+ * indexing speed (except for the time taken to complete the optimization
1556
+ * process).
1557
+ */
1558
+ static VALUE
1559
+ frt_iw_optimize(VALUE self)
1560
+ {
1561
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1562
+ iw_optimize(iw);
1563
+ return self;
1564
+ }
1565
+
1566
+ /*
1567
+ * call-seq:
1568
+ * iw.commit -> iw
1569
+ *
1570
+ * Explicitly commit any changes to the index that may be hanging around in
1571
+ * memory. You should call this method if you want to read the latest index
1572
+ * with an IndexWriter.
1573
+ */
1574
+ static VALUE
1575
+ frt_iw_commit(VALUE self)
1576
+ {
1577
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1578
+ iw_commit(iw);
1579
+ return self;
1580
+ }
1581
+
1582
+ /*
1583
+ * call-seq:
1584
+ * iw.add_readers(reader_array) -> iw
1585
+ *
1586
+ * Use this method to merge other indexes into the one being written by
1587
+ * IndexWriter. This is useful for parallel indexing. You can have several
1588
+ * indexing processes running in parallel, possibly even on different
1589
+ * machines. Then you can finish by merging all of the indexes into a single
1590
+ * index.
1591
+ */
1592
+ static VALUE
1593
+ frt_iw_add_readers(VALUE self, VALUE rreaders)
1594
+ {
1595
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1596
+ int i;
1597
+ IndexReader **irs;
1598
+ Check_Type(rreaders, T_ARRAY);
1599
+
1600
+ irs = ALLOC_N(IndexReader *, RARRAY(rreaders)->len);
1601
+ i = RARRAY(rreaders)->len;
1602
+ while (i-- > 0) {
1603
+ IndexReader *ir;
1604
+ Data_Get_Struct(RARRAY(rreaders)->ptr[i], IndexReader, ir);
1605
+ irs[i] = ir;
1606
+ }
1607
+ iw_add_readers(iw, irs, RARRAY(rreaders)->len);
1608
+ free(irs);
1609
+ return self;
1610
+ }
1611
+
1612
+ /*
1613
+ * call-seq:
1614
+ * iw.delete(field, term) -> iw
1615
+ *
1616
+ * Delete all documents in the index with the term +term+ in the field
1617
+ * +field+. You should usually have a unique document id which you use with
1618
+ * this method, rather then deleting all documents with the word "the" in
1619
+ * them. You may however use this method to delete spam.
1620
+ */
1621
+ static VALUE
1622
+ frt_iw_delete(VALUE self, VALUE rfield, VALUE rterm)
1623
+ {
1624
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1625
+ iw_delete_term(iw, frt_field(rfield), StringValuePtr(rterm));
1626
+ return self;
1627
+ }
1628
+
1629
+ /*
1630
+ * call-seq:
1631
+ * index_writer.field_infos -> FieldInfos
1632
+ *
1633
+ * Get the FieldInfos object for this IndexWriter. This is useful if you need
1634
+ * to dynamically add new fields to the index with specific properties.
1635
+ */
1636
+ static VALUE
1637
+ frt_iw_field_infos(VALUE self)
1638
+ {
1639
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1640
+ return frt_get_field_infos(iw->fis);
1641
+ }
1642
+
1643
+ /*
1644
+ * call-seq:
1645
+ * index_writer.analyzer -> Analyzer
1646
+ *
1647
+ * Get the Analyzer for this IndexWriter. This is useful if you need
1648
+ * to use the same analyzer in a QueryParser.
1649
+ */
1650
+ static VALUE
1651
+ frt_iw_get_analyzer(VALUE self)
1652
+ {
1653
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1654
+ return frt_get_analyzer(iw->analyzer);
1655
+ }
1656
+
1657
+ /*
1658
+ * call-seq:
1659
+ * index_writer.analyzer -> Analyzer
1660
+ *
1661
+ * Set the Analyzer for this IndexWriter. This is useful if you need to
1662
+ * change the analyzer for a special document. It is risky though as the
1663
+ * same analyzer will be used for all documents during search.
1664
+ */
1665
+ static VALUE
1666
+ frt_iw_set_analyzer(VALUE self, VALUE ranalyzer)
1667
+ {
1668
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1669
+
1670
+ a_deref(iw->analyzer);
1671
+ iw->analyzer = frt_get_cwrapped_analyzer(ranalyzer);
1672
+ return ranalyzer;
1673
+ }
1674
+
1675
+ /*
1676
+ * call-seq:
1677
+ * index_writer.version -> int
1678
+ *
1679
+ * Returns the current version of the index writer.
1680
+ */
1681
+ static VALUE
1682
+ frt_iw_version(VALUE self)
1683
+ {
1684
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1685
+ return ULL2NUM(iw->sis->version);
1686
+ }
1687
+
1688
+ /*
1689
+ * call-seq:
1690
+ * iw.chunk_size -> number
1691
+ *
1692
+ * Return the current value of chunk_size
1693
+ */
1694
+ static VALUE
1695
+ frt_iw_get_chunk_size(VALUE self)
1696
+ {
1697
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1698
+ return INT2FIX(iw->config.chunk_size);
1699
+ }
1700
+
1701
+ /*
1702
+ * call-seq:
1703
+ * iw.chunk_size = chunk_size -> chunk_size
1704
+ *
1705
+ * Set the chunk_size parameter
1706
+ */
1707
+ static VALUE
1708
+ frt_iw_set_chunk_size(VALUE self, VALUE rval)
1709
+ {
1710
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1711
+ iw->config.chunk_size = FIX2INT(rval);
1712
+ return rval;
1713
+ }
1714
+
1715
+ /*
1716
+ * call-seq:
1717
+ * iw.max_buffer_memory -> number
1718
+ *
1719
+ * Return the current value of max_buffer_memory
1720
+ */
1721
+ static VALUE
1722
+ frt_iw_get_max_buffer_memory(VALUE self)
1723
+ {
1724
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1725
+ return INT2FIX(iw->config.max_buffer_memory);
1726
+ }
1727
+
1728
+ /*
1729
+ * call-seq:
1730
+ * iw.max_buffer_memory = max_buffer_memory -> max_buffer_memory
1731
+ *
1732
+ * Set the max_buffer_memory parameter
1733
+ */
1734
+ static VALUE
1735
+ frt_iw_set_max_buffer_memory(VALUE self, VALUE rval)
1736
+ {
1737
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1738
+ iw->config.max_buffer_memory = FIX2INT(rval);
1739
+ return rval;
1740
+ }
1741
+
1742
+ /*
1743
+ * call-seq:
1744
+ * iw.term_index_interval -> number
1745
+ *
1746
+ * Return the current value of term_index_interval
1747
+ */
1748
+ static VALUE
1749
+ frt_iw_get_index_interval(VALUE self)
1750
+ {
1751
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1752
+ return INT2FIX(iw->config.index_interval);
1753
+ }
1754
+
1755
+ /*
1756
+ * call-seq:
1757
+ * iw.term_index_interval = term_index_interval -> term_index_interval
1758
+ *
1759
+ * Set the term_index_interval parameter
1760
+ */
1761
+ static VALUE
1762
+ frt_iw_set_index_interval(VALUE self, VALUE rval)
1763
+ {
1764
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1765
+ iw->config.index_interval = FIX2INT(rval);
1766
+ return rval;
1767
+ }
1768
+
1769
+ /*
1770
+ * call-seq:
1771
+ * iw.doc_skip_interval -> number
1772
+ *
1773
+ * Return the current value of doc_skip_interval
1774
+ */
1775
+ static VALUE
1776
+ frt_iw_get_skip_interval(VALUE self)
1777
+ {
1778
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1779
+ return INT2FIX(iw->config.skip_interval);
1780
+ }
1781
+
1782
+ /*
1783
+ * call-seq:
1784
+ * iw.doc_skip_interval = doc_skip_interval -> doc_skip_interval
1785
+ *
1786
+ * Set the doc_skip_interval parameter
1787
+ */
1788
+ static VALUE
1789
+ frt_iw_set_skip_interval(VALUE self, VALUE rval)
1790
+ {
1791
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1792
+ iw->config.skip_interval = FIX2INT(rval);
1793
+ return rval;
1794
+ }
1795
+
1796
+ /*
1797
+ * call-seq:
1798
+ * iw.merge_factor -> number
1799
+ *
1800
+ * Return the current value of merge_factor
1801
+ */
1802
+ static VALUE
1803
+ frt_iw_get_merge_factor(VALUE self)
1804
+ {
1805
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1806
+ return INT2FIX(iw->config.merge_factor);
1807
+ }
1808
+
1809
+ /*
1810
+ * call-seq:
1811
+ * iw.merge_factor = merge_factor -> merge_factor
1812
+ *
1813
+ * Set the merge_factor parameter
1814
+ */
1815
+ static VALUE
1816
+ frt_iw_set_merge_factor(VALUE self, VALUE rval)
1817
+ {
1818
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1819
+ iw->config.merge_factor = FIX2INT(rval);
1820
+ return rval;
1821
+ }
1822
+
1823
+ /*
1824
+ * call-seq:
1825
+ * iw.max_buffered_docs -> number
1826
+ *
1827
+ * Return the current value of max_buffered_docs
1828
+ */
1829
+ static VALUE
1830
+ frt_iw_get_max_buffered_docs(VALUE self)
1831
+ {
1832
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1833
+ return INT2FIX(iw->config.max_buffered_docs);
1834
+ }
1835
+
1836
+ /*
1837
+ * call-seq:
1838
+ * iw.max_buffered_docs = max_buffered_docs -> max_buffered_docs
1839
+ *
1840
+ * Set the max_buffered_docs parameter
1841
+ */
1842
+ static VALUE
1843
+ frt_iw_set_max_buffered_docs(VALUE self, VALUE rval)
1844
+ {
1845
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1846
+ iw->config.max_buffered_docs = FIX2INT(rval);
1847
+ return rval;
1848
+ }
1849
+
1850
+ /*
1851
+ * call-seq:
1852
+ * iw.max_merge_docs -> number
1853
+ *
1854
+ * Return the current value of max_merge_docs
1855
+ */
1856
+ static VALUE
1857
+ frt_iw_get_max_merge_docs(VALUE self)
1858
+ {
1859
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1860
+ return INT2FIX(iw->config.max_merge_docs);
1861
+ }
1862
+
1863
+ /*
1864
+ * call-seq:
1865
+ * iw.max_merge_docs = max_merge_docs -> max_merge_docs
1866
+ *
1867
+ * Set the max_merge_docs parameter
1868
+ */
1869
+ static VALUE
1870
+ frt_iw_set_max_merge_docs(VALUE self, VALUE rval)
1871
+ {
1872
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1873
+ iw->config.max_merge_docs = FIX2INT(rval);
1874
+ return rval;
1875
+ }
1876
+
1877
+ /*
1878
+ * call-seq:
1879
+ * iw.max_field_length -> number
1880
+ *
1881
+ * Return the current value of max_field_length
1882
+ */
1883
+ static VALUE
1884
+ frt_iw_get_max_field_length(VALUE self)
1885
+ {
1886
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1887
+ return INT2FIX(iw->config.max_field_length);
1888
+ }
1889
+
1890
+ /*
1891
+ * call-seq:
1892
+ * iw.max_field_length = max_field_length -> max_field_length
1893
+ *
1894
+ * Set the max_field_length parameter
1895
+ */
1896
+ static VALUE
1897
+ frt_iw_set_max_field_length(VALUE self, VALUE rval)
1898
+ {
1899
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1900
+ iw->config.max_field_length = FIX2INT(rval);
1901
+ return rval;
1902
+ }
1903
+
1904
+ /*
1905
+ * call-seq:
1906
+ * iw.use_compound_file -> number
1907
+ *
1908
+ * Return the current value of use_compound_file
1909
+ */
1910
+ static VALUE
1911
+ frt_iw_get_use_compound_file(VALUE self)
1912
+ {
1913
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1914
+ return iw->config.use_compound_file ? Qtrue : Qfalse;
1915
+ }
1916
+
1917
+ /*
1918
+ * call-seq:
1919
+ * iw.use_compound_file = use_compound_file -> use_compound_file
1920
+ *
1921
+ * Set the use_compound_file parameter
1922
+ */
1923
+ static VALUE
1924
+ frt_iw_set_use_compound_file(VALUE self, VALUE rval)
1925
+ {
1926
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1927
+ iw->config.use_compound_file = RTEST(rval);
1928
+ return rval;
1929
+ }
1930
+
1931
+ /****************************************************************************
1932
+ *
1933
+ * LazyDoc Methods
1934
+ *
1935
+ ****************************************************************************/
1936
+
1937
+ static void
1938
+ frt_lzd_date_free(void *p)
1939
+ {
1940
+ lazy_doc_close((LazyDoc *)p);
1941
+ }
1942
+
1943
+ static VALUE
1944
+ frt_lazy_df_load(VALUE self, VALUE rkey, LazyDocField *lazy_df)
1945
+ {
1946
+ VALUE rdata = Qnil;
1947
+ if (lazy_df) {
1948
+ if (lazy_df->size == 1) {
1949
+ char *data = lazy_df_get_data(lazy_df, 0);
1950
+ rdata = rb_str_new(data, lazy_df->len);
1951
+ } else {
1952
+ int i;
1953
+ rdata = rb_ary_new2(lazy_df->size);
1954
+ for (i = 0; i < lazy_df->size; i++) {
1955
+ char *data = lazy_df_get_data(lazy_df, i);
1956
+ RARRAY(rdata)->ptr[i] =
1957
+ rb_str_new(data, lazy_df->data[i].length);
1958
+ RARRAY(rdata)->len++;
1959
+ }
1960
+ }
1961
+ rb_hash_aset(self, rkey, rdata);
1962
+ }
1963
+ return rdata;
1964
+ }
1965
+
1966
+ /*
1967
+ * call-seq:
1968
+ * lazy_doc.default(key) -> string
1969
+ *
1970
+ * This method is used internally to lazily load fields. You should never
1971
+ * really need to call it yourself.
1972
+ */
1973
+ static VALUE
1974
+ frt_lzd_default(VALUE self, VALUE rkey)
1975
+ {
1976
+ LazyDoc *lazy_doc = (LazyDoc *)DATA_PTR(rb_ivar_get(self, id_data));
1977
+ char *field = NULL;
1978
+ switch (TYPE(rkey)) {
1979
+ case T_STRING:
1980
+ field = rs2s(rkey);
1981
+ rkey = ID2SYM(rb_intern(field));
1982
+ break;
1983
+ case T_SYMBOL:
1984
+ field = frt_field(rkey);
1985
+ break;
1986
+ default:
1987
+ rb_raise(rb_eArgError,
1988
+ "%s cannot be a key to a field. Field keys must "
1989
+ " be symbols.", rs2s(rb_obj_as_string(rkey)));
1990
+ break;
1991
+ }
1992
+ return frt_lazy_df_load(self, rkey, h_get(lazy_doc->field_dict, field));
1993
+ }
1994
+
1995
+ /*
1996
+ * call-seq:
1997
+ * lazy_doc.fields -> array of available fields
1998
+ *
1999
+ * Returns the list of fields stored for this particular document. If you try
2000
+ * to access any of these fields in the document the field will be loaded.
2001
+ * Try to access any other field an nil will be returned.
2002
+ */
2003
+ static VALUE
2004
+ frt_lzd_fields(VALUE self)
2005
+ {
2006
+ return rb_ivar_get(self, id_fields);
2007
+ }
2008
+
2009
+ /*
2010
+ * call-seq:
2011
+ * lazy_doc.load -> lazy_doc
2012
+ *
2013
+ * Load all unloaded fields in the document from the index.
2014
+ */
2015
+ static VALUE
2016
+ frt_lzd_load(VALUE self)
2017
+ {
2018
+ LazyDoc *lazy_doc = (LazyDoc *)DATA_PTR(rb_ivar_get(self, id_data));
2019
+ int i;
2020
+ for (i = 0; i < lazy_doc->size; i++) {
2021
+ LazyDocField *lazy_df = lazy_doc->fields[i];
2022
+ frt_lazy_df_load(self, ID2SYM(rb_intern(lazy_df->name)), lazy_df);
2023
+ }
2024
+ return self;
2025
+ }
2026
+
2027
+ VALUE
2028
+ frt_get_lazy_doc(LazyDoc *lazy_doc)
2029
+ {
2030
+ int i;
2031
+ VALUE rfields = rb_ary_new2(lazy_doc->size);
2032
+
2033
+ VALUE self, rdata;
2034
+ self = rb_hash_new();
2035
+ OBJSETUP(self, cLazyDoc, T_HASH);
2036
+
2037
+ rdata = Data_Wrap_Struct(cLazyDocData, NULL, &frt_lzd_date_free, lazy_doc);
2038
+ rb_ivar_set(self, id_data, rdata);
2039
+
2040
+ for (i = 0; i < lazy_doc->size; i++) {
2041
+ RARRAY(rfields)->ptr[i] = ID2SYM(rb_intern(lazy_doc->fields[i]->name));
2042
+ RARRAY(rfields)->len++;
2043
+ }
2044
+ rb_ivar_set(self, id_fields, rfields);
2045
+
2046
+ return self;
2047
+ }
2048
+
2049
+ /****************************************************************************
2050
+ *
2051
+ * IndexReader Methods
2052
+ *
2053
+ ****************************************************************************/
2054
+
2055
+ void
2056
+ frt_ir_free(void *p)
2057
+ {
2058
+ object_del(p);
2059
+ ir_close((IndexReader *)p);
2060
+ }
2061
+
2062
+ void
2063
+ frt_ir_mark(void *p)
2064
+ {
2065
+ IndexReader *ir = (IndexReader *)p;
2066
+ frt_gc_mark(ir->store);
2067
+ }
2068
+
2069
+ static VALUE frt_ir_close(VALUE self);
2070
+
2071
+ void
2072
+ frt_mr_mark(void *p)
2073
+ {
2074
+ MultiReader *mr = (MultiReader *)p;
2075
+ int i;
2076
+ for (i = 0; i < mr->r_cnt; i++) {
2077
+ frt_gc_mark(mr->sub_readers[i]);
2078
+ }
2079
+ }
2080
+
2081
+ /*
2082
+ * call-seq:
2083
+ * IndexReader.new(dir) -> index_reader
2084
+ *
2085
+ * Create a new IndexReader. You can either pass a string path to a
2086
+ * file-system directory or an actual Ferret::Store::Directory object. For
2087
+ * example;
2088
+ *
2089
+ * dir = RAMDirectory.new()
2090
+ * iw = IndexReader.new(dir)
2091
+ *
2092
+ * dir = FSDirectory.new("/path/to/index")
2093
+ * iw = IndexReader.new(dir)
2094
+ *
2095
+ * iw = IndexReader.new("/path/to/index")
2096
+ *
2097
+ * You can also create a what used to be known as a MultiReader by passing an
2098
+ * array of IndexReader objects, Ferret::Store::Directory objects or
2099
+ * file-system paths;
2100
+ *
2101
+ * iw = IndexReader.new([dir, dir2, dir3])
2102
+ *
2103
+ * iw = IndexReader.new([reader1, reader2, reader3])
2104
+ *
2105
+ * iw = IndexReader.new(["/path/to/index1", "/path/to/index2"])
2106
+ */
2107
+ static VALUE
2108
+ frt_ir_init(VALUE self, VALUE rdir)
2109
+ {
2110
+ Store *store = NULL;
2111
+ IndexReader *ir;
2112
+ int i;
2113
+ FieldInfos *fis;
2114
+ VALUE rfield_num_map = rb_hash_new();
2115
+
2116
+ if (TYPE(rdir) == T_ARRAY) {
2117
+ VALUE rdirs = rdir;
2118
+ const int reader_cnt = RARRAY(rdir)->len;
2119
+ IndexReader **sub_readers = ALLOC_N(IndexReader *, reader_cnt);
2120
+ int i;
2121
+ for (i = 0; i < reader_cnt; i++) {
2122
+ rdir = RARRAY(rdirs)->ptr[i];
2123
+ switch (TYPE(rdir)) {
2124
+ case T_DATA:
2125
+ if (CLASS_OF(rdir) == cIndexReader) {
2126
+ Data_Get_Struct(rdir, IndexReader, sub_readers[i]);
2127
+ REF(sub_readers[i]);
2128
+ continue;
2129
+ } else if (RTEST(rb_obj_is_kind_of(rdir, cDirectory))) {
2130
+ store = DATA_PTR(rdir);
2131
+ } else {
2132
+ rb_raise(rb_eArgError, "A Multi-IndexReader can only "
2133
+ "be created from other IndexReaders, "
2134
+ "Directory objects or file-system paths. "
2135
+ "Not %s",
2136
+ rs2s(rb_obj_as_string(rdir)));
2137
+ }
2138
+ break;
2139
+ case T_STRING:
2140
+ frt_create_dir(rdir);
2141
+ store = open_fs_store(rs2s(rdir));
2142
+ DEREF(store);
2143
+ break;
2144
+ default:
2145
+ rb_raise(rb_eArgError, "%s isn't a valid directory "
2146
+ "argument. You should use either a String or "
2147
+ "a Directory",
2148
+ rs2s(rb_obj_as_string(rdir)));
2149
+ break;
2150
+ }
2151
+ sub_readers[i] = ir_open(store);
2152
+ }
2153
+ ir = mr_open(sub_readers, reader_cnt);
2154
+ Frt_Wrap_Struct(self, &frt_mr_mark, &frt_ir_free, ir);
2155
+ } else {
2156
+ switch (TYPE(rdir)) {
2157
+ case T_DATA:
2158
+ store = DATA_PTR(rdir);
2159
+ break;
2160
+ case T_STRING:
2161
+ frt_create_dir(rdir);
2162
+ store = open_fs_store(rs2s(rdir));
2163
+ DEREF(store);
2164
+ break;
2165
+ default:
2166
+ rb_raise(rb_eArgError, "%s isn't a valid directory argument. "
2167
+ "You should use either a String or a Directory",
2168
+ rs2s(rb_obj_as_string(rdir)));
2169
+ break;
2170
+ }
2171
+ ir = ir_open(store);
2172
+ Frt_Wrap_Struct(self, &frt_ir_mark, &frt_ir_free, ir);
2173
+ }
2174
+ object_add(ir, self);
2175
+
2176
+ fis = ir->fis;
2177
+ for (i = 0; i < fis->size; i++) {
2178
+ FieldInfo *fi = fis->fields[i];
2179
+ rb_hash_aset(rfield_num_map,
2180
+ ID2SYM(rb_intern(fi->name)),
2181
+ INT2FIX(fi->number));
2182
+ }
2183
+ rb_ivar_set(self, id_fld_num_map, rfield_num_map);
2184
+
2185
+ return self;
2186
+ }
2187
+
2188
+ /*
2189
+ * call-seq:
2190
+ * index_reader.set_norm(doc_id, field, val)
2191
+ *
2192
+ * Expert: change the boost value for a +field+ in document at +doc_id+.
2193
+ * +val+ should be an integer in the range 0..255 which corresponds to an
2194
+ * encoded float value.
2195
+ */
2196
+ static VALUE
2197
+ frt_ir_set_norm(VALUE self, VALUE rdoc_id, VALUE rfield, VALUE rval)
2198
+ {
2199
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2200
+ ir_set_norm(ir, FIX2INT(rdoc_id), frt_field(rfield), (uchar)NUM2CHR(rval));
2201
+ return self;
2202
+ }
2203
+
2204
+ /*
2205
+ * call-seq:
2206
+ * index_reader.norms(field) -> string
2207
+ *
2208
+ * Expert: Returns a string containing the norm values for a field. The
2209
+ * string length will be equal to the number of documents in the index and it
2210
+ * could have null bytes.
2211
+ */
2212
+ static VALUE
2213
+ frt_ir_norms(VALUE self, VALUE rfield)
2214
+ {
2215
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2216
+ uchar *norms;
2217
+ norms = ir_get_norms(ir, frt_field(rfield));
2218
+ if (norms) {
2219
+ return rb_str_new((char *)norms, ir->max_doc(ir));
2220
+ } else {
2221
+ return Qnil;
2222
+ }
2223
+ }
2224
+
2225
+ /*
2226
+ * call-seq:
2227
+ * index_reader.get_norms_into(field, buffer, offset) -> buffer
2228
+ *
2229
+ * Expert: Get the norm values into a string +buffer+ starting at +offset+.
2230
+ */
2231
+ static VALUE
2232
+ frt_ir_get_norms_into(VALUE self, VALUE rfield, VALUE rnorms, VALUE roffset)
2233
+ {
2234
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2235
+ int offset;
2236
+ offset = FIX2INT(roffset);
2237
+ Check_Type(rnorms, T_STRING);
2238
+ if (RSTRING(rnorms)->len < offset + ir->max_doc(ir)) {
2239
+ rb_raise(rb_eArgError, "supplied a string of length:%d to "
2240
+ "IndexReader#get_norms_into but needed a string of length "
2241
+ "offset:%d + maxdoc:%d",
2242
+ RSTRING(rnorms)->len, offset, ir->max_doc(ir));
2243
+ }
2244
+
2245
+ ir_get_norms_into(ir, frt_field(rfield),
2246
+ (uchar *)rs2s(rnorms) + offset);
2247
+ return rnorms;
2248
+ }
2249
+
2250
+ /*
2251
+ * call-seq:
2252
+ * index_reader.commit -> index_reader
2253
+ *
2254
+ * Commit any deletes made by this particular IndexReader to the index. This
2255
+ * will use open a Commit lock.
2256
+ */
2257
+ static VALUE
2258
+ frt_ir_commit(VALUE self)
2259
+ {
2260
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2261
+ ir_commit(ir);
2262
+ return self;
2263
+ }
2264
+
2265
+ /*
2266
+ * call-seq:
2267
+ * index_reader.close -> index_reader
2268
+ *
2269
+ * Close the IndexReader. This method also commits any deletions made by this
2270
+ * IndexReader. This method will be called explicitly by the garbage
2271
+ * collector but you should call it explicitly to commit any changes as soon
2272
+ * as possible and to close any locks held by the object to prevent locking
2273
+ * errors.
2274
+ */
2275
+ static VALUE
2276
+ frt_ir_close(VALUE self)
2277
+ {
2278
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2279
+ object_del(ir);
2280
+ Frt_Unwrap_Struct(self);
2281
+ ir_close(ir);
2282
+ return self;
2283
+ }
2284
+
2285
+ /*
2286
+ * call-seq:
2287
+ * index_reader.has_deletions? -> bool
2288
+ *
2289
+ * Return true if the index has any deletions, either uncommitted by this
2290
+ * IndexReader or committed by any other IndexReader.
2291
+ */
2292
+ static VALUE
2293
+ frt_ir_has_deletions(VALUE self)
2294
+ {
2295
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2296
+ return ir->has_deletions(ir) ? Qtrue : Qfalse;
2297
+ }
2298
+
2299
+ /*
2300
+ * call-seq:
2301
+ * index_reader.delete(doc_id) -> index_reader
2302
+ *
2303
+ * Delete document referenced internally by document id +doc_id+. The
2304
+ * document_id is the number used to reference documents in the index and is
2305
+ * returned by search methods.
2306
+ */
2307
+ static VALUE
2308
+ frt_ir_delete(VALUE self, VALUE rdoc_id)
2309
+ {
2310
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2311
+ ir_delete_doc(ir, FIX2INT(rdoc_id));
2312
+ return self;
2313
+ }
2314
+
2315
+ /*
2316
+ * call-seq:
2317
+ * index_reader.deleted?(doc_id) -> bool
2318
+ *
2319
+ * Returns true if the document at +doc_id+ has been deleted.
2320
+ */
2321
+ static VALUE
2322
+ frt_ir_is_deleted(VALUE self, VALUE rdoc_id)
2323
+ {
2324
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2325
+ return ir->is_deleted(ir, FIX2INT(rdoc_id)) ? Qtrue : Qfalse;
2326
+ }
2327
+
2328
+ /*
2329
+ * call-seq:
2330
+ * index_reader.max_doc -> number
2331
+ *
2332
+ * Returns 1 + the maximum document id in the index. It is the
2333
+ * document_id that will be used by the next document added to the index. If
2334
+ * there are no deletions, this number also refers to the number of documents
2335
+ * in the index.
2336
+ */
2337
+ static VALUE
2338
+ frt_ir_max_doc(VALUE self)
2339
+ {
2340
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2341
+ return INT2FIX(ir->max_doc(ir));
2342
+ }
2343
+
2344
+ /*
2345
+ * call-seq:
2346
+ * index_reader.num_docs -> number
2347
+ *
2348
+ * Returns the number of accessible (not deleted) documents in the index.
2349
+ * This will be equal to IndexReader#max_doc if there have been no documents
2350
+ * deleted from the index.
2351
+ */
2352
+ static VALUE
2353
+ frt_ir_num_docs(VALUE self)
2354
+ {
2355
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2356
+ return INT2FIX(ir->num_docs(ir));
2357
+ }
2358
+
2359
+ /*
2360
+ * call-seq:
2361
+ * index_reader.undelete_all -> index_reader
2362
+ *
2363
+ * Undelete all deleted documents in the index. This is kind of like a
2364
+ * rollback feature. Not that once an index is committed or a merge happens
2365
+ * during index, deletions will be committed and undelete_all will have no
2366
+ * effect on these documents.
2367
+ */
2368
+ static VALUE
2369
+ frt_ir_undelete_all(VALUE self)
2370
+ {
2371
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2372
+ ir_undelete_all(ir);
2373
+ return self;
2374
+ }
2375
+
2376
+ static VALUE
2377
+ frt_get_doc_range(IndexReader *ir, int pos, int len, int max)
2378
+ {
2379
+ VALUE ary;
2380
+ int i;
2381
+ max = min2(max, pos+len);
2382
+ len = max - pos;
2383
+ ary = rb_ary_new2(len);
2384
+ for (i = 0; i < len; i++) {
2385
+ RARRAY(ary)->ptr[i] = frt_get_lazy_doc(ir->get_lazy_doc(ir, i + pos));
2386
+ RARRAY(ary)->len++;
2387
+ }
2388
+ return ary;
2389
+ }
2390
+
2391
+ /*
2392
+ * call-seq:
2393
+ * index_reader.get_document(doc_id) -> LazyDoc
2394
+ * index_reader[doc_id] -> LazyDoc
2395
+ *
2396
+ * Retrieve a document from the index. See LazyDoc for more details on the
2397
+ * document returned. Documents are referenced internally by document ids
2398
+ * which are returned by the Searchers search methods.
2399
+ */
2400
+ static VALUE
2401
+ frt_ir_get_doc(int argc, VALUE *argv, VALUE self)
2402
+ {
2403
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2404
+ VALUE arg1, arg2;
2405
+ long pos, len;
2406
+ long max = ir->max_doc(ir);
2407
+ rb_scan_args(argc, argv, "11", &arg1, &arg2);
2408
+ if (argc == 1) {
2409
+ if (FIXNUM_P(arg1)) {
2410
+ pos = FIX2INT(arg1);
2411
+ pos = (pos < 0) ? (max + pos) : pos;
2412
+ if (pos < 0 || pos >= max) {
2413
+ rb_raise(rb_eArgError, ":%d is out of range [%d..%d] for "
2414
+ "IndexReader#[]", pos, 0, max,
2415
+ rb_id2name(SYM2ID(argv)));
2416
+ }
2417
+ return frt_get_lazy_doc(ir->get_lazy_doc(ir, pos));
2418
+ }
2419
+
2420
+ /* check if idx is Range */
2421
+ switch (rb_range_beg_len(arg1, &pos, &len, max, 0)) {
2422
+ case Qfalse:
2423
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for "
2424
+ "IndexReader.get_document(index)",
2425
+ rb_id2name(SYM2ID(argv)));
2426
+ case Qnil:
2427
+ return Qnil;
2428
+ default:
2429
+ return frt_get_doc_range(ir, pos, len, max);
2430
+ }
2431
+ }
2432
+ else {
2433
+ pos = FIX2LONG(arg1);
2434
+ len = FIX2LONG(arg2);
2435
+ return frt_get_doc_range(ir, pos, len, max);
2436
+ }
2437
+ }
2438
+
2439
+ /*
2440
+ * call-seq:
2441
+ * index_reader.is_latest? -> bool
2442
+ *
2443
+ * Return true if the index version referenced by this IndexReader is the
2444
+ * latest version of the index. If it isn't you should close and reopen the
2445
+ * index to search the latest documents added to the index.
2446
+ */
2447
+ static VALUE
2448
+ frt_ir_is_latest(VALUE self)
2449
+ {
2450
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2451
+ return ir_is_latest(ir) ? Qtrue : Qfalse;
2452
+ }
2453
+
2454
+ /*
2455
+ * call-seq:
2456
+ * index_reader.term_vector(doc_id, field) -> TermVector
2457
+ *
2458
+ * Return the TermVector for the field +field+ in the document at +doc_id+ in
2459
+ * the index. Return nil of no such term_vector exists. See TermVector.
2460
+ */
2461
+ static VALUE
2462
+ frt_ir_term_vector(VALUE self, VALUE rdoc_id, VALUE rfield)
2463
+ {
2464
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2465
+ TermVector *tv;
2466
+ VALUE rtv;
2467
+ tv = ir->term_vector(ir, FIX2INT(rdoc_id), frt_field(rfield));
2468
+ if (tv) {
2469
+ rtv = frt_get_tv(tv);
2470
+ tv_destroy(tv);
2471
+ return rtv;
2472
+ }
2473
+ else {
2474
+ return Qnil;
2475
+ }
2476
+ }
2477
+
2478
+ static void
2479
+ frt_add_each_tv(void *key, void *value, void *rtvs)
2480
+ {
2481
+ rb_hash_aset((VALUE)rtvs, ID2SYM(rb_intern(key)), frt_get_tv(value));
2482
+ }
2483
+
2484
+ /*
2485
+ * call-seq:
2486
+ * index_reader.term_vectors(doc_id) -> hash of TermVector
2487
+ *
2488
+ * Return the TermVectors for the document at +doc_id+ in the index. The
2489
+ * value returned is a hash of the TermVectors for each field in the document
2490
+ * and they are referenced by field names (as symbols).
2491
+ */
2492
+ static VALUE
2493
+ frt_ir_term_vectors(VALUE self, VALUE rdoc_id)
2494
+ {
2495
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2496
+ HashTable *tvs = ir->term_vectors(ir, FIX2INT(rdoc_id));
2497
+ VALUE rtvs = rb_hash_new();
2498
+ h_each(tvs, &frt_add_each_tv, (void *)rtvs);
2499
+ h_destroy(tvs);
2500
+
2501
+ return rtvs;
2502
+ }
2503
+
2504
+ /*
2505
+ * call-seq:
2506
+ * index_reader.term_docs -> TermDocEnum
2507
+ *
2508
+ * Builds a TermDocEnum (term-document enumerator) for the index. You can use
2509
+ * this object to iterate through the documents in which certain terms occur.
2510
+ * See TermDocEnum for more info.
2511
+ */
2512
+ static VALUE
2513
+ frt_ir_term_docs(VALUE self)
2514
+ {
2515
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2516
+ return frt_get_tde(self, ir->term_docs(ir));
2517
+ }
2518
+
2519
+ /*
2520
+ * call-seq:
2521
+ * index_reader.term_docs_for(field, term) -> TermDocEnum
2522
+ *
2523
+ * Builds a TermDocEnum to iterate through the documents that contain the
2524
+ * term +term+ in the field +field+. See TermDocEnum for more info.
2525
+ */
2526
+ static VALUE
2527
+ frt_ir_term_docs_for(VALUE self, VALUE rfield, VALUE rterm)
2528
+ {
2529
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2530
+ return frt_get_tde(self, ir_term_docs_for(ir,
2531
+ frt_field(rfield),
2532
+ StringValuePtr(rterm)));
2533
+ }
2534
+
2535
+ /*
2536
+ * call-seq:
2537
+ * index_reader.term_positions -> TermDocEnum
2538
+ *
2539
+ * Same as IndexReader#term_docs except the TermDocEnum will also allow you
2540
+ * to scan through the positions at which a term occurs. See TermDocEnum for
2541
+ * more info.
2542
+ */
2543
+ static VALUE
2544
+ frt_ir_term_positions(VALUE self)
2545
+ {
2546
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2547
+ return frt_get_tde(self, ir->term_positions(ir));
2548
+ }
2549
+
2550
+ /*
2551
+ * call-seq:
2552
+ * index_reader.term_positions_for(field, term) -> TermDocEnum
2553
+ *
2554
+ * Same as IndexReader#term_docs_for(field, term) except the TermDocEnum will
2555
+ * also allow you to scan through the positions at which a term occurs. See
2556
+ * TermDocEnum for more info.
2557
+ */
2558
+ static VALUE
2559
+ frt_ir_t_pos_for(VALUE self, VALUE rfield, VALUE rterm)
2560
+ {
2561
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2562
+ return frt_get_tde(self, ir_term_positions_for(ir,
2563
+ frt_field(rfield),
2564
+ StringValuePtr(rterm)));
2565
+ }
2566
+
2567
+ /*
2568
+ * call-seq:
2569
+ * index_reader.doc_freq(field, term) -> integer
2570
+ *
2571
+ * Return the number of documents in which the term +term+ appears in the
2572
+ * field +field+.
2573
+ */
2574
+ static VALUE
2575
+ frt_ir_doc_freq(VALUE self, VALUE rfield, VALUE rterm)
2576
+ {
2577
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2578
+ return INT2FIX(ir_doc_freq(ir,
2579
+ frt_field(rfield),
2580
+ StringValuePtr(rterm)));
2581
+ }
2582
+
2583
+ /*
2584
+ * call-seq:
2585
+ * index_reader.terms(field) -> TermEnum
2586
+ *
2587
+ * Returns a term enumerator which allows you to iterate through all the
2588
+ * terms in the field +field+ in the index.
2589
+ */
2590
+ static VALUE
2591
+ frt_ir_terms(VALUE self, VALUE rfield)
2592
+ {
2593
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2594
+ return frt_get_te(self, ir_terms(ir, frt_field(rfield)));
2595
+ }
2596
+
2597
+ /*
2598
+ * call-seq:
2599
+ * index_reader.terms_from(field, term) -> TermEnum
2600
+ *
2601
+ * Same as IndexReader#terms(fields) except that it starts the enumerator off
2602
+ * at term +term+.
2603
+ */
2604
+ static VALUE
2605
+ frt_ir_terms_from(VALUE self, VALUE rfield, VALUE rterm)
2606
+ {
2607
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2608
+ return frt_get_te(self, ir_terms_from(ir,
2609
+ frt_field(rfield),
2610
+ StringValuePtr(rterm)));
2611
+ }
2612
+
2613
+ /*
2614
+ * call-seq:
2615
+ * index_reader.term_count(field) -> int
2616
+ *
2617
+ * Same return a count of the number of terms in the field
2618
+ */
2619
+ static VALUE
2620
+ frt_ir_term_count(VALUE self, VALUE rfield)
2621
+ {
2622
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2623
+ TermEnum *te = ir_terms(ir, frt_field(rfield));
2624
+ int count = 0;
2625
+ while (te->next(te)) {
2626
+ count++;
2627
+ }
2628
+ te->close(te);
2629
+ return INT2FIX(count);
2630
+ }
2631
+
2632
+ /*
2633
+ * call-seq:
2634
+ * index_reader.fields -> array of field-names
2635
+ *
2636
+ * Returns an array of field names in the index. This can be used to pass to
2637
+ * the QueryParser so that the QueryParser knows how to expand the "*"
2638
+ * wild-card to all fields in the index. A list of field names can also be
2639
+ * gathered from the FieldInfos object.
2640
+ */
2641
+ static VALUE
2642
+ frt_ir_fields(VALUE self)
2643
+ {
2644
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2645
+ FieldInfos *fis = ir->fis;
2646
+ VALUE rfield_names = rb_ary_new();
2647
+ int i;
2648
+ for (i = 0; i < fis->size; i++) {
2649
+ rb_ary_push(rfield_names, ID2SYM(rb_intern(fis->fields[i]->name)));
2650
+ }
2651
+ return rfield_names;
2652
+ }
2653
+
2654
+ /*
2655
+ * call-seq:
2656
+ * index_reader.field_infos -> FieldInfos
2657
+ *
2658
+ * Get the FieldInfos object for this IndexReader.
2659
+ */
2660
+ static VALUE
2661
+ frt_ir_field_infos(VALUE self)
2662
+ {
2663
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2664
+ return frt_get_field_infos(ir->fis);
2665
+ }
2666
+
2667
+ /*
2668
+ * call-seq:
2669
+ * index_reader.tokenized_fields -> array of field-names
2670
+ *
2671
+ * Returns an array of field names of all of the tokenized fields in the
2672
+ * index. This can be used to pass to the QueryParser so that the QueryParser
2673
+ * knows how to expand the "*" wild-card to all fields in the index. A list
2674
+ * of field names can also be gathered from the FieldInfos object.
2675
+ */
2676
+ static VALUE
2677
+ frt_ir_tk_fields(VALUE self)
2678
+ {
2679
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2680
+ FieldInfos *fis = ir->fis;
2681
+ VALUE rfield_names = rb_ary_new();
2682
+ int i;
2683
+ for (i = 0; i < fis->size; i++) {
2684
+ if (!fi_is_tokenized(fis->fields[i])) continue;
2685
+ rb_ary_push(rfield_names, ID2SYM(rb_intern(fis->fields[i]->name)));
2686
+ }
2687
+ return rfield_names;
2688
+ }
2689
+
2690
+ /*
2691
+ * call-seq:
2692
+ * index_reader.version -> int
2693
+ *
2694
+ * Returns the current version of the index reader.
2695
+ */
2696
+ static VALUE
2697
+ frt_ir_version(VALUE self)
2698
+ {
2699
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2700
+ return ULL2NUM(ir->sis->version);
2701
+ }
2702
+
2703
+ /****************************************************************************
2704
+ *
2705
+ * Init Functions
2706
+ *
2707
+ ****************************************************************************/
2708
+
2709
+
2710
+ /*
2711
+ * Document-class: Ferret::Index::FieldInfo
2712
+ *
2713
+ * == Summary
2714
+ *
2715
+ * The FieldInfo class is the field descriptor for the index. It specifies
2716
+ * whether a field is compressed or not or whether it should be indexed and
2717
+ * tokenized. Every field has a name which must be a symbol. There are three
2718
+ * properties that you can set, +:store+, +:index+ and +:term_vector+. You
2719
+ * can also set the default +:boost+ for a field as well.
2720
+ *
2721
+ * == Properties
2722
+ *
2723
+ * === :store
2724
+ *
2725
+ * The +:store+ property allows you to specify how a field is stored. You can
2726
+ * leave a field unstored (+:no+), store it in its original format (+:yes+)
2727
+ * or store it in compressed format (+:compressed+). By default the document
2728
+ * is stored in its original format. If the field is large and it is stored
2729
+ * elsewhere where it is easily accessible you might want to leave it
2730
+ * unstored. This will keep the index size a lot smaller and make the
2731
+ * indexing process a lot faster. For example, you should probably leave the
2732
+ * +:content+ field unstored when indexing all the documents in your
2733
+ * file-system.
2734
+ *
2735
+ * === :index
2736
+ *
2737
+ * The +:index+ property allows you to specify how a field is indexed. A
2738
+ * field must be indexed to be searchable. However, a field doesn't need to
2739
+ * be indexed to be stored in the Ferret index. You may want to use the index
2740
+ * as a simple database and store things like images or MP3s in the index. By
2741
+ * default each field is indexed and tokenized (split into tokens) (+:yes+).
2742
+ * If you don't want to index the field use +:no+. If you want the field
2743
+ * indexed but not tokenized, use +:untokenized+. Do this for the fields you
2744
+ * wish to sort by. There are two other values for +:index+; +:omit_norms+
2745
+ * and +:untokenized_omit_norms+. These values correspond to +:yes+ and
2746
+ * +:untokenized+ respectively and are useful if you are not boosting any
2747
+ * fields and you'd like to speed up the index. The norms file is the file
2748
+ * which contains the boost values for each document for a particular field.
2749
+ *
2750
+ * === :term_vector
2751
+ *
2752
+ * See TermVector for a description of term-vectors. You can specify whether
2753
+ * or not you would like to store term-vectors. The available options are
2754
+ * +:no+, +:yes+, +:with_positions+, +:with_offsets+ and
2755
+ * +:with_positions_offsets+. Note that you need to store the positions to
2756
+ * associate offsets with individual terms in the term_vector.
2757
+ *
2758
+ * == Property Table
2759
+ *
2760
+ * Property Value Description
2761
+ * ------------------------------------------------------------------------
2762
+ * :store | :no | Don't store field
2763
+ * | |
2764
+ * | :yes (default) | Store field in its original
2765
+ * | | format. Use this value if you
2766
+ * | | want to highlight matches.
2767
+ * | | or print match excerpts a la
2768
+ * | | Google search.
2769
+ * | |
2770
+ * | :compressed | Store field in compressed
2771
+ * | | format.
2772
+ * -------------|-------------------------|------------------------------
2773
+ * :index | :no | Do not make this field
2774
+ * | | searchable.
2775
+ * | |
2776
+ * | :yes (default) | Make this field searchable and
2777
+ * | | tokenize its contents.
2778
+ * | |
2779
+ * | :untokenized | Make this field searchable but
2780
+ * | | do not tokenize its contents.
2781
+ * | | use this value for fields you
2782
+ * | | wish to sort by.
2783
+ * | |
2784
+ * | :omit_norms | Same as :yes except omit the
2785
+ * | | norms file. The norms file can
2786
+ * | | be omitted if you don't boost
2787
+ * | | any fields and you don't need
2788
+ * | | scoring based on field length.
2789
+ * | |
2790
+ * | :untokenized_omit_norms | Same as :untokenized except omit
2791
+ * | | the norms file. Norms files can
2792
+ * | | be omitted if you don't boost
2793
+ * | | any fields and you don't need
2794
+ * | | scoring based on field length.
2795
+ * | |
2796
+ * -------------|-------------------------|------------------------------
2797
+ * :term_vector | :no | Don't store term-vectors
2798
+ * | |
2799
+ * | :yes | Store term-vectors without
2800
+ * | | storing positions or offsets.
2801
+ * | |
2802
+ * | :with_positions | Store term-vectors with
2803
+ * | | positions.
2804
+ * | |
2805
+ * | :with_offsets | Store term-vectors with
2806
+ * | | offsets.
2807
+ * | |
2808
+ * | :with_positions_offsets | Store term-vectors with
2809
+ * | (default) | positions and offsets.
2810
+ * -------------|-------------------------|------------------------------
2811
+ * :boost | Float | The boost property is used to
2812
+ * | | set the default boost for a
2813
+ * | | field. This boost value will
2814
+ * | | be used for all instances of the
2815
+ * | | field in the index unless
2816
+ * | | otherwise specified when you
2817
+ * | | create the field. All values
2818
+ * | | should be positive.
2819
+ * | |
2820
+ *
2821
+ * == Examples
2822
+ *
2823
+ * fi = FieldInfo.new(:title, :index => :untokenized, :term_vector => :no,
2824
+ * :boost => 10.0)
2825
+ *
2826
+ * fi = FieldInfo.new(:content)
2827
+ *
2828
+ * fi = FieldInfo.new(:created_on, :index => :untokenized_omit_norms,
2829
+ * :term_vector => :no)
2830
+ *
2831
+ * fi = FieldInfo.new(:image, :store => :compressed, :index => :no,
2832
+ * :term_vector => :no)
2833
+ */
2834
+ static void
2835
+ Init_FieldInfo(void)
2836
+ {
2837
+ sym_store = ID2SYM(rb_intern("store"));
2838
+ sym_index = ID2SYM(rb_intern("index"));
2839
+ sym_term_vector = ID2SYM(rb_intern("term_vector"));
2840
+
2841
+ sym_compress = ID2SYM(rb_intern("compress"));
2842
+ sym_compressed = ID2SYM(rb_intern("compressed"));
2843
+
2844
+ sym_untokenized = ID2SYM(rb_intern("untokenized"));
2845
+ sym_omit_norms = ID2SYM(rb_intern("omit_norms"));
2846
+ sym_untokenized_omit_norms = ID2SYM(rb_intern("untokenized_omit_norms"));
2847
+
2848
+ sym_with_positions = ID2SYM(rb_intern("with_positions"));
2849
+ sym_with_offsets = ID2SYM(rb_intern("with_offsets"));
2850
+ sym_with_positions_offsets = ID2SYM(rb_intern("with_positions_offsets"));
2851
+
2852
+ cFieldInfo = rb_define_class_under(mIndex, "FieldInfo", rb_cObject);
2853
+ rb_define_alloc_func(cFieldInfo, frt_data_alloc);
2854
+
2855
+ rb_define_method(cFieldInfo, "initialize", frt_fi_init, -1);
2856
+ rb_define_method(cFieldInfo, "name", frt_fi_name, 0);
2857
+ rb_define_method(cFieldInfo, "stored?", frt_fi_is_stored, 0);
2858
+ rb_define_method(cFieldInfo, "compressed?", frt_fi_is_compressed, 0);
2859
+ rb_define_method(cFieldInfo, "indexed?", frt_fi_is_indexed, 0);
2860
+ rb_define_method(cFieldInfo, "tokenized?", frt_fi_is_tokenized, 0);
2861
+ rb_define_method(cFieldInfo, "omit_norms?", frt_fi_omit_norms, 0);
2862
+ rb_define_method(cFieldInfo, "store_term_vector?",
2863
+ frt_fi_store_term_vector, 0);
2864
+ rb_define_method(cFieldInfo, "store_positions?",
2865
+ frt_fi_store_positions, 0);
2866
+ rb_define_method(cFieldInfo, "store_offsets?",
2867
+ frt_fi_store_offsets, 0);
2868
+ rb_define_method(cFieldInfo, "has_norms?", frt_fi_has_norms, 0);
2869
+ rb_define_method(cFieldInfo, "boost", frt_fi_boost, 0);
2870
+ rb_define_method(cFieldInfo, "to_s", frt_fi_to_s, 0);
2871
+ }
2872
+
2873
+ /*
2874
+ * Document-class: Ferret::Index::FieldInfos
2875
+ *
2876
+ * == Summary
2877
+ *
2878
+ * The FieldInfos class holds all the field descriptors for an index. It is
2879
+ * this class that is used to create a new index using the
2880
+ * FieldInfos#create_index method. If you are happy with the default
2881
+ * properties for FieldInfo then you don't need to worry about this class.
2882
+ * IndexWriter can create the index for you. Otherwise you should set up the
2883
+ * index like in the example;
2884
+ *
2885
+ * == Example
2886
+ *
2887
+ * field_infos = FieldInfos.new(:term_vector => :no)
2888
+ *
2889
+ * field_infos.add_field(:title, :index => :untokenized, :term_vector => :no,
2890
+ * :boost => 10.0)
2891
+ *
2892
+ * field_infos.add_field(:content)
2893
+ *
2894
+ * field_infos.add_field(:created_on, :index => :untokenized_omit_norms,
2895
+ * :term_vector => :no)
2896
+ *
2897
+ * field_infos.add_field(:image, :store => :compressed, :index => :no,
2898
+ * :term_vector => :no)
2899
+ *
2900
+ * field_infos.create_index("/path/to/index")
2901
+ *
2902
+ * == Default Properties
2903
+ *
2904
+ * See FieldInfo for the available field property values.
2905
+ *
2906
+ * When you create the FieldInfos object you specify the default properties
2907
+ * for the fields. Often you'll specify all of the fields in the index before
2908
+ * you create the index so the default values won't come into play. However,
2909
+ * it is possible to continue to dynamically add fields as indexing goes
2910
+ * along. If you add a document to the index which has fields that the index
2911
+ * doesn't know about then the default properties are used for the new field.
2912
+ */
2913
+ static void
2914
+ Init_FieldInfos(void)
2915
+ { /* Registers the FieldInfos class and its methods under mIndex. */
2916
+ Init_FieldInfo(); /* define the FieldInfo class first */
2917
+ /* Create the class; frt_data_alloc is installed as its allocator. */
2918
+ cFieldInfos = rb_define_class_under(mIndex, "FieldInfos", rb_cObject);
2919
+ rb_define_alloc_func(cFieldInfos, frt_data_alloc);
2920
+ /* Method table. In the Ruby C API an arity of -1 means (argc, argv, self). */
2921
+ rb_define_method(cFieldInfos, "initialize", frt_fis_init, -1);
2922
+ rb_define_method(cFieldInfos, "to_a", frt_fis_to_a, 0);
2923
+ rb_define_method(cFieldInfos, "[]", frt_fis_get, 1);
2924
+ rb_define_method(cFieldInfos, "add", frt_fis_add, 1);
2925
+ rb_define_method(cFieldInfos, "<<", frt_fis_add, 1); /* same C function as "add" */
2926
+ rb_define_method(cFieldInfos, "add_field", frt_fis_add_field, -1);
2927
+ rb_define_method(cFieldInfos, "each", frt_fis_each, 0);
2928
+ rb_define_method(cFieldInfos, "to_s", frt_fis_to_s, 0);
2929
+ rb_define_method(cFieldInfos, "size", frt_fis_size, 0);
2930
+ rb_define_method(cFieldInfos, "create_index",
2931
+ frt_fis_create_index, 1);
2932
+ rb_define_method(cFieldInfos, "fields", frt_fis_get_fields, 0);
2933
+ rb_define_method(cFieldInfos, "tokenized_fields", frt_fis_get_tk_fields, 0);
2934
+ }
2935
+
2936
+ /*
2937
+ * Document-class: Ferret::Index::TermEnum
2938
+ *
2939
+ * == Summary
2940
+ *
2941
+ * The TermEnum object is used to iterate through the terms in a field. To
2942
+ * get a TermEnum you need to use the IndexReader#terms(field) method.
2943
+ *
2944
+ * == Example
2945
+ *
2946
+ * te = index_reader.terms(:content)
2947
+ *
2948
+ * te.each {|term, doc_freq| puts "#{term} occurred #{doc_freq} times" }
2949
+ *
2950
+ * # or you could do it like this;
2951
+ * te = index_reader.terms(:content)
2952
+ *
2953
+ * while te.next?
2954
+ * puts "#{te.term} occurred in #{te.doc_freq} documents in the index"
2955
+ * end
2956
+ */
2957
+ static void
2958
+ Init_TermEnum(void)
2959
+ { /* Registers the TermEnum class and its methods under mIndex. */
2960
+ id_term = rb_intern("@term"); /* cache the ID for the name "@term" */
2961
+ /* Create the class; frt_data_alloc is installed as its allocator. */
2962
+ cTermEnum = rb_define_class_under(mIndex, "TermEnum", rb_cObject);
2963
+ rb_define_alloc_func(cTermEnum, frt_data_alloc);
2964
+ /* Method table. In the Ruby C API an arity of -1 means (argc, argv, self). */
2965
+ rb_define_method(cTermEnum, "next?", frt_te_next, 0);
2966
+ rb_define_method(cTermEnum, "term", frt_te_term, 0);
2967
+ rb_define_method(cTermEnum, "doc_freq", frt_te_doc_freq, 0);
2968
+ rb_define_method(cTermEnum, "skip_to", frt_te_skip_to, 1);
2969
+ rb_define_method(cTermEnum, "each", frt_te_each, 0);
2970
+ rb_define_method(cTermEnum, "field=", frt_te_set_field, 1);
2971
+ rb_define_method(cTermEnum, "set_field",frt_te_set_field, 1); /* same C function as "field=" */
2972
+ rb_define_method(cTermEnum, "to_json", frt_te_to_json, -1);
2973
+ }
2974
+
2975
+ /*
2976
+ * Document-class: Ferret::Index::TermDocEnum
2977
+ *
2978
+ * == Summary
2979
+ *
2980
+ * Use a TermDocEnum to iterate through the documents that contain a
2981
+ * particular term. You can also iterate through the positions which the term
2982
+ * occurs in a document.
2983
+ *
2984
+ *
2985
+ * == Example
2986
+ *
2987
+ * tde = index_reader.term_docs_for(:content, "fox")
2988
+ *
2989
+ * tde.each do |doc_id, freq|
2990
+ * puts "fox appeared #{freq} times in document #{doc_id}:"
2991
+ * positions = []
2992
+ * tde.each_position {|pos| positions << pos}
2993
+ * puts " #{positions.join(', ')}"
2994
+ * end
2995
+ *
2996
+ * # or you can do it like this;
2997
+ * tde.seek(:title, "red")
2998
+ * while tde.next?
2999
+ * puts "red appeared #{tde.freq} times in document #{tde.doc}:"
3000
+ * positions = []
3001
+ * while pos = tde.next_position
3002
+ * positions << pos
3003
+ * end
3004
+ * puts " #{positions.join(', ')}"
3005
+ * end
3006
+ */
3007
+ static void
3008
+ Init_TermDocEnum(void)
3009
+ { /* Registers the TermDocEnum class and its methods under mIndex. */
3010
+ id_fld_num_map = rb_intern("@field_num_map"); /* cache the ID for "@field_num_map" */
3011
+ id_field_num = rb_intern("@field_num"); /* cache the ID for "@field_num" */
3012
+ /* Create the class; frt_data_alloc is installed as its allocator. */
3013
+ cTermDocEnum = rb_define_class_under(mIndex, "TermDocEnum", rb_cObject);
3014
+ rb_define_alloc_func(cTermDocEnum, frt_data_alloc);
3015
+ rb_define_method(cTermDocEnum, "seek", frt_tde_seek, 2);
3016
+ rb_define_method(cTermDocEnum, "seek_term_enum", frt_tde_seek_te, 1);
3017
+ rb_define_method(cTermDocEnum, "doc", frt_tde_doc, 0);
3018
+ rb_define_method(cTermDocEnum, "freq", frt_tde_freq, 0);
3019
+ rb_define_method(cTermDocEnum, "next?", frt_tde_next, 0);
3020
+ rb_define_method(cTermDocEnum, "next_position", frt_tde_next_position, 0);
3021
+ rb_define_method(cTermDocEnum, "each", frt_tde_each, 0);
3022
+ rb_define_method(cTermDocEnum, "each_position", frt_tde_each_position, 0);
3023
+ rb_define_method(cTermDocEnum, "skip_to", frt_tde_skip_to, 1);
3024
+ rb_define_method(cTermDocEnum, "to_json", frt_tde_to_json, -1); /* -1: takes (argc, argv, self) */
3025
+ }
3026
+
3027
+ /* rdochack
3028
+ cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
3029
+ */
3030
+
3031
+ /*
3032
+ * Document-class: Ferret::Index::TermVector::TVOffsets
3033
+ *
3034
+ * == Summary
3035
+ *
3036
+ * Holds the start and end byte-offsets of a term in a field. For example, if
3037
+ * the field was "the quick brown fox" then the start and end offsets of:
3038
+ *
3039
+ * ["the", "quick", "brown", "fox"]
3040
+ *
3041
+ * Would be:
3042
+ *
3043
+ * [(0,3), (4,9), (10,15), (16,19)]
3044
+ *
3045
+ * See the Analysis module for more information on setting the offsets.
3046
+ */
3047
+ static void
3048
+ Init_TVOffsets(void)
3049
+ { /* Defines the TVOffsets Struct class with members "start" and "end". */
3050
+ const char *tv_offsets_class = "TVOffsets";
3051
+ /* rdochack
3052
+ cTVOffsets = rb_define_class_under(cTermVector, "TVOffsets", rb_cObject);
3053
+ */
3054
+ cTVOffsets = rb_struct_define(tv_offsets_class, "start", "end", NULL); /* member list is NULL-terminated */
3055
+ rb_set_class_path(cTVOffsets, cTermVector, tv_offsets_class); /* reads cTermVector, so Init_TermVector must have assigned it */
3056
+ rb_const_set(mIndex, rb_intern(tv_offsets_class), cTVOffsets); /* NOTE(review): constant is set on mIndex although the class path names cTermVector - confirm this is intended */
3057
+ }
3058
+
3059
+ /*
3060
+ * Document-class: Ferret::Index::TermVector::TVTerm
3061
+ *
3062
+ * == Summary
3063
+ *
3064
+ * The TVTerm class holds the term information for each term in a TermVector.
3065
+ * That is it holds the term's text and its positions in the document. You
3066
+ * can use those positions to reference the offsets for the term.
3067
+ *
3068
+ * == Example
3069
+ *
3070
+ * tv = index_reader.term_vector(:content)
3071
+ * tv_term = tv.terms.find {|tvt| tvt.text == "fox"}
3072
+ * offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
3073
+ */
3074
+ static void
3075
+ Init_TVTerm(void)
3076
+ { /* Defines the TVTerm Struct class with members "text" and "positions". */
3077
+ const char *tv_term_class = "TVTerm";
3078
+ /* rdochack
3079
+ cTVTerm = rb_define_class_under(cTermVector, "TVTerm", rb_cObject);
3080
+ */
3081
+ cTVTerm = rb_struct_define(tv_term_class, "text", "positions", NULL); /* member list is NULL-terminated */
3082
+ rb_set_class_path(cTVTerm, cTermVector, tv_term_class); /* reads cTermVector, so Init_TermVector must have assigned it */
3083
+ rb_const_set(mIndex, rb_intern(tv_term_class), cTVTerm); /* NOTE(review): constant is set on mIndex although the class path names cTermVector - confirm this is intended */
3084
+ }
3085
+
3086
+ /*
3087
+ * Document-class: Ferret::Index::TermVector
3088
+ *
3089
+ * == Summary
3090
+ *
3091
+ * TermVectors are most commonly used for creating search result excerpts and
3092
+ * highlight search matches in results. This is all done internally so you
3093
+ * won't need to worry about the TermVector object. There are some other
3094
+ * reasons you may want to use the TermVectors object however. For example,
3095
+ * you may wish to see which terms are the most commonly occurring terms in a
3096
+ * document to implement a MoreLikeThis search.
3097
+ *
3098
+ * == Example
3099
+ *
3100
+ * tv = index_reader.term_vector(:content)
3101
+ * tv_term = tv.terms.find {|tvt| tvt.text == "fox"}
3102
+ *
3103
+ * # get the term frequency
3104
+ * term_freq = tv_term.positions.size
3105
+ *
3106
+ * # get the offsets for a term
3107
+ * offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
3108
+ *
3109
+ * == Note
3110
+ *
3111
+ * +positions+ and +offsets+ can be +nil+ depending on what you set the
3112
+ * +:term_vector+ to when you set the FieldInfo object for the field. Note in
3113
+ * particular that you need to store both positions and offsets if you want
3114
+ * to associate offsets with particular terms.
3115
+ */
3116
+ static void
3117
+ Init_TermVector(void)
3118
+ { /* Defines the TermVector Struct class, then the two TV* helper structs. */
3119
+ const char *tv_class = "TermVector";
3120
+ /* rdochack
3121
+ cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
3122
+ */
3123
+ cTermVector = rb_struct_define(tv_class,
3124
+ "field", "terms", "offsets", NULL); /* member list is NULL-terminated */
3125
+ rb_set_class_path(cTermVector, mIndex, tv_class);
3126
+ rb_const_set(mIndex, rb_intern(tv_class), cTermVector);
3127
+ /* Both helpers below read cTermVector, so they run after it is assigned. */
3128
+ Init_TVOffsets();
3129
+ Init_TVTerm();
3130
+ }
3131
+
3132
+ /*
3133
+ * Document-class: Ferret::Index::IndexWriter
3134
+ *
3135
+ * == Summary
3136
+ *
3137
+ * The IndexWriter is the class used to add documents to an index. You can
3138
+ * also delete documents from the index using this class. The indexing
3139
+ * process is highly customizable and the IndexWriter has the following
3140
+ * parameters;
3141
+ *
3142
+ * dir:: This is a Ferret::Store::Directory object. You
3143
+ * should either pass a +:dir+ or a +:path+ when
3144
+ * creating an index.
3145
+ * path:: A string representing the path to the index
3146
+ * directory. If you are creating the index for the
3147
+ * first time the directory will be created if it's
3148
+ * missing. You should not choose a directory which
3149
+ * contains other files as they could be over-written.
3150
+ * To protect against this set +:create_if_missing+ to
3151
+ * false.
3152
+ * create_if_missing:: Default: true. Create the index if no index is
3153
+ * found in the specified directory. Otherwise, use
3154
+ * the existing index.
3155
+ * create:: Default: false. Creates the index, even if one
3156
+ * already exists. That means any existing index will
3157
+ * be deleted. It is probably better to use the
3158
+ * create_if_missing option so that the index is only
3159
+ * created the first time when it doesn't exist.
3160
+ * field_infos:: Default: FieldInfos.new. The FieldInfos object to use
3161
+ * when creating a new index if +:create_if_missing+ or
3162
+ * +:create+ is set to true. If an existing index is
3163
+ * opened then this parameter is ignored.
3164
+ * analyzer:: Default: Ferret::Analysis::StandardAnalyzer.
3165
+ * Sets the default analyzer for the index. This is
3166
+ * used by both the IndexWriter and the QueryParser
3167
+ * to tokenize the input. The default is the
3168
+ * StandardAnalyzer.
3169
+ * chunk_size:: Default: 0x100000 or 1Mb. Memory performance tuning
3170
+ * parameter. Sets the default size of chunks of memory
3171
+ * malloced for use during indexing. You can usually
3172
+ * leave this parameter as is.
3173
+ * max_buffer_memory:: Default: 0x1000000 or 16Mb. Memory performance
3174
+ * tuning parameter. Sets the amount of memory to be
3175
+ * used by the indexing process. Set to a larger value
3176
+ * to increase indexing speed. Note that this only
3177
+ * includes memory used by the indexing process, not
3178
+ * the rest of your ruby application.
3179
+ * term_index_interval:: Default: 128. The skip interval between terms in the
3180
+ * term dictionary. A smaller value will possibly
3181
+ * increase search performance while also increasing
3182
+ * memory usage and negatively impacting
3183
+ * indexing performance.
3184
+ * doc_skip_interval:: Default: 16. The skip interval for document numbers
3185
+ * in the index. As with +:term_index_interval+ you
3186
+ * have a trade-off. A smaller number may increase
3187
+ * search performance while also increasing memory
3188
+ * usage and negatively impacting indexing
3189
+ * performance.
3190
+ * merge_factor:: Default: 10. This must never be less than 2.
3191
+ * Specifies the number of segments of a certain size
3192
+ * that must exist before they are merged. A larger
3193
+ * value will improve indexing performance while
3194
+ * slowing search performance.
3195
+ * max_buffered_docs:: Default: 10000. The maximum number of documents that
3196
+ * may be stored in memory before being written to the
3197
+ * index. If you have a lot of memory and are indexing
3198
+ * a large number of small documents (like products in
3199
+ * a product database for example) you may want to set
3200
+ * this to a much higher number (like
3201
+ * Ferret::FIX_INT_MAX). If you are worried about your
3202
+ * application crashing in the middle of indexing you
3203
+ * might set this to a smaller number so that the index
3204
+ * is committed more often. This is like having an
3205
+ * auto-save in a word processor application.
3206
+ * max_merge_docs:: Set this value to limit the number of documents that
3207
+ * go into a single segment. Use this to avoid
3208
+ * extremely long merge times during indexing which can
3209
+ * make your application seem unresponsive. This is
3210
+ * only necessary for very large indexes (millions of
3211
+ * documents).
3212
+ * max_field_length:: Default: 10000. The maximum number of terms added to
3213
+ * a single field. This can be useful to protect the
3214
+ * indexer when indexing documents from the web for
3215
+ * example. Usually the most important terms will occur
3216
+ * early on in a document so you can often safely
3217
+ * ignore the terms in a field after a certain number
3218
+ * of them. If you wanted to speed up indexing and save
3219
+ * space in your index you may only want to index the
3220
+ * first 1000 terms in a field. On the other hand, if
3221
+ * you want to be more thorough and you are indexing
3222
+ * documents from your file-system you may set this
3223
+ * parameter to Ferret::FIX_INT_MAX.
3224
+ * use_compound_file:: Default: true. Uses a compound file to store the
3225
+ * index. This prevents an error being raised for
3226
+ * having too many files open at the same time. The
3227
+ * default is true but performance is better if this is
3228
+ * set to false.
3229
+ *
3230
+ *
3231
+ * === Deleting Documents
3232
+ *
3233
+ * Both IndexReader and IndexWriter allow you to delete documents. You should
3234
+ * use the IndexReader to delete documents by document id and IndexWriter to
3235
+ * delete documents by term which we'll explain now. It is preferable to
3236
+ * delete documents from an index using IndexWriter for performance reasons.
3237
+ * To delete documents using the IndexWriter you should give each document in
3238
+ * the index a unique ID. If you are indexing documents from the file-system
3239
+ * this unique ID will be the full file path. If indexing documents from the
3240
+ * database you should use the primary key as the ID field. You can then
3241
+ * use the delete method to delete a file referenced by the ID. For example;
3242
+ *
3243
+ * index_writer.delete(:id, "/path/to/indexed/file")
3244
+ */
3245
+ void
3246
+ Init_IndexWriter(void)
3247
+ { /* Registers the IndexWriter class, its constants and its methods under mIndex. */
3248
+ id_boost = rb_intern("boost");
3249
+ /* Symbols for the index-creation options. */
3250
+ sym_create = ID2SYM(rb_intern("create"));
3251
+ sym_create_if_missing = ID2SYM(rb_intern("create_if_missing"));
3252
+ sym_field_infos = ID2SYM(rb_intern("field_infos"));
3253
+ /* Symbols for the tuning parameters; the names match the accessors defined below. */
3254
+ sym_chunk_size = ID2SYM(rb_intern("chunk_size"));
3255
+ sym_max_buffer_memory = ID2SYM(rb_intern("max_buffer_memory"));
3256
+ sym_index_interval = ID2SYM(rb_intern("term_index_interval")); /* Ruby-side name is "term_index_interval" */
3257
+ sym_skip_interval = ID2SYM(rb_intern("doc_skip_interval")); /* Ruby-side name is "doc_skip_interval" */
3258
+ sym_merge_factor = ID2SYM(rb_intern("merge_factor"));
3259
+ sym_max_buffered_docs = ID2SYM(rb_intern("max_buffered_docs"));
3260
+ sym_max_merge_docs = ID2SYM(rb_intern("max_merge_docs"));
3261
+ sym_max_field_length = ID2SYM(rb_intern("max_field_length"));
3262
+ sym_use_compound_file = ID2SYM(rb_intern("use_compound_file"));
3263
+ /* Create the class; frt_data_alloc is installed as its allocator. */
3264
+ cIndexWriter = rb_define_class_under(mIndex, "IndexWriter", rb_cObject);
3265
+ rb_define_alloc_func(cIndexWriter, frt_data_alloc);
3266
+ /* Class constants; the DEFAULT_* values are read from default_config. */
3267
+ rb_define_const(cIndexWriter, "WRITE_LOCK_TIMEOUT", INT2FIX(1));
3268
+ rb_define_const(cIndexWriter, "COMMIT_LOCK_TIMEOUT", INT2FIX(10));
3269
+ rb_define_const(cIndexWriter, "WRITE_LOCK_NAME",
3270
+ rb_str_new2(WRITE_LOCK_NAME));
3271
+ rb_define_const(cIndexWriter, "COMMIT_LOCK_NAME",
3272
+ rb_str_new2(COMMIT_LOCK_NAME));
3273
+ rb_define_const(cIndexWriter, "DEFAULT_CHUNK_SIZE",
3274
+ INT2FIX(default_config.chunk_size));
3275
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_BUFFER_MEMORY",
3276
+ INT2FIX(default_config.max_buffer_memory));
3277
+ rb_define_const(cIndexWriter, "DEFAULT_TERM_INDEX_INTERVAL",
3278
+ INT2FIX(default_config.index_interval));
3279
+ rb_define_const(cIndexWriter, "DEFAULT_DOC_SKIP_INTERVAL",
3280
+ INT2FIX(default_config.skip_interval));
3281
+ rb_define_const(cIndexWriter, "DEFAULT_MERGE_FACTOR",
3282
+ INT2FIX(default_config.merge_factor));
3283
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_BUFFERED_DOCS",
3284
+ INT2FIX(default_config.max_buffered_docs));
3285
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_MERGE_DOCS",
3286
+ INT2FIX(default_config.max_merge_docs));
3287
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_FIELD_LENGTH",
3288
+ INT2FIX(default_config.max_field_length));
3289
+ rb_define_const(cIndexWriter, "DEFAULT_USE_COMPOUND_FILE",
3290
+ default_config.use_compound_file ? Qtrue : Qfalse);
3291
+ /* Core methods. In the Ruby C API an arity of -1 means (argc, argv, self). */
3292
+ rb_define_method(cIndexWriter, "initialize", frt_iw_init, -1);
3293
+ rb_define_method(cIndexWriter, "doc_count", frt_iw_get_doc_count, 0);
3294
+ rb_define_method(cIndexWriter, "close", frt_iw_close, 0);
3295
+ rb_define_method(cIndexWriter, "add_document", frt_iw_add_doc, 1);
3296
+ rb_define_method(cIndexWriter, "<<", frt_iw_add_doc, 1); /* same C function as "add_document" */
3297
+ rb_define_method(cIndexWriter, "optimize", frt_iw_optimize, 0);
3298
+ rb_define_method(cIndexWriter, "commit", frt_iw_commit, 0);
3299
+ rb_define_method(cIndexWriter, "add_readers", frt_iw_add_readers, 1);
3300
+ rb_define_method(cIndexWriter, "delete", frt_iw_delete, 2);
3301
+ rb_define_method(cIndexWriter, "field_infos", frt_iw_field_infos, 0);
3302
+ rb_define_method(cIndexWriter, "analyzer", frt_iw_get_analyzer, 0);
3303
+ rb_define_method(cIndexWriter, "analyzer=", frt_iw_set_analyzer, 1);
3304
+ rb_define_method(cIndexWriter, "version", frt_iw_version, 0);
3305
+ /* Getter/setter pairs, one pair per tuning parameter. */
3306
+ rb_define_method(cIndexWriter, "chunk_size",
3307
+ frt_iw_get_chunk_size, 0);
3308
+ rb_define_method(cIndexWriter, "chunk_size=",
3309
+ frt_iw_set_chunk_size, 1);
3310
+
3311
+ rb_define_method(cIndexWriter, "max_buffer_memory",
3312
+ frt_iw_get_max_buffer_memory, 0);
3313
+ rb_define_method(cIndexWriter, "max_buffer_memory=",
3314
+ frt_iw_set_max_buffer_memory, 1);
3315
+
3316
+ rb_define_method(cIndexWriter, "term_index_interval",
3317
+ frt_iw_get_index_interval, 0);
3318
+ rb_define_method(cIndexWriter, "term_index_interval=",
3319
+ frt_iw_set_index_interval, 1);
3320
+
3321
+ rb_define_method(cIndexWriter, "doc_skip_interval",
3322
+ frt_iw_get_skip_interval, 0);
3323
+ rb_define_method(cIndexWriter, "doc_skip_interval=",
3324
+ frt_iw_set_skip_interval, 1);
3325
+
3326
+ rb_define_method(cIndexWriter, "merge_factor",
3327
+ frt_iw_get_merge_factor, 0);
3328
+ rb_define_method(cIndexWriter, "merge_factor=",
3329
+ frt_iw_set_merge_factor, 1);
3330
+
3331
+ rb_define_method(cIndexWriter, "max_buffered_docs",
3332
+ frt_iw_get_max_buffered_docs, 0);
3333
+ rb_define_method(cIndexWriter, "max_buffered_docs=",
3334
+ frt_iw_set_max_buffered_docs, 1);
3335
+
3336
+ rb_define_method(cIndexWriter, "max_merge_docs",
3337
+ frt_iw_get_max_merge_docs, 0);
3338
+ rb_define_method(cIndexWriter, "max_merge_docs=",
3339
+ frt_iw_set_max_merge_docs, 1);
3340
+
3341
+ rb_define_method(cIndexWriter, "max_field_length",
3342
+ frt_iw_get_max_field_length, 0);
3343
+ rb_define_method(cIndexWriter, "max_field_length=",
3344
+ frt_iw_set_max_field_length, 1);
3345
+
3346
+ rb_define_method(cIndexWriter, "use_compound_file",
3347
+ frt_iw_get_use_compound_file, 0);
3348
+ rb_define_method(cIndexWriter, "use_compound_file=",
3349
+ frt_iw_set_use_compound_file, 1);
3350
+
3351
+ }
3352
+
3353
+ /*
3354
+ * Document-class: Ferret::Index::LazyDoc
3355
+ *
3356
+ * == Summary
3357
+ *
3358
+ * When a document is retrieved from the index a LazyDoc is returned.
3359
+ * Actually, LazyDoc is just a modified Hash object which lazily adds fields
3360
+ * to itself when they are accessed. You should note that the keys method
3361
+ * will return nothing until you actually access one of the fields. To see
3362
+ * what fields are available use LazyDoc#fields rather than LazyDoc#keys. To
3363
+ * load all fields use the LazyDoc#load method.
3364
+ *
3365
+ * == Example
3366
+ *
3367
+ * doc = index_reader[0]
3368
+ *
3369
+ * doc.keys #=> []
3370
+ * doc.values #=> []
3371
+ * doc.fields #=> [:title, :content]
3372
+ *
3373
+ * title = doc[:title] #=> "the title"
3374
+ * doc.keys #=> [:title]
3375
+ * doc.values #=> ["the title"]
3376
+ * doc.fields #=> [:title, :content]
3377
+ *
3378
+ * doc.load
3379
+ * doc.keys #=> [:title, :content]
3380
+ * doc.values #=> ["the title", "the content"]
3381
+ * doc.fields #=> [:title, :content]
3382
+ */
3383
+ void
3384
+ Init_LazyDoc(void)
3385
+ { /* Registers LazyDoc (a Hash subclass) and its nested LazyDocData class. */
3386
+ id_fields = rb_intern("@fields"); /* cache the ID for the name "@fields" */
3387
+
3388
+ /* LazyDoc inherits from rb_cHash and defines its own "default", "load" and "fields". */
3389
+ cLazyDoc = rb_define_class_under(mIndex, "LazyDoc", rb_cHash);
3390
+ rb_define_method(cLazyDoc, "default", frt_lzd_default, 1);
3391
+ rb_define_method(cLazyDoc, "load", frt_lzd_load, 0);
3392
+ rb_define_method(cLazyDoc, "fields", frt_lzd_fields, 0);
3393
+ /* LazyDocData is nested under LazyDoc; only an allocator is installed here. */
3394
+ cLazyDocData = rb_define_class_under(cLazyDoc, "LazyDocData", rb_cObject);
3395
+ rb_define_alloc_func(cLazyDocData, frt_data_alloc);
3396
+ }
3397
+
3398
+ /*
3399
+ * Document-class: Ferret::Index::IndexReader
3400
+ *
3401
+ * == Summary
3402
+ *
3403
+ * IndexReader is used for reading data from the index. This class is usually
3404
+ * used directly for more advanced tasks like iterating through terms in an
3405
+ * index, accessing term-vectors or deleting documents by document id. It is
3406
+ * also used internally by IndexSearcher.
3407
+ */
3408
+ void
3409
+ Init_IndexReader(void)
3410
+ { /* Registers the IndexReader class and its methods under mIndex. */
3411
+ cIndexReader = rb_define_class_under(mIndex, "IndexReader", rb_cObject);
3412
+ rb_define_alloc_func(cIndexReader, frt_data_alloc); /* frt_data_alloc is the allocator */
3413
+ rb_define_method(cIndexReader, "initialize", frt_ir_init, 1);
3414
+ rb_define_method(cIndexReader, "set_norm", frt_ir_set_norm, 3);
3415
+ rb_define_method(cIndexReader, "norms", frt_ir_norms, 1);
3416
+ rb_define_method(cIndexReader, "get_norms_into",frt_ir_get_norms_into, 3);
3417
+ rb_define_method(cIndexReader, "commit", frt_ir_commit, 0);
3418
+ rb_define_method(cIndexReader, "close", frt_ir_close, 0);
3419
+ rb_define_method(cIndexReader, "has_deletions?",frt_ir_has_deletions, 0);
3420
+ rb_define_method(cIndexReader, "delete", frt_ir_delete, 1);
3421
+ rb_define_method(cIndexReader, "deleted?", frt_ir_is_deleted, 1);
3422
+ rb_define_method(cIndexReader, "max_doc", frt_ir_max_doc, 0);
3423
+ rb_define_method(cIndexReader, "num_docs", frt_ir_num_docs, 0);
3424
+ rb_define_method(cIndexReader, "undelete_all", frt_ir_undelete_all, 0);
3425
+ rb_define_method(cIndexReader, "latest?", frt_ir_is_latest, 0);
3426
+ rb_define_method(cIndexReader, "get_document", frt_ir_get_doc, -1); /* -1: takes (argc, argv, self) */
3427
+ rb_define_method(cIndexReader, "[]", frt_ir_get_doc, -1); /* same C function as "get_document" */
3428
+ rb_define_method(cIndexReader, "term_vector", frt_ir_term_vector, 2);
3429
+ rb_define_method(cIndexReader, "term_vectors", frt_ir_term_vectors, 1);
3430
+ rb_define_method(cIndexReader, "term_docs", frt_ir_term_docs, 0);
3431
+ rb_define_method(cIndexReader, "term_positions",frt_ir_term_positions, 0);
3432
+ rb_define_method(cIndexReader, "term_docs_for", frt_ir_term_docs_for, 2);
3433
+ rb_define_method(cIndexReader, "term_positions_for", frt_ir_t_pos_for, 2);
3434
+ rb_define_method(cIndexReader, "doc_freq", frt_ir_doc_freq, 2);
3435
+ rb_define_method(cIndexReader, "terms", frt_ir_terms, 1);
3436
+ rb_define_method(cIndexReader, "terms_from", frt_ir_terms_from, 2);
3437
+ rb_define_method(cIndexReader, "term_count", frt_ir_term_count, 1);
3438
+ rb_define_method(cIndexReader, "fields", frt_ir_fields, 0);
3439
+ rb_define_method(cIndexReader, "field_names", frt_ir_fields, 0); /* same C function as "fields" */
3440
+ rb_define_method(cIndexReader, "field_infos", frt_ir_field_infos, 0);
3441
+ rb_define_method(cIndexReader, "tokenized_fields", frt_ir_tk_fields, 0);
3442
+ rb_define_method(cIndexReader, "version", frt_ir_version, 0);
3443
+ }
3444
+
3445
+ /* rdoc hack
3446
+ extern VALUE mFerret = rb_define_module("Ferret");
3447
+ */
3448
+
3449
+ /*
3450
+ * Document-module: Ferret::Index
3451
+ *
3452
+ * == Summary
3453
+ *
3454
+ * The Index module contains all the classes used for adding to and
3455
+ * retrieving from the index. The important classes to know about are;
3456
+ *
3457
+ * * FieldInfo
3458
+ * * FieldInfos
3459
+ * * IndexWriter
3460
+ * * IndexReader
3461
+ * * LazyDoc
3462
+ *
3463
+ * The other classes in this module are useful for more advanced uses like
3464
+ * building tag clouds, creating more-like-this queries, custom highlighting
3465
+ * etc. They are also useful for index browsers.
3466
+ */
3467
+ void
3468
+ Init_Index(void)
3469
+ { /* Entry point for the Index module: defines it and runs the per-class initialisers. */
3470
+ mIndex = rb_define_module_under(mFerret, "Index"); /* assigned first: every Init_* below defines classes under mIndex */
3471
+ /* Symbols shared across this file. */
3472
+ sym_boost = ID2SYM(rb_intern("boost"));
3473
+ sym_analyzer = ID2SYM(rb_intern("analyzer"));
3474
+ sym_close_dir = ID2SYM(rb_intern("close_dir"));
3475
+ /* Init_TermVector also defines TVOffsets and TVTerm. */
3476
+ Init_TermVector();
3477
+ Init_TermEnum();
3478
+ Init_TermDocEnum();
3479
+ /* Init_FieldInfos calls Init_FieldInfo itself. */
3480
+ Init_FieldInfos();
3481
+
3482
+ Init_LazyDoc();
3483
+ Init_IndexWriter();
3484
+ Init_IndexReader();
3485
+ }