sdsykes-ferret 0.11.6.19

Files changed (195)
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +102 -0
  4. data/Rakefile +338 -0
  5. data/TODO +17 -0
  6. data/TUTORIAL +231 -0
  7. data/bin/ferret-browser +79 -0
  8. data/ext/analysis.c +1555 -0
  9. data/ext/analysis.h +219 -0
  10. data/ext/api.c +69 -0
  11. data/ext/api.h +27 -0
  12. data/ext/array.c +123 -0
  13. data/ext/array.h +53 -0
  14. data/ext/bitvector.c +540 -0
  15. data/ext/bitvector.h +272 -0
  16. data/ext/compound_io.c +383 -0
  17. data/ext/config.h +42 -0
  18. data/ext/document.c +156 -0
  19. data/ext/document.h +53 -0
  20. data/ext/except.c +120 -0
  21. data/ext/except.h +168 -0
  22. data/ext/extconf.rb +14 -0
  23. data/ext/ferret.c +402 -0
  24. data/ext/ferret.h +91 -0
  25. data/ext/filter.c +156 -0
  26. data/ext/fs_store.c +483 -0
  27. data/ext/global.c +418 -0
  28. data/ext/global.h +117 -0
  29. data/ext/hash.c +567 -0
  30. data/ext/hash.h +473 -0
  31. data/ext/hashset.c +170 -0
  32. data/ext/hashset.h +187 -0
  33. data/ext/header.h +58 -0
  34. data/ext/helper.c +62 -0
  35. data/ext/helper.h +13 -0
  36. data/ext/inc/lang.h +48 -0
  37. data/ext/inc/threading.h +31 -0
  38. data/ext/index.c +6425 -0
  39. data/ext/index.h +961 -0
  40. data/ext/lang.h +66 -0
  41. data/ext/libstemmer.c +92 -0
  42. data/ext/libstemmer.h +79 -0
  43. data/ext/mempool.c +87 -0
  44. data/ext/mempool.h +35 -0
  45. data/ext/modules.h +162 -0
  46. data/ext/multimapper.c +310 -0
  47. data/ext/multimapper.h +51 -0
  48. data/ext/posh.c +1006 -0
  49. data/ext/posh.h +1007 -0
  50. data/ext/priorityqueue.c +151 -0
  51. data/ext/priorityqueue.h +143 -0
  52. data/ext/q_boolean.c +1608 -0
  53. data/ext/q_const_score.c +161 -0
  54. data/ext/q_filtered_query.c +209 -0
  55. data/ext/q_fuzzy.c +268 -0
  56. data/ext/q_match_all.c +148 -0
  57. data/ext/q_multi_term.c +677 -0
  58. data/ext/q_parser.c +2825 -0
  59. data/ext/q_phrase.c +1126 -0
  60. data/ext/q_prefix.c +100 -0
  61. data/ext/q_range.c +350 -0
  62. data/ext/q_span.c +2402 -0
  63. data/ext/q_term.c +337 -0
  64. data/ext/q_wildcard.c +171 -0
  65. data/ext/r_analysis.c +2575 -0
  66. data/ext/r_index.c +3472 -0
  67. data/ext/r_qparser.c +585 -0
  68. data/ext/r_search.c +4105 -0
  69. data/ext/r_store.c +513 -0
  70. data/ext/r_utils.c +963 -0
  71. data/ext/ram_store.c +471 -0
  72. data/ext/search.c +1741 -0
  73. data/ext/search.h +885 -0
  74. data/ext/similarity.c +150 -0
  75. data/ext/similarity.h +82 -0
  76. data/ext/sort.c +983 -0
  77. data/ext/stem_ISO_8859_1_danish.c +338 -0
  78. data/ext/stem_ISO_8859_1_danish.h +16 -0
  79. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  80. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  81. data/ext/stem_ISO_8859_1_english.c +1156 -0
  82. data/ext/stem_ISO_8859_1_english.h +16 -0
  83. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  84. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  85. data/ext/stem_ISO_8859_1_french.c +1276 -0
  86. data/ext/stem_ISO_8859_1_french.h +16 -0
  87. data/ext/stem_ISO_8859_1_german.c +512 -0
  88. data/ext/stem_ISO_8859_1_german.h +16 -0
  89. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  90. data/ext/stem_ISO_8859_1_italian.h +16 -0
  91. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  92. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  93. data/ext/stem_ISO_8859_1_porter.c +776 -0
  94. data/ext/stem_ISO_8859_1_porter.h +16 -0
  95. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  96. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  97. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  98. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  99. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  100. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  101. data/ext/stem_KOI8_R_russian.c +701 -0
  102. data/ext/stem_KOI8_R_russian.h +16 -0
  103. data/ext/stem_UTF_8_danish.c +344 -0
  104. data/ext/stem_UTF_8_danish.h +16 -0
  105. data/ext/stem_UTF_8_dutch.c +653 -0
  106. data/ext/stem_UTF_8_dutch.h +16 -0
  107. data/ext/stem_UTF_8_english.c +1176 -0
  108. data/ext/stem_UTF_8_english.h +16 -0
  109. data/ext/stem_UTF_8_finnish.c +808 -0
  110. data/ext/stem_UTF_8_finnish.h +16 -0
  111. data/ext/stem_UTF_8_french.c +1296 -0
  112. data/ext/stem_UTF_8_french.h +16 -0
  113. data/ext/stem_UTF_8_german.c +526 -0
  114. data/ext/stem_UTF_8_german.h +16 -0
  115. data/ext/stem_UTF_8_italian.c +1113 -0
  116. data/ext/stem_UTF_8_italian.h +16 -0
  117. data/ext/stem_UTF_8_norwegian.c +302 -0
  118. data/ext/stem_UTF_8_norwegian.h +16 -0
  119. data/ext/stem_UTF_8_porter.c +794 -0
  120. data/ext/stem_UTF_8_porter.h +16 -0
  121. data/ext/stem_UTF_8_portuguese.c +1055 -0
  122. data/ext/stem_UTF_8_portuguese.h +16 -0
  123. data/ext/stem_UTF_8_russian.c +709 -0
  124. data/ext/stem_UTF_8_russian.h +16 -0
  125. data/ext/stem_UTF_8_spanish.c +1137 -0
  126. data/ext/stem_UTF_8_spanish.h +16 -0
  127. data/ext/stem_UTF_8_swedish.c +313 -0
  128. data/ext/stem_UTF_8_swedish.h +16 -0
  129. data/ext/stopwords.c +401 -0
  130. data/ext/store.c +692 -0
  131. data/ext/store.h +777 -0
  132. data/ext/term_vectors.c +352 -0
  133. data/ext/threading.h +31 -0
  134. data/ext/utilities.c +446 -0
  135. data/ext/win32.h +54 -0
  136. data/lib/ferret.rb +29 -0
  137. data/lib/ferret/browser.rb +246 -0
  138. data/lib/ferret/browser/s/global.js +192 -0
  139. data/lib/ferret/browser/s/style.css +148 -0
  140. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  141. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  142. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  143. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  144. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  145. data/lib/ferret/browser/views/layout.rhtml +22 -0
  146. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  147. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  148. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  149. data/lib/ferret/browser/webrick.rb +14 -0
  150. data/lib/ferret/document.rb +130 -0
  151. data/lib/ferret/field_infos.rb +44 -0
  152. data/lib/ferret/index.rb +786 -0
  153. data/lib/ferret/number_tools.rb +157 -0
  154. data/lib/ferret_version.rb +3 -0
  155. data/setup.rb +1555 -0
  156. data/test/test_all.rb +5 -0
  157. data/test/test_helper.rb +24 -0
  158. data/test/threading/number_to_spoken.rb +132 -0
  159. data/test/threading/thread_safety_index_test.rb +79 -0
  160. data/test/threading/thread_safety_read_write_test.rb +76 -0
  161. data/test/threading/thread_safety_test.rb +133 -0
  162. data/test/unit/analysis/tc_analyzer.rb +548 -0
  163. data/test/unit/analysis/tc_token_stream.rb +646 -0
  164. data/test/unit/index/tc_index.rb +762 -0
  165. data/test/unit/index/tc_index_reader.rb +699 -0
  166. data/test/unit/index/tc_index_writer.rb +437 -0
  167. data/test/unit/index/th_doc.rb +315 -0
  168. data/test/unit/largefile/tc_largefile.rb +46 -0
  169. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  170. data/test/unit/search/tc_filter.rb +135 -0
  171. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  172. data/test/unit/search/tc_index_searcher.rb +61 -0
  173. data/test/unit/search/tc_multi_searcher.rb +128 -0
  174. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  175. data/test/unit/search/tc_search_and_sort.rb +179 -0
  176. data/test/unit/search/tc_sort.rb +49 -0
  177. data/test/unit/search/tc_sort_field.rb +27 -0
  178. data/test/unit/search/tc_spans.rb +190 -0
  179. data/test/unit/search/tm_searcher.rb +384 -0
  180. data/test/unit/store/tc_fs_store.rb +77 -0
  181. data/test/unit/store/tc_ram_store.rb +35 -0
  182. data/test/unit/store/tm_store.rb +34 -0
  183. data/test/unit/store/tm_store_lock.rb +68 -0
  184. data/test/unit/tc_document.rb +81 -0
  185. data/test/unit/ts_analysis.rb +2 -0
  186. data/test/unit/ts_index.rb +2 -0
  187. data/test/unit/ts_largefile.rb +4 -0
  188. data/test/unit/ts_query_parser.rb +2 -0
  189. data/test/unit/ts_search.rb +2 -0
  190. data/test/unit/ts_store.rb +2 -0
  191. data/test/unit/ts_utils.rb +2 -0
  192. data/test/unit/utils/tc_bit_vector.rb +295 -0
  193. data/test/unit/utils/tc_number_tools.rb +117 -0
  194. data/test/unit/utils/tc_priority_queue.rb +106 -0
  195. metadata +285 -0
data/ext/r_index.c
@@ -0,0 +1,3472 @@
1
+ #include "ferret.h"
2
+ #include "index.h"
3
+ #include <ruby/st.h>
4
+
5
+ VALUE mIndex;
6
+
7
+ VALUE cFieldInfo;
8
+ VALUE cFieldInfos;
9
+
10
+ VALUE cTVOffsets;
11
+ VALUE cTVTerm;
12
+ VALUE cTermVector;
13
+
14
+ VALUE cTermEnum;
15
+ VALUE cTermDocEnum;
16
+
17
+ VALUE cLazyDoc;
18
+ VALUE cLazyDocData;
19
+ VALUE cIndexWriter;
20
+ VALUE cIndexReader;
21
+
22
+ VALUE sym_analyzer;
23
+ static VALUE sym_close_dir;
24
+ static VALUE sym_create;
25
+ static VALUE sym_create_if_missing;
26
+
27
+ static VALUE sym_chunk_size;
28
+ static VALUE sym_max_buffer_memory;
29
+ static VALUE sym_index_interval;
30
+ static VALUE sym_skip_interval;
31
+ static VALUE sym_merge_factor;
32
+ static VALUE sym_max_buffered_docs;
33
+ static VALUE sym_max_merge_docs;
34
+ static VALUE sym_max_field_length;
35
+ static VALUE sym_use_compound_file;
36
+
37
+ static VALUE sym_boost;
38
+ static VALUE sym_field_infos;
39
+
40
+ static VALUE sym_store;
41
+ static VALUE sym_index;
42
+ static VALUE sym_term_vector;
43
+
44
+ static VALUE sym_compress;
45
+ static VALUE sym_compressed;
46
+
47
+ static VALUE sym_untokenized;
48
+ static VALUE sym_omit_norms;
49
+ static VALUE sym_untokenized_omit_norms;
50
+
51
+ static VALUE sym_with_positions;
52
+ static VALUE sym_with_offsets;
53
+ static VALUE sym_with_positions_offsets;
54
+
55
+ static ID id_term;
56
+ static ID id_fields;
57
+ static ID id_fld_num_map;
58
+ static ID id_field_num;
59
+ static ID id_boost;
60
+
61
+ extern void frt_set_term(VALUE rterm, Term *t);
62
+ extern Analyzer *frt_get_cwrapped_analyzer(VALUE ranalyzer);
63
+ extern VALUE frt_get_analyzer(Analyzer *a);
64
+
65
+ /****************************************************************************
66
+ *
67
+ * FieldInfo Methods
68
+ *
69
+ ****************************************************************************/
70
+
71
+ static void
72
+ frt_fi_free(void *p)
73
+ {
74
+ object_del(p);
75
+ fi_deref((FieldInfo *)p);
76
+ }
77
+
78
+ static void
79
+ frt_fi_get_params(VALUE roptions,
80
+ enum StoreValues *store,
81
+ enum IndexValues *index,
82
+ enum TermVectorValues *term_vector,
83
+ float *boost)
84
+ {
85
+ VALUE v;
86
+ Check_Type(roptions, T_HASH);
87
+ v = rb_hash_aref(roptions, sym_boost);
88
+ if (Qnil != v) {
89
+ *boost = (float)NUM2DBL(v);
90
+ } else {
91
+ *boost = 1.0f;
92
+ }
93
+ v = rb_hash_aref(roptions, sym_store);
94
+ if (Qnil != v) Check_Type(v, T_SYMBOL);
95
+ if (v == sym_no || v == sym_false || v == Qfalse) {
96
+ *store = STORE_NO;
97
+ } else if (v == sym_yes || v == sym_true || v == Qtrue) {
98
+ *store = STORE_YES;
99
+ } else if (v == sym_compress || v == sym_compressed) {
100
+ *store = STORE_COMPRESS;
101
+ } else if (v == Qnil) {
102
+ /* leave as default */
103
+ } else {
104
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for :store."
105
+ " Please choose from [:yes, :no, :compressed]",
106
+ rb_id2name(SYM2ID(v)));
107
+ }
108
+
109
+ v = rb_hash_aref(roptions, sym_index);
110
+ if (Qnil != v) Check_Type(v, T_SYMBOL);
111
+ if (v == sym_no || v == sym_false || v == Qfalse) {
112
+ *index = INDEX_NO;
113
+ } else if (v == sym_yes || v == sym_true || v == Qtrue) {
114
+ *index = INDEX_YES;
115
+ } else if (v == sym_untokenized) {
116
+ *index = INDEX_UNTOKENIZED;
117
+ } else if (v == sym_omit_norms) {
118
+ *index = INDEX_YES_OMIT_NORMS;
119
+ } else if (v == sym_untokenized_omit_norms) {
120
+ *index = INDEX_UNTOKENIZED_OMIT_NORMS;
121
+ } else if (v == Qnil) {
122
+ /* leave as default */
123
+ } else {
124
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for :index."
125
+ " Please choose from [:no, :yes, :untokenized, "
126
+ ":omit_norms, :untokenized_omit_norms]",
127
+ rb_id2name(SYM2ID(v)));
128
+ }
129
+
130
+ v = rb_hash_aref(roptions, sym_term_vector);
131
+ if (Qnil != v) Check_Type(v, T_SYMBOL);
132
+ if (v == sym_no || v == sym_false || v == Qfalse) {
133
+ *term_vector = TERM_VECTOR_NO;
134
+ } else if (v == sym_yes || v == sym_true || v == Qtrue) {
135
+ *term_vector = TERM_VECTOR_YES;
136
+ } else if (v == sym_with_positions) {
137
+ *term_vector = TERM_VECTOR_WITH_POSITIONS;
138
+ } else if (v == sym_with_offsets) {
139
+ *term_vector = TERM_VECTOR_WITH_OFFSETS;
140
+ } else if (v == sym_with_positions_offsets) {
141
+ *term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
142
+ } else if (v == Qnil) {
143
+ /* leave as default */
144
+ } else {
145
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for "
146
+ ":term_vector. Please choose from [:no, :yes, "
147
+ ":with_positions, :with_offsets, "
148
+ ":with_positions_offsets]",
149
+ rb_id2name(SYM2ID(v)));
150
+ }
151
+ }
152
+
153
+ static VALUE
154
+ frt_get_field_info(FieldInfo *fi)
155
+ {
156
+
157
+ VALUE rfi = Qnil;
158
+ if (fi) {
159
+ rfi = object_get(fi);
160
+ if (rfi == Qnil) {
161
+ rfi = Data_Wrap_Struct(cFieldInfo, NULL, &frt_fi_free, fi);
162
+ REF(fi);
163
+ object_add(fi, rfi);
164
+ }
165
+ }
166
+ return rfi;
167
+ }
168
+
169
+ /*
170
+ * call-seq:
171
+ * FieldInfo.new(name, options = {}) -> field_info
172
+ *
173
+ * Create a new FieldInfo object with the name +name+ and the properties
174
+ * specified in +options+. The available options are [:store, :index,
175
+ * :term_vector, :boost]. See the description of FieldInfo for more
176
+ * information on these properties.
177
+ */
178
+ static VALUE
179
+ frt_fi_init(int argc, VALUE *argv, VALUE self)
180
+ {
181
+ VALUE roptions, rname;
182
+ FieldInfo *fi;
183
+ enum StoreValues store = STORE_YES;
184
+ enum IndexValues index = INDEX_YES;
185
+ enum TermVectorValues term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
186
+ float boost = 1.0f;
187
+
188
+ rb_scan_args(argc, argv, "11", &rname, &roptions);
189
+ if (argc > 1) {
190
+ frt_fi_get_params(roptions, &store, &index, &term_vector, &boost);
191
+ }
192
+ fi = fi_new(frt_field(rname), store, index, term_vector);
193
+ fi->boost = boost;
194
+ Frt_Wrap_Struct(self, NULL, &frt_fi_free, fi);
195
+ object_add(fi, self);
196
+ return self;
197
+ }
198
+
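+ /*
+  * A minimal usage sketch: the :title field name is a placeholder, and the
+  * option values are the ones accepted by frt_fi_get_params above.
+  *
+  *   fi = FieldInfo.new(:title, :store       => :yes,
+  *                              :index       => :untokenized,
+  *                              :term_vector => :no,
+  *                              :boost       => 2.0)
+  *   fi.stored?    #=> true
+  *   fi.tokenized? #=> false
+  */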
199
+ /*
200
+ * call-seq:
201
+ * fi.name -> symbol
202
+ *
203
+ * Return the name of the field
204
+ */
205
+ static VALUE
206
+ frt_fi_name(VALUE self)
207
+ {
208
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
209
+ return ID2SYM(rb_intern(fi->name));
210
+ }
211
+
212
+ /*
213
+ * call-seq:
214
+ * fi.stored? -> bool
215
+ *
216
+ * Return true if the field is stored in the index.
217
+ */
218
+ static VALUE
219
+ frt_fi_is_stored(VALUE self)
220
+ {
221
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
222
+ return fi_is_stored(fi) ? Qtrue : Qfalse;
223
+ }
224
+
225
+ /*
226
+ * call-seq:
227
+ * fi.compressed? -> bool
228
+ *
229
+ * Return true if the field is stored in the index in compressed format.
230
+ */
231
+ static VALUE
232
+ frt_fi_is_compressed(VALUE self)
233
+ {
234
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
235
+ return fi_is_compressed(fi) ? Qtrue : Qfalse;
236
+ }
237
+
238
+ /*
239
+ * call-seq:
240
+ * fi.indexed? -> bool
241
+ *
242
+ * Return true if the field is indexed, i.e. searchable in the index.
243
+ */
244
+ static VALUE
245
+ frt_fi_is_indexed(VALUE self)
246
+ {
247
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
248
+ return fi_is_indexed(fi) ? Qtrue : Qfalse;
249
+ }
250
+
251
+ /*
252
+ * call-seq:
253
+ * fi.tokenized? -> bool
254
+ *
255
+ * Return true if the field is tokenized. Tokenizing is the process of
256
+ * breaking the field up into tokens. That is "the quick brown fox" becomes:
257
+ *
258
+ * ["the", "quick", "brown", "fox"]
259
+ *
260
+ * A field can only be tokenized if it is indexed.
261
+ */
262
+ static VALUE
263
+ frt_fi_is_tokenized(VALUE self)
264
+ {
265
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
266
+ return fi_is_tokenized(fi) ? Qtrue : Qfalse;
267
+ }
268
+
269
+ /*
270
+ * call-seq:
271
+ * fi.omit_norms? -> bool
272
+ *
273
+ * Return true if the field omits the norm file. The norm file is the file
274
+ * used to store the field boosts for an indexed field. If you do not boost
275
+ * any fields and can live without scoring based on field length, then
276
+ * you can omit the norms file. This will give the index a slight performance
277
+ * boost and it will use less memory, especially for indexes which have a
278
+ * large number of documents.
279
+ */
280
+ static VALUE
281
+ frt_fi_omit_norms(VALUE self)
282
+ {
283
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
284
+ return fi_omit_norms(fi) ? Qtrue : Qfalse;
285
+ }
286
+
287
+ /*
288
+ * call-seq:
289
+ * fi.store_term_vector? -> bool
290
+ *
291
+ * Return true if the term-vectors are stored for this field.
292
+ */
293
+ static VALUE
294
+ frt_fi_store_term_vector(VALUE self)
295
+ {
296
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
297
+ return fi_store_term_vector(fi) ? Qtrue : Qfalse;
298
+ }
299
+
300
+ /*
301
+ * call-seq:
302
+ * fi.store_positions? -> bool
303
+ *
304
+ * Return true if positions are stored with the term-vectors for this field.
305
+ */
306
+ static VALUE
307
+ frt_fi_store_positions(VALUE self)
308
+ {
309
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
310
+ return fi_store_positions(fi) ? Qtrue : Qfalse;
311
+ }
312
+
313
+ /*
314
+ * call-seq:
315
+ * fi.store_offsets? -> bool
316
+ *
317
+ * Return true if offsets are stored with the term-vectors for this field.
318
+ */
319
+ static VALUE
320
+ frt_fi_store_offsets(VALUE self)
321
+ {
322
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
323
+ return fi_store_offsets(fi) ? Qtrue : Qfalse;
324
+ }
325
+
326
+ /*
327
+ * call-seq:
328
+ * fi.has_norms? -> bool
329
+ *
330
+ * Return true if this field has a norms file. This is the same as calling;
331
+ *
332
+ * fi.indexed? and not fi.omit_norms?
333
+ */
334
+ static VALUE
335
+ frt_fi_has_norms(VALUE self)
336
+ {
337
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
338
+ return fi_has_norms(fi) ? Qtrue : Qfalse;
339
+ }
340
+
341
+ /*
342
+ * call-seq:
343
+ * fi.boost -> boost
344
+ *
345
+ * Return the default boost for this field
346
+ */
347
+ static VALUE
348
+ frt_fi_boost(VALUE self)
349
+ {
350
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
351
+ return rb_float_new((double)fi->boost);
352
+ }
353
+
354
+ /*
355
+ * call-seq:
356
+ * fi.to_s -> string
357
+ *
358
+ * Return a string representation of the FieldInfo object.
359
+ */
360
+ static VALUE
361
+ frt_fi_to_s(VALUE self)
362
+ {
363
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
364
+ char *fi_s = fi_to_s(fi);
365
+ VALUE rfi_s = rb_str_new2(fi_s);
366
+ free(fi_s);
367
+ return rfi_s;
368
+ }
369
+
370
+ /****************************************************************************
371
+ *
372
+ * FieldInfos Methods
373
+ *
374
+ ****************************************************************************/
375
+
376
+ static void
377
+ frt_fis_free(void *p)
378
+ {
379
+ object_del(p);
380
+ fis_deref((FieldInfos *)p);
381
+ }
382
+
383
+ static void
384
+ frt_fis_mark(void *p)
385
+ {
386
+ int i;
387
+ FieldInfos *fis = (FieldInfos *)p;
388
+
389
+ for (i = 0; i < fis->size; i++) {
390
+ frt_gc_mark(fis->fields[i]);
391
+ }
392
+ }
393
+
394
+ static VALUE
395
+ frt_get_field_infos(FieldInfos *fis)
396
+ {
397
+
398
+ VALUE rfis = Qnil;
399
+ if (fis) {
400
+ rfis = object_get(fis);
401
+ if (rfis == Qnil) {
402
+ rfis = Data_Wrap_Struct(cFieldInfos, &frt_fis_mark, &frt_fis_free,
403
+ fis);
404
+ REF(fis);
405
+ object_add(fis, rfis);
406
+ }
407
+ }
408
+ return rfis;
409
+ }
410
+
411
+ /*
412
+ * call-seq:
413
+ * FieldInfos.new(defaults = {}) -> field_infos
414
+ *
415
+ * Create a new FieldInfos object which uses the default values for fields
416
+ * specified in the +defaults+ hash parameter. See FieldInfo for available
417
+ * property values.
418
+ */
419
+ static VALUE
420
+ frt_fis_init(int argc, VALUE *argv, VALUE self)
421
+ {
422
+ VALUE roptions;
423
+ FieldInfos *fis;
424
+ enum StoreValues store = STORE_YES;
425
+ enum IndexValues index = INDEX_YES;
426
+ enum TermVectorValues term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
427
+ float boost;
428
+
429
+ rb_scan_args(argc, argv, "01", &roptions);
430
+ if (argc > 0) {
431
+ frt_fi_get_params(roptions, &store, &index, &term_vector, &boost);
432
+ }
433
+ fis = fis_new(store, index, term_vector);
434
+ Frt_Wrap_Struct(self, &frt_fis_mark, &frt_fis_free, fis);
435
+ object_add(fis, self);
436
+ return self;
437
+ }
438
+
439
+ /*
440
+ * call-seq:
441
+ * fis.to_a -> array
442
+ *
443
+ * Return an array of the FieldInfo objects contained by this FieldInfos
444
+ * object.
445
+ */
446
+ static VALUE
447
+ frt_fis_to_a(VALUE self)
448
+ {
449
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
450
+ VALUE rary = rb_ary_new();
451
+ int i;
452
+
453
+ for (i = 0; i < fis->size; i++) {
454
+ rb_ary_push(rary, frt_get_field_info(fis->fields[i]));
455
+ }
456
+ return rary;
457
+ }
458
+
459
+ /*
460
+ * call-seq:
461
+ * fis[name] -> field_info
462
+ * fis[number] -> field_info
463
+ *
464
+ * Get the FieldInfo object. FieldInfo objects can be referenced by either
465
+ * their field-number or the field-name (which must be a symbol). For
466
+ * example;
467
+ *
468
+ * fi = fis[:name]
469
+ * fi = fis[2]
470
+ */
471
+ static VALUE
472
+ frt_fis_get(VALUE self, VALUE ridx)
473
+ {
474
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
475
+ VALUE rfi = Qnil;
476
+ switch (TYPE(ridx)) {
477
+ case T_FIXNUM: {
478
+ int index = FIX2INT(ridx);
479
+ if (index < 0) index += fis->size;
480
+ if (index < 0 || index >= fis->size) {
481
+ rb_raise(rb_eArgError, "index of %d is out of range (0..%d)\n",
482
+ index, fis->size);
483
+ }
484
+ rfi = frt_get_field_info(fis->fields[index]);
485
+ break;
486
+ }
487
+ case T_SYMBOL:
488
+ rfi = frt_get_field_info(fis_get_field(fis, frt_field(ridx)));
489
+ break;
490
+ case T_STRING:
491
+ rfi = frt_get_field_info(fis_get_field(fis, StringValuePtr(ridx)));
492
+ break;
493
+ default:
494
+ rb_raise(rb_eArgError, "Can't index FieldInfos with %s",
495
+ rs2s(rb_obj_as_string(ridx)));
496
+ break;
497
+ }
498
+ return rfi;
499
+ }
500
+
501
+ /*
502
+ * call-seq:
503
+ * fis << fi -> fis
504
+ * fis.add(fi) -> fis
505
+ *
506
+ * Add a FieldInfo object. Use the FieldInfos#add_field method where
507
+ * possible.
508
+ */
509
+ static VALUE
510
+ frt_fis_add(VALUE self, VALUE rfi)
511
+ {
512
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
513
+ FieldInfo *fi = (FieldInfo *)frt_rb_data_ptr(rfi);
514
+ fis_add_field(fis, fi);
515
+ REF(fi);
516
+ return self;
517
+ }
518
+
519
+ /*
520
+ * call-seq:
521
+ * fis.add_field(name, properties = {}) -> fis
522
+ *
523
+ * Add a new field to the FieldInfos object. See FieldInfo for a description
524
+ * of the available properties.
525
+ */
526
+ static VALUE
527
+ frt_fis_add_field(int argc, VALUE *argv, VALUE self)
528
+ {
529
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
530
+ FieldInfo *fi;
531
+ enum StoreValues store = fis->store;
532
+ enum IndexValues index = fis->index;
533
+ enum TermVectorValues term_vector = fis->term_vector;
534
+ float boost = 1.0f;
535
+ VALUE rname, roptions;
536
+
537
+ rb_scan_args(argc, argv, "11", &rname, &roptions);
538
+ if (argc > 1) {
539
+ frt_fi_get_params(roptions, &store, &index, &term_vector, &boost);
540
+ }
541
+ fi = fi_new(frt_field(rname), store, index, term_vector);
542
+ fi->boost = boost;
543
+ fis_add_field(fis, fi);
544
+ return self;
545
+ }
546
+
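+ /*
+  * A short sketch combining FieldInfos.new and add_field (field names are
+  * placeholders; unspecified properties fall back to the defaults given to
+  * FieldInfos.new):
+  *
+  *   fis = FieldInfos.new(:term_vector => :no)
+  *   fis.add_field(:id, :index => :untokenized)
+  *   fis.add_field(:content)               # uses the defaults set above
+  *   fis.add_field(:title, :boost => 10.0)
+  */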
547
+ /*
548
+ * call-seq:
549
+ * fis.each {|fi| do_something } -> fis
550
+ *
551
+ * Iterate through the FieldInfo objects.
552
+ */
553
+ static VALUE
554
+ frt_fis_each(VALUE self)
555
+ {
556
+ int i;
557
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
558
+
559
+ for (i = 0; i < fis->size; i++) {
560
+ rb_yield(frt_get_field_info(fis->fields[i]));
561
+ }
562
+ return self;
563
+ }
564
+
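+ /*
+  * For example (a sketch; the fields printed depend on the index):
+  *
+  *   fis.each {|fi| puts "#{fi.name}: stored=#{fi.stored?}" }
+  */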
565
+ /*
566
+ * call-seq:
567
+ * fis.to_s -> string
568
+ *
569
+ * Return a string representation of the FieldInfos object.
570
+ */
571
+ static VALUE
572
+ frt_fis_to_s(VALUE self)
573
+ {
574
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
575
+ char *fis_s = fis_to_s(fis);
576
+ VALUE rfis_s = rb_str_new2(fis_s);
577
+ free(fis_s);
578
+ return rfis_s;
579
+ }
580
+
581
+ /*
582
+ * call-seq:
583
+ * fis.size -> int
584
+ *
585
+ * Return the number of fields in the FieldInfos object.
586
+ */
587
+ static VALUE
588
+ frt_fis_size(VALUE self)
589
+ {
590
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
591
+ return INT2FIX(fis->size);
592
+ }
593
+
594
+ /*
595
+ * call-seq:
596
+ * fis.create_index(dir) -> self
597
+ *
598
+ * Create a new index in the directory specified. The directory +dir+ can
599
+ * either be a string path representing a directory on the file-system or an
600
+ * actual directory object. Care should be taken when using this method. Any
601
+ * existing index (or other files for that matter) will be deleted from the
602
+ * directory and overwritten by the new index.
603
+ */
604
+ static VALUE
605
+ frt_fis_create_index(VALUE self, VALUE rdir)
606
+ {
607
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
608
+ Store *store = NULL;
609
+ if (TYPE(rdir) == T_DATA) {
610
+ store = DATA_PTR(rdir);
611
+ REF(store);
612
+ } else {
613
+ StringValue(rdir);
614
+ frt_create_dir(rdir);
615
+ store = open_fs_store(rs2s(rdir));
616
+ }
617
+ index_create(store, fis);
618
+ store_deref(store);
619
+ return self;
620
+ }
621
+
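+ /*
+  * A sketch of building a fresh index with a predefined field set; the path
+  * is a placeholder and anything already in that directory is destroyed:
+  *
+  *   fis = FieldInfos.new
+  *   fis.add_field(:id, :index => :untokenized)
+  *   fis.add_field(:content)
+  *   fis.create_index("/path/to/index")
+  */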
622
+ /*
623
+ * call-seq:
624
+ * fis.fields -> symbol array
625
+ *
626
+ * Return a list of the field names (as symbols) of all the fields in the
627
+ * index.
628
+ */
629
+ static VALUE
630
+ frt_fis_get_fields(VALUE self)
631
+ {
632
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
633
+ VALUE rfield_names = rb_ary_new();
634
+ int i;
635
+ for (i = 0; i < fis->size; i++) {
636
+ rb_ary_push(rfield_names, ID2SYM(rb_intern(fis->fields[i]->name)));
637
+ }
638
+ return rfield_names;
639
+ }
640
+
641
+ /*
642
+ * call-seq:
643
+ * fis.tokenized_fields -> symbol array
644
+ *
645
+ * Return a list of the field names (as symbols) of all the tokenized fields
646
+ * in the index.
647
+ */
648
+ static VALUE
649
+ frt_fis_get_tk_fields(VALUE self)
650
+ {
651
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
652
+ VALUE rfield_names = rb_ary_new();
653
+ int i;
654
+ for (i = 0; i < fis->size; i++) {
655
+ if (!fi_is_tokenized(fis->fields[i])) continue;
656
+ rb_ary_push(rfield_names, ID2SYM(rb_intern(fis->fields[i]->name)));
657
+ }
658
+ return rfield_names;
659
+ }
660
+
661
+ /****************************************************************************
662
+ *
663
+ * TermEnum Methods
664
+ *
665
+ ****************************************************************************/
666
+
667
+ static void
668
+ frt_te_free(void *p)
669
+ {
670
+ TermEnum *te = (TermEnum *)p;
671
+ te->close(te);
672
+ }
673
+
674
+ static VALUE
675
+ frt_te_get_set_term(VALUE self, const char *term)
676
+ {
677
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
678
+ VALUE str = term ? rb_str_new(term, te->curr_term_len) : Qnil;
679
+ rb_ivar_set(self, id_term, str);
680
+ return str;
681
+ }
682
+
683
+ static VALUE
684
+ frt_get_te(VALUE rir, TermEnum *te)
685
+ {
686
+ VALUE self = Qnil;
687
+ if (te != NULL) {
688
+ self = Data_Wrap_Struct(cTermEnum, NULL, &frt_te_free, te);
689
+ frt_te_get_set_term(self, te->curr_term);
690
+ rb_ivar_set(self, id_fld_num_map, rb_ivar_get(rir, id_fld_num_map));
691
+ }
692
+ return self;
693
+ }
694
+
695
+ /*
696
+ * call-seq:
697
+ * term_enum.next -> term_string
698
+ *
699
+ * Returns the next term in the enumeration, or nil if there are no more terms.
700
+ */
701
+ static VALUE
702
+ frt_te_next(VALUE self)
703
+ {
704
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
705
+ return frt_te_get_set_term(self, te->next(te));
706
+ }
707
+
708
+ /*
709
+ * call-seq:
710
+ * term_enum.term -> term_string
711
+ *
712
+ * Returns the current term pointed to by the enum. This method should only
713
+ * be called after a successful call to TermEnum#next.
714
+ */
715
+ static VALUE
716
+ frt_te_term(VALUE self)
717
+ {
718
+ return rb_ivar_get(self, id_term);
719
+ }
720
+
721
+ /*
722
+ * call-seq:
723
+ * term_enum.doc_freq -> integer
724
+ *
725
+ * Returns the document frequency of the current term pointed to by the enum.
726
+ * That is the number of documents that this term appears in. The method
727
+ * should only be called after a successful call to TermEnum#next.
728
+ */
729
+ static VALUE
730
+ frt_te_doc_freq(VALUE self)
731
+ {
732
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
733
+ return INT2FIX(te->curr_ti.doc_freq);
734
+ }
735
+
736
+ /*
737
+ * call-seq:
738
+ * term_enum.skip_to(target) -> term
739
+ *
740
+ * Skip to term +target+. This method can skip forwards or backwards. If you
741
+ * want to skip back to the start, pass the empty string "". That is;
742
+ *
743
+ * term_enum.skip_to("")
744
+ *
745
+ * Returns the first term greater than or equal to +target+
746
+ */
747
+ static VALUE
748
+ frt_te_skip_to(VALUE self, VALUE rterm)
749
+ {
750
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
751
+ return frt_te_get_set_term(self, te->skip_to(te, frt_field(rterm)));
752
+ }
753
+
754
+ /*
755
+ * call-seq:
756
+ * term_enum.each {|term, doc_freq| do_something() } -> term_count
757
+ *
758
+ * Iterates through all the terms in the field, yielding the term and the
759
+ * document frequency.
760
+ */
761
+ static VALUE
762
+ frt_te_each(VALUE self)
763
+ {
764
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
765
+ char *term;
766
+ int term_cnt = 0;
767
+ VALUE vals = rb_ary_new2(2);
768
+ rb_ary_store(vals, 0, Qnil);
769
+ rb_ary_store(vals, 1, Qnil);
770
+
771
+
772
+ /* each is being called so there will be no current term */
773
+ rb_ivar_set(self, id_term, Qnil);
774
+
775
+
776
+ while (NULL != (term = te->next(te))) {
777
+ term_cnt++;
778
+ RARRAY_PTR(vals)[0] = rb_str_new(term, te->curr_term_len);
779
+ RARRAY_PTR(vals)[1] = INT2FIX(te->curr_ti.doc_freq);
780
+ rb_yield(vals);
781
+ }
782
+ return INT2FIX(term_cnt);
783
+ }
784
+
785
+ /*
786
+ * call-seq:
787
+ * term_enum.set_field(field) -> self
788
+ *
789
+ * Set the field for the term_enum. The field value should be a symbol as
790
+ * usual. For example, to scan all title terms you'd do this;
791
+ *
792
+ * term_enum.set_field(:title).each do |term, doc_freq|
793
+ * do_something()
794
+ * end
795
+ */
796
+ static VALUE
797
+ frt_te_set_field(VALUE self, VALUE rfield)
798
+ {
799
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
800
+ int field_num = 0;
801
+ VALUE rfnum_map = rb_ivar_get(self, id_fld_num_map);
802
+ VALUE rfnum = rb_hash_aref(rfnum_map, rfield);
803
+ if (rfnum != Qnil) {
804
+ field_num = FIX2INT(rfnum);
805
+ rb_ivar_set(self, id_field_num, rfnum);
806
+ } else {
807
+ Check_Type(rfield, T_SYMBOL);
808
+ rb_raise(rb_eArgError, "field %s doesn't exist in the index",
809
+ frt_field(rfield));
810
+ }
811
+ te->set_field(te, field_num);
812
+
813
+ return self;
814
+ }
815
+
816
+ /*
817
+ * call-seq:
818
+ * term_enum.to_json() -> string
819
+ *
820
+ * Returns a JSON representation of the term enum. You can speed this up by
821
+ * having the method return arrays instead of objects, simply by passing an
822
+ * argument to the to_json method. For example;
823
+ *
824
+ * term_enum.to_json() #=>
825
+ * # [
826
+ * # {"term":"apple","frequency":12},
827
+ * # {"term":"banana","frequency":2},
828
+ * # {"term":"cantaloupe","frequency":12}
829
+ * # ]
830
+ *
831
+ * term_enum.to_json(:fast) #=>
832
+ * # [
833
+ * # ["apple",12],
834
+ * # ["banana",2],
835
+ * # ["cantaloupe",12]
836
+ * # ]
837
+ */
838
+ static VALUE
839
+ frt_te_to_json(int argc, VALUE *argv, VALUE self)
840
+ {
841
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
842
+ VALUE rjson;
843
+ char *json, *jp;
844
+ char *term;
845
+ int capa = 65536;
846
+ jp = json = ALLOC_N(char, capa);
847
+ *(jp++) = '[';
848
+
849
+ if (argc > 0) {
850
+ while (NULL != (term = te->next(te))) {
851
+ /* enough room for the term after converting " to '"' and the frequency
852
+ * plus some extra for good measure */
853
+ *(jp++) = '[';
854
+ if (te->curr_term_len * 3 + (jp - json) + 100 > capa) {
855
+ capa <<= 1;
856
+ REALLOC_N(json, char, capa);
857
+ }
858
+ jp = json_concat_string(jp, term);
859
+ *(jp++) = ',';
860
+ sprintf(jp, "%d", te->curr_ti.doc_freq);
861
+ jp += strlen(jp);
862
+ *(jp++) = ']';
863
+ *(jp++) = ',';
864
+ }
865
+ }
866
+ else {
867
+ while (NULL != (term = te->next(te))) {
868
+ /* enough room for the term after converting " to '"' and the frequency
869
+ * plus some extra for good measure */
870
+ if (te->curr_term_len * 3 + (jp - json) + 100 > capa) {
871
+ capa <<= 1;
872
+ REALLOC_N(json, char, capa);
873
+ }
874
+ *(jp++) = '{';
875
+ memcpy(jp, "\"term\":", 7);
876
+ jp += 7;
877
+ jp = json_concat_string(jp, term);
878
+ *(jp++) = ',';
879
+ memcpy(jp, "\"frequency\":", 12);
880
+ jp += 12;
881
+ sprintf(jp, "%d", te->curr_ti.doc_freq);
882
+ jp += strlen(jp);
883
+ *(jp++) = '}';
884
+ *(jp++) = ',';
885
+ }
886
+ }
887
+ if (*(jp-1) == ',') jp--;
888
+ *(jp++) = ']';
889
+ *jp = '\0';
890
+
891
+ rjson = rb_str_new2(json);
892
+ free(json);
893
+ return rjson;
894
+ }
895
+
896
+ /****************************************************************************
897
+ *
898
+ * TermDocEnum Methods
899
+ *
900
+ ****************************************************************************/
901
+
902
+ static void
903
+ frt_tde_free(void *p)
904
+ {
905
+ TermDocEnum *tde = (TermDocEnum *)p;
906
+ tde->close(tde);
907
+ }
908
+
909
+ static VALUE
910
+ frt_get_tde(VALUE rir, TermDocEnum *tde)
911
+ {
912
+ VALUE self = Data_Wrap_Struct(cTermDocEnum, NULL, &frt_tde_free, tde);
913
+ rb_ivar_set(self, id_fld_num_map, rb_ivar_get(rir, id_fld_num_map));
914
+ return self;
915
+ }
916
+
917
+ /*
918
+ * call-seq:
919
+ * term_doc_enum.seek(field, term) -> self
920
+ *
921
+ * Seek the term +term+ in the index for +field+. After you call this method
922
+ * you can call next or each to skip through the documents and positions of
923
+ * this particular term.
924
+ */
925
+ static VALUE
926
+ frt_tde_seek(VALUE self, VALUE rfield, VALUE rterm)
927
+ {
928
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
929
+ char *term;
930
+ VALUE rfnum_map = rb_ivar_get(self, id_fld_num_map);
931
+ VALUE rfnum = rb_hash_aref(rfnum_map, rfield);
932
+ int field_num = -1;
933
+ term = StringValuePtr(rterm);
934
+ if (rfnum != Qnil) {
935
+ field_num = FIX2INT(rfnum);
936
+ } else {
937
+ rb_raise(rb_eArgError, "field %s doesn't exist in the index",
938
+ frt_field(rfield));
939
+ }
940
+ tde->seek(tde, field_num, term);
941
+ return self;
942
+ }
943
+
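+ /*
+  * A sketch of seeking a term and walking its postings. Obtaining the enum
+  * via term_docs is assumed here (the error messages below refer to
+  * Index#term_docs and Index#term_positions); :content and "ferret" are
+  * placeholders.
+  *
+  *   tde = index.term_docs
+  *   tde.seek(:content, "ferret")
+  *   tde.each {|doc_id, freq| puts "doc #{doc_id} matched #{freq} times" }
+  */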
944
+ /*
945
+ * call-seq:
946
+ * term_doc_enum.seek_term_enum(term_enum) -> self
947
+ *
948
+ * Seek the current term in +term_enum+. You could just use the standard seek
949
+ * method like this;
950
+ *
951
+ * term_doc_enum.seek(term_enum.term)
952
+ *
953
+ * However the +seek_term_enum+ method saves an index lookup so should offer
954
+ * a large performance improvement.
955
+ */
956
+ static VALUE
957
+ frt_tde_seek_te(VALUE self, VALUE rterm_enum)
958
+ {
959
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
960
+ TermEnum *te = (TermEnum *)frt_rb_data_ptr(rterm_enum);
961
+ tde->seek_te(tde, te);
962
+ return self;
963
+ }
964
+
965
+ /*
966
+ * call-seq:
967
+ * term_doc_enum.doc -> doc_id
968
+ *
969
+ * Returns the current document number pointed to by the +term_doc_enum+.
970
+ */
971
+ static VALUE
972
+ frt_tde_doc(VALUE self)
973
+ {
974
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
975
+ return INT2FIX(tde->doc_num(tde));
976
+ }
977
+
978
+ /*
979
+ * call-seq:
980
+ * term_doc_enum.freq -> integer
981
+ *
982
+ * Returns the frequency of the current document pointed to by the
983
+ * +term_doc_enum+.
984
+ */
985
+ static VALUE
986
+ frt_tde_freq(VALUE self)
987
+ {
988
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
989
+ return INT2FIX(tde->freq(tde));
990
+ }
991
+
992
+ /*
993
+ * call-seq:
994
+ * term_doc_enum.next -> bool
995
+ *
996
+ * Move forward to the next document in the enumeration. Returns +true+ if
997
+ * there is another document or +false+ otherwise.
998
+ */
999
+ static VALUE
1000
+ frt_tde_next(VALUE self)
1001
+ {
1002
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1003
+ return tde->next(tde) ? Qtrue : Qfalse;
1004
+ }
1005
+
1006
+ /*
1007
+ * call-seq:
1008
+ * term_doc_enum.next_position -> pos
1009
+ *
1010
+ * Move forward to the next position of the current term in the current
1011
+ * document. Returns the next position, or +nil+ if there are none left.
1012
+ */
1013
+ static VALUE
1014
+ frt_tde_next_position(VALUE self)
1015
+ {
1016
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1017
+ int pos;
1018
+ if (tde->next_position == NULL) {
1019
+ rb_raise(rb_eNotImpError, "to scan through positions you must create "
1020
+ "the TermDocEnum with Index#term_positions method rather "
1021
+ "than the Index#term_docs method");
1022
+ }
1023
+ pos = tde->next_position(tde);
1024
+ return pos >= 0 ? INT2FIX(pos) : Qnil;
1025
+ }
1026
+
1027
+ /*
1028
+ * call-seq:
1029
+ * term_doc_enum.each {|doc_id, freq| do_something() } -> doc_count
1030
+ *
1031
+ * Iterate through the documents and document frequencies in the
1032
+ * +term_doc_enum+.
1033
+ *
1034
+ * NOTE: this method can only be called once after each seek. If you need to
1035
+ * call +#each+ again then you should call +#seek+ again too.
1036
+ */
1037
+ static VALUE
1038
+ frt_tde_each(VALUE self)
1039
+ {
1040
+ int doc_cnt = 0;
1041
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1042
+ VALUE vals = rb_ary_new2(2);
1043
+ rb_ary_store(vals, 0, Qnil);
1044
+ rb_ary_store(vals, 1, Qnil);
1045
+
1046
+ while (tde->next(tde)) {
1047
+ doc_cnt++;
1048
+ RARRAY_PTR(vals)[0] = INT2FIX(tde->doc_num(tde));
1049
+ RARRAY_PTR(vals)[1] = INT2FIX(tde->freq(tde));
1050
+ rb_yield(vals);
1051
+
1052
+ }
1053
+ return INT2FIX(doc_cnt);
1054
+ }
1055
+
1056
+ /*
1057
+ * call-seq:
1058
+ * term_doc_enum.to_json() -> string
1059
+ *
1060
+ * Returns a JSON representation of the term doc enum. It will also add the
1061
+ * term positions if they are available. You can speed this up by having the
1062
+ * method return arrays instead of objects, simply by passing an argument to
1063
+ * the to_json method. For example;
1064
+ *
1065
+ * term_doc_enum.to_json() #=>
1066
+ * # [
1067
+ * # {"document":1,"frequency":12},
1068
+ * # {"document":11,"frequency":1},
1069
+ * # {"document":29,"frequency":120},
1070
+ * # {"document":30,"frequency":3}
1071
+ * # ]
1072
+ *
1073
+ * term_doc_enum.to_json(:fast) #=>
1074
+ * # [
1075
+ * # [1,12],
1076
+ * # [11,1],
1077
+ * # [29,120],
1078
+ * # [30,3]
1079
+ * # ]
1080
+ */
1081
+ static VALUE
1082
+ frt_tde_to_json(int argc, VALUE *argv, VALUE self)
1083
+ {
1084
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1085
+ VALUE rjson;
1086
+ char *json, *jp;
1087
+ int capa = 65536;
1088
+ char *format;
1089
+ char close = (argc > 0) ? ']' : '}';
1090
+ bool do_positions = tde->next_position != NULL;
1091
+ jp = json = ALLOC_N(char, capa);
1092
+ *(jp++) = '[';
1093
+
1094
+ if (do_positions) {
1095
+ if (argc == 0) {
1096
+ format = "{\"document\":%d,\"frequency\":%d,\"positions\":[";
1097
+ }
1098
+ else {
1099
+ format = "[%d,%d,[";
1100
+ }
1101
+ }
1102
+ else {
1103
+ if (argc == 0) {
1104
+ format = "{\"document\":%d,\"frequency\":%d},";
1105
+ }
1106
+ else {
1107
+ format = "[%d,%d],";
1108
+ }
1109
+ }
1110
+ while (tde->next(tde)) {
1111
+ /* 100 chars should be enough room for an extra entry */
1112
+ if ((jp - json) + 100 + tde->freq(tde) * 20 > capa) {
1113
+ capa <<= 1;
1114
+ REALLOC_N(json, char, capa);
1115
+ }
1116
+ sprintf(jp, format, tde->doc_num(tde), tde->freq(tde));
1117
+ jp += strlen(jp);
1118
+ if (do_positions) {
1119
+ int pos;
1120
+ while (0 <= (pos = tde->next_position(tde))) {
1121
+ sprintf(jp, "%d,", pos);
1122
+ jp += strlen(jp);
1123
+ }
1124
+ if (*(jp - 1) == ',') jp--;
1125
+ *(jp++) = ']';
1126
+ *(jp++) = close;
1127
+ *(jp++) = ',';
1128
+ }
1129
+ }
1130
+ if (*(jp - 1) == ',') jp--;
1131
+ *(jp++) = ']';
1132
+ *jp = '\0';
1133
+
1134
+ rjson = rb_str_new2(json);
1135
+ free(json);
1136
+ return rjson;
1137
+ }
1138
+
1139
+ /*
1140
+ * call-seq:
1141
+ * term_doc_enum.each_position {|pos| do_something } -> term_doc_enum
1142
+ *
1143
+ * Iterate through each of the positions occupied by the current term in the
1144
+ * current document. This can only be called once per document. It can be
1145
+ * used within the each method. For example, to print the terms documents and
1146
+ * positions;
1147
+ *
1148
+ * tde.each do |doc_id, freq|
1149
+ * puts "term appeared #{freq} times in document #{doc_id}:"
1150
+ * positions = []
1151
+ * tde.each_position {|pos| positions << pos}
1152
+ * puts " #{positions.join(', ')}"
1153
+ * end
1154
+ */
1155
+ static VALUE
1156
+ frt_tde_each_position(VALUE self)
1157
+ {
1158
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1159
+ int pos;
1160
+ if (tde->next_position == NULL) {
1161
+ rb_raise(rb_eNotImpError, "to scan through positions you must create "
1162
+ "the TermDocEnum with Index#term_positions method rather "
1163
+ "than the Index#term_docs method");
1164
+ }
1165
+ while (0 <= (pos = tde->next_position(tde))) {
1166
+ rb_yield(INT2FIX(pos));
1167
+ }
1168
+ return self;
1169
+ }
1170
+
1171
+ /*
1172
+ * call-seq:
1173
+ * term_doc_enum.skip_to(target) -> bool
1174
+ *
1175
+ * Skip to the required document number +target+ and return true if there is
1176
+ * a document >= +target+.
1177
+ */
1178
+ static VALUE
1179
+ frt_tde_skip_to(VALUE self, VALUE rtarget)
1180
+ {
1181
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1182
+ return tde->skip_to(tde, FIX2INT(rtarget)) ? Qtrue : Qfalse;
1183
+ }
1184
+
1185
+ /****************************************************************************
1186
+ *
1187
+ * TVOffsets Methods
1188
+ *
1189
+ ****************************************************************************/
1190
+
1191
+ static VALUE
1192
+ frt_get_tv_offsets(Offset *offset)
1193
+ {
1194
+ return rb_struct_new(cTVOffsets,
1195
+ ULL2NUM((unsigned long long)offset->start),
1196
+ ULL2NUM((unsigned long long)offset->end),
1197
+ NULL);
1198
+ }
1199
+
1200
+ /****************************************************************************
1201
+ *
1202
+ * TVTerm Methods
1203
+ *
1204
+ ****************************************************************************/
1205
+
1206
+ static VALUE
1207
+ frt_get_tv_term(TVTerm *tv_term)
1208
+ {
1209
+ int i;
1210
+ const int freq = tv_term->freq;
1211
+ VALUE rtext;
1212
+ VALUE rpositions = Qnil;
1213
+ rtext = rb_str_new2(tv_term->text);
1214
+ if (tv_term->positions) {
1215
+ int *positions = tv_term->positions;
1216
+ rpositions = rb_ary_new2(freq);
1217
+ for (i = 0; i < freq; i++) {
1218
+ rb_ary_store(rpositions, i, INT2FIX(positions[i]));
1219
+ }
1220
+ }
1221
+ return rb_struct_new(cTVTerm, rtext, rpositions, NULL);
1222
+ }
1223
+
1224
+ /****************************************************************************
1225
+ *
1226
+ * TermVector Methods
1227
+ *
1228
+ ****************************************************************************/
1229
+
1230
+ static VALUE
1231
+ frt_get_tv(TermVector *tv)
1232
+ {
1233
+ int i;
1234
+ TVTerm *terms = tv->terms;
1235
+ const int t_cnt = tv->term_cnt;
1236
+ const int o_cnt = tv->offset_cnt;
1237
+ VALUE rfield, rterms;
1238
+ VALUE roffsets = Qnil;
1239
+ rfield = ID2SYM(rb_intern(tv->field));
1240
+
1241
+ rterms = rb_ary_new2(t_cnt);
1242
+ for (i = 0; i < t_cnt; i++) {
1243
+ rb_ary_store(rterms, i, frt_get_tv_term(&terms[i]));
1244
+ }
1245
+
1246
+ if (tv->offsets) {
1247
+ Offset *offsets = tv->offsets;
1248
+ roffsets = rb_ary_new2(o_cnt);
1249
+ for (i = 0; i < o_cnt; i++) {
1250
+ rb_ary_store(roffsets, i, frt_get_tv_offsets(&offsets[i]));
1251
+ }
1252
+ }
1253
+
1254
+ return rb_struct_new(cTermVector, rfield, rterms, roffsets, NULL);
1255
+ }
1256
+
1257
+ /****************************************************************************
1258
+ *
1259
+ * IndexWriter Methods
1260
+ *
1261
+ ****************************************************************************/
1262
+
1263
+ void
1264
+ frt_iw_free(void *p)
1265
+ {
1266
+ iw_close((IndexWriter *)p);
1267
+ }
1268
+
1269
+ void
1270
+ frt_iw_mark(void *p)
1271
+ {
1272
+ IndexWriter *iw = (IndexWriter *)p;
1273
+ frt_gc_mark(iw->analyzer);
1274
+ frt_gc_mark(iw->store);
1275
+ frt_gc_mark(iw->fis);
1276
+ }
1277
+
1278
+ /*
1279
+ * call-seq:
1280
+ * index_writer.close -> nil
1281
+ *
1282
+ * Close the IndexWriter. This will close and free all resources used
1283
+ * exclusively by the index writer. The garbage collector will do this
1284
+ * automatically if not called explicitly.
1285
+ */
1286
+ static VALUE
1287
+ frt_iw_close(VALUE self)
1288
+ {
1289
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1290
+ Frt_Unwrap_Struct(self);
1291
+ iw_close(iw);
1292
+ return Qnil;
1293
+ }
1294
+
1295
+ #define SET_INT_ATTR(attr) \
1296
+ do {\
1297
+ if (RTEST(rval = rb_hash_aref(roptions, sym_##attr)))\
1298
+ config.attr = FIX2INT(rval);\
1299
+ } while (0)
1300
+
1301
+ /*
1302
+ * call-seq:
1303
+ * IndexWriter.new(options = {}) -> index_writer
1304
+ *
1305
+ * Create a new IndexWriter. You should either pass a path or a directory to
1306
+ * this constructor. For example, here are three ways you can create an
1307
+ * IndexWriter;
1308
+ *
1309
+ * dir = RAMDirectory.new()
1310
+ * iw = IndexWriter.new(:dir => dir)
1311
+ *
1312
+ * dir = FSDirectory.new("/path/to/index")
1313
+ * iw = IndexWriter.new(:dir => dir)
1314
+ *
1315
+ * iw = IndexWriter.new(:path => "/path/to/index")
1316
+ *
1317
+ * See IndexWriter for more options.
1318
+ */
1319
+ static VALUE
1320
+ frt_iw_init(int argc, VALUE *argv, VALUE self)
1321
+ {
1322
+ VALUE roptions, rval;
1323
+ bool create = false;
1324
+ bool create_if_missing = true;
1325
+ Store *store = NULL;
1326
+ Analyzer *analyzer = NULL;
1327
+ IndexWriter *volatile iw = NULL;
1328
+ Config config = default_config;
1329
+
1330
+ rb_scan_args(argc, argv, "01", &roptions);
1331
+ if (argc > 0) {
1332
+ Check_Type(roptions, T_HASH);
1333
+
1334
+ if ((rval = rb_hash_aref(roptions, sym_dir)) != Qnil) {
1335
+ Check_Type(rval, T_DATA);
1336
+ store = DATA_PTR(rval);
1337
+ } else if ((rval = rb_hash_aref(roptions, sym_path)) != Qnil) {
1338
+ StringValue(rval);
1339
+ frt_create_dir(rval);
1340
+ store = open_fs_store(rs2s(rval));
1341
+ DEREF(store);
1342
+ }
1343
+
1344
+ /* Let ruby's garbage collector handle the closing of the store
1345
+ if (!close_dir) {
1346
+ close_dir = RTEST(rb_hash_aref(roptions, sym_close_dir));
1347
+ }
1348
+ */
1349
+ /* use_compound_file defaults to true */
1350
+ config.use_compound_file =
1351
+ (rb_hash_aref(roptions, sym_use_compound_file) == Qfalse)
1352
+ ? false
1353
+ : true;
1354
+
1355
+ if ((rval = rb_hash_aref(roptions, sym_analyzer)) != Qnil) {
1356
+ analyzer = frt_get_cwrapped_analyzer(rval);
1357
+ }
1358
+
1359
+ create = RTEST(rb_hash_aref(roptions, sym_create));
1360
+ if ((rval = rb_hash_aref(roptions, sym_create_if_missing)) != Qnil) {
1361
+ create_if_missing = RTEST(rval);
1362
+ }
1363
+ SET_INT_ATTR(chunk_size);
1364
+ SET_INT_ATTR(max_buffer_memory);
1365
+ SET_INT_ATTR(index_interval);
1366
+ SET_INT_ATTR(skip_interval);
1367
+ SET_INT_ATTR(merge_factor);
1368
+ SET_INT_ATTR(max_buffered_docs);
1369
+ SET_INT_ATTR(max_merge_docs);
1370
+ SET_INT_ATTR(max_field_length);
1371
+ }
1372
+ if (NULL == store) {
1373
+ store = open_ram_store();
1374
+ DEREF(store);
1375
+ }
1376
+ if (!create && create_if_missing && !store->exists(store, "segments")) {
1377
+ create = true;
1378
+ }
1379
+ if (create) {
1380
+ FieldInfos *fis;
1381
+ if ((rval = rb_hash_aref(roptions, sym_field_infos)) != Qnil) {
1382
+ Data_Get_Struct(rval, FieldInfos, fis);
1383
+ index_create(store, fis);
1384
+ } else {
1385
+ fis = fis_new(STORE_YES, INDEX_YES,
1386
+ TERM_VECTOR_WITH_POSITIONS_OFFSETS);
1387
+ index_create(store, fis);
1388
+ fis_deref(fis);
1389
+ }
1390
+ }
1391
+
1392
+ iw = iw_open(store, analyzer, &config);
1393
+
1394
+ Frt_Wrap_Struct(self, &frt_iw_mark, &frt_iw_free, iw);
1395
+
1396
+ if (rb_block_given_p()) {
1397
+ rb_yield(self);
1398
+ frt_iw_close(self);
1399
+ return Qnil;
1400
+ } else {
1401
+ return self;
1402
+ }
1403
+ }
1404
+
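+ /*
+  * A sketch of the option hash parsed above; the path and values are
+  * placeholders, and any option left out keeps its default_config value:
+  *
+  *   iw = IndexWriter.new(:path              => "/path/to/index",
+  *                        :create_if_missing => true,
+  *                        :use_compound_file => false,
+  *                        :merge_factor      => 10,
+  *                        :max_buffered_docs => 1_000,
+  *                        :max_field_length  => 10_000)
+  */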
1405
+ /*
1406
+ * call-seq:
1407
+ * iw.doc_count -> number
1408
+ *
1409
+ * Returns the number of documents in the Index. Note that deletions won't be
1410
+ * taken into account until the IndexWriter has been committed.
1411
+ */
1412
+ static VALUE
1413
+ frt_iw_get_doc_count(VALUE self)
1414
+ {
1415
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1416
+ return INT2FIX(iw_doc_count(iw));
1417
+ }
1418
+
1419
+ static int
1420
+ frt_hash_to_doc_i(VALUE key, VALUE value, VALUE arg)
1421
+ {
1422
+ if (key == Qundef) {
1423
+ return ST_CONTINUE;
1424
+ } else {
1425
+ Document *doc = (Document *)arg;
1426
+ char *field;
1427
+ VALUE val;
1428
+ DocField *df;
1429
+ switch (TYPE(key)) {
1430
+ case T_STRING:
1431
+ field = rs2s(key);
1432
+ break;
1433
+ case T_SYMBOL:
1434
+ field = rb_id2name(SYM2ID(key));
1435
+ break;
1436
+ default:
1437
+ rb_raise(rb_eArgError,
1438
+ "%s cannot be a key to a field. Field keys must "
1439
+ " be symbols.", rs2s(rb_obj_as_string(key)));
1440
+ break;
1441
+ }
1442
+ if (NULL == (df = doc_get_field(doc, field))) {
1443
+ df = df_new(field);
1444
+ }
1445
+ if (rb_respond_to(value, id_boost)) {
1446
+ df->boost = (float)NUM2DBL(rb_funcall(value, id_boost, 0));
1447
+ }
1448
+ switch (TYPE(value)) {
1449
+ case T_ARRAY:
1450
+ {
1451
+ int i;
1452
+ df->destroy_data = true;
1453
+ for (i = 0; i < RARRAY_LEN(value); i++) {
1454
+ val = rb_obj_as_string(RARRAY_PTR(value)[i]);
1455
+ df_add_data_len(df, nstrdup(val), RSTRING_LEN(val));
1456
+ }
1457
+ }
1458
+ break;
1459
+ case T_STRING:
1460
+ df_add_data_len(df, rs2s(value), RSTRING_LEN(value));
1461
+ break;
1462
+ default:
1463
+ val = rb_obj_as_string(value);
1464
+ df->destroy_data = true;
1465
+ df_add_data_len(df, nstrdup(val), RSTRING_LEN(val));
1466
+ break;
1467
+ }
1468
+ doc_add_field(doc, df);
1469
+ }
1470
+ return ST_CONTINUE;
1471
+ }
1472
+
1473
+ static Document *
1474
+ frt_get_doc(VALUE rdoc)
1475
+ {
1476
+ VALUE val;
1477
+ Document *doc = doc_new();
1478
+ DocField *df;
1479
+
1480
+ if (rb_respond_to(rdoc, id_boost)) {
1481
+ doc->boost = (float)NUM2DBL(rb_funcall(rdoc, id_boost, 0));
1482
+ }
1483
+
1484
+ switch (TYPE(rdoc)) {
1485
+ case T_HASH:
1486
+ rb_hash_foreach(rdoc, frt_hash_to_doc_i, (VALUE)doc);
1487
+ break;
1488
+ case T_ARRAY:
1489
+ {
1490
+ int i;
1491
+ df = df_new("content");
1492
+ df->destroy_data = true;
1493
+ for (i = 0; i < RARRAY_LEN(rdoc); i++) {
1494
+ val = rb_obj_as_string(RARRAY_PTR(rdoc)[i]);
1495
+ df_add_data_len(df, nstrdup(val), RSTRING_LEN(val));
1496
+ }
1497
+ doc_add_field(doc, df);
1498
+ }
1499
+ break;
1500
+ case T_SYMBOL:
1501
+ df = df_add_data(df_new("content"), rb_id2name(SYM2ID(rdoc)));
1502
+ doc_add_field(doc, df);
1503
+ break;
1504
+ case T_STRING:
1505
+ df = df_add_data_len(df_new("content"), rs2s(rdoc),
1506
+ RSTRING_LEN(rdoc));
1507
+ doc_add_field(doc, df);
1508
+ break;
1509
+ default:
1510
+ val = rb_obj_as_string(rdoc);
1511
+ df = df_add_data_len(df_new("content"), nstrdup(val),
1512
+ RSTRING_LEN(val));
1513
+ df->destroy_data = true;
1514
+ doc_add_field(doc, df);
1515
+ break;
1516
+ }
1517
+ return doc;
1518
+ }
1519
+
1520
+ /*
1521
+ * call-seq:
1522
+ * iw << document -> iw
1523
+ * iw.add_document(document) -> iw
1524
+ *
1525
+ * Add a document to the index. See Document. A document can also be a simple
1526
+ * hash object.
1527
+ */
1528
+ static VALUE
1529
+ frt_iw_add_doc(VALUE self, VALUE rdoc)
1530
+ {
1531
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1532
+ Document *doc = frt_get_doc(rdoc);
1533
+ iw_add_doc(iw, doc);
1534
+ doc_destroy(doc);
1535
+ return self;
1536
+ }
1537
+
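+ /*
+  * A minimal sketch of adding documents (field names are placeholders). As
+  * frt_get_doc above shows, a Hash, Array or String is accepted; a bare
+  * String or Array is indexed under the :content field:
+  *
+  *   iw << {:title => "Ferret", :content => "a Ruby search library"}
+  *   iw << "plain strings go into the :content field"
+  *   iw.commit
+  */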
1538
+ /*
1539
+ * call-seq:
1540
+ * iw.optimize -> iw
1541
+ *
1542
+ * Optimize the index for searching. This commits any unwritten data to the
1543
+ * index and optimizes the index into a single segment to improve search
1544
+ * performance. This is an expensive operation and should not be called too
1545
+ * often. The best time to call this is at the end of a long batch indexing
1546
+ * process. Note that calling the optimize method does not in any way affect
1547
+ * indexing speed (except for the time taken to complete the optimization
1548
+ * process).
1549
+ */
1550
+ static VALUE
1551
+ frt_iw_optimize(VALUE self)
1552
+ {
1553
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1554
+ iw_optimize(iw);
1555
+ return self;
1556
+ }
1557
+
1558
+ /*
1559
+ * call-seq:
1560
+ * iw.commit -> iw
1561
+ *
1562
+ * Explicitly commit any changes to the index that may be hanging around in
1563
+ * memory. You should call this method if you want to read the latest index
1564
+ * with an IndexWriter.
1565
+ */
1566
+ static VALUE
1567
+ frt_iw_commit(VALUE self)
1568
+ {
1569
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1570
+ iw_commit(iw);
1571
+ return self;
1572
+ }
1573
+
1574
+ /*
1575
+ * call-seq:
1576
+ * iw.add_readers(reader_array) -> iw
1577
+ *
1578
+ * Use this method to merge other indexes into the one being written by
1579
+ * IndexWriter. This is useful for parallel indexing. You can have several
1580
+ * indexing processes running in parallel, possibly even on different
1581
+ * machines. Then you can finish by merging all of the indexes into a single
1582
+ * index.
1583
+ */
1584
+ static VALUE
1585
+ frt_iw_add_readers(VALUE self, VALUE rreaders)
1586
+ {
1587
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1588
+ int i;
1589
+ IndexReader **irs;
1590
+ Check_Type(rreaders, T_ARRAY);
1591
+
1592
+ irs = ALLOC_N(IndexReader *, RARRAY_LEN(rreaders));
1593
+ i = RARRAY_LEN(rreaders);
1594
+ while (i-- > 0) {
1595
+ IndexReader *ir;
1596
+ Data_Get_Struct(RARRAY_PTR(rreaders)[i], IndexReader, ir);
1597
+ irs[i] = ir;
1598
+ }
1599
+ iw_add_readers(iw, irs, RARRAY_LEN(rreaders));
1600
+ free(irs);
1601
+ return self;
1602
+ }
1603
+
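+ /*
+  * A sketch of the parallel-indexing merge described above. The paths are
+  * placeholders and IndexReader.new taking a path is an assumption:
+  *
+  *   readers = ["/idx/part1", "/idx/part2"].map {|path| IndexReader.new(path) }
+  *   writer  = IndexWriter.new(:path => "/idx/merged")
+  *   writer.add_readers(readers)
+  *   writer.optimize
+  */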
1604
+ /*
1605
+ * call-seq:
1606
+ * iw.delete(field, term) -> iw
1607
+ *
1608
+ * Delete all documents in the index with the term +term+ in the field
1609
+ * +field+. You should usually have a unique document id which you use with
1610
+ * this method, rather than deleting all documents with the word "the" in
1611
+ * them. You may however use this method to delete spam.
1612
+ */
1613
+ static VALUE
1614
+ frt_iw_delete(VALUE self, VALUE rfield, VALUE rterm)
1615
+ {
1616
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1617
+ iw_delete_term(iw, frt_field(rfield), StringValuePtr(rterm));
1618
+ return self;
1619
+ }
1620
+
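+ /*
+  * For example, removing a document by its unique id field (the field name
+  * and value are placeholders):
+  *
+  *   iw.delete(:id, "doc-1234")
+  *   iw.commit
+  */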
1621
+ /*
1622
+ * call-seq:
1623
+ * index_writer.field_infos -> FieldInfos
1624
+ *
1625
+ * Get the FieldInfos object for this IndexWriter. This is useful if you need
1626
+ * to dynamically add new fields to the index with specific properties.
1627
+ */
1628
+ static VALUE
1629
+ frt_iw_field_infos(VALUE self)
1630
+ {
1631
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1632
+ return frt_get_field_infos(iw->fis);
1633
+ }
1634
+
1635
+ /*
1636
+ * call-seq:
1637
+ * index_writer.analyzer -> Analyzer
1638
+ *
1639
+ * Get the Analyzer for this IndexWriter. This is useful if you need
1640
+ * to use the same analyzer in a QueryParser.
1641
+ */
1642
+ static VALUE
1643
+ frt_iw_get_analyzer(VALUE self)
1644
+ {
1645
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1646
+ return frt_get_analyzer(iw->analyzer);
1647
+ }
1648
+
1649
+ /*
1650
+ * call-seq:
1651
+ * index_writer.analyzer = analyzer -> analyzer
1652
+ *
1653
+ * Set the Analyzer for this IndexWriter. This is useful if you need to
1654
+ * change the analyzer for a special document. It is risky though as the
1655
+ * same analyzer will be used for all documents during search.
1656
+ */
1657
+ static VALUE
1658
+ frt_iw_set_analyzer(VALUE self, VALUE ranalyzer)
1659
+ {
1660
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1661
+
1662
+ a_deref(iw->analyzer);
1663
+ iw->analyzer = frt_get_cwrapped_analyzer(ranalyzer);
1664
+ return ranalyzer;
1665
+ }
1666
+
1667
+ /*
1668
+ * call-seq:
1669
+ * index_writer.version -> int
1670
+ *
1671
+ * Returns the current version of the index writer.
1672
+ */
1673
+ static VALUE
1674
+ frt_iw_version(VALUE self)
1675
+ {
1676
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1677
+ return ULL2NUM(iw->sis->version);
1678
+ }
1679
+
1680
+ /*
1681
+ * call-seq:
1682
+ * iw.chunk_size -> number
1683
+ *
1684
+ * Return the current value of chunk_size
1685
+ */
1686
+ static VALUE
1687
+ frt_iw_get_chunk_size(VALUE self)
1688
+ {
1689
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1690
+ return INT2FIX(iw->config.chunk_size);
1691
+ }
1692
+
1693
+ /*
1694
+ * call-seq:
1695
+ * iw.chunk_size = chunk_size -> chunk_size
1696
+ *
1697
+ * Set the chunk_size parameter
1698
+ */
1699
+ static VALUE
1700
+ frt_iw_set_chunk_size(VALUE self, VALUE rval)
1701
+ {
1702
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1703
+ iw->config.chunk_size = FIX2INT(rval);
1704
+ return rval;
1705
+ }
1706
+
1707
+ /*
1708
+ * call-seq:
1709
+ * iw.max_buffer_memory -> number
1710
+ *
1711
+ * Return the current value of max_buffer_memory
1712
+ */
1713
+ static VALUE
1714
+ frt_iw_get_max_buffer_memory(VALUE self)
1715
+ {
1716
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1717
+ return INT2FIX(iw->config.max_buffer_memory);
1718
+ }
1719
+
1720
+ /*
1721
+ * call-seq:
1722
+ * iw.max_buffer_memory = max_buffer_memory -> max_buffer_memory
1723
+ *
1724
+ * Set the max_buffer_memory parameter
1725
+ */
1726
+ static VALUE
1727
+ frt_iw_set_max_buffer_memory(VALUE self, VALUE rval)
1728
+ {
1729
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1730
+ iw->config.max_buffer_memory = FIX2INT(rval);
1731
+ return rval;
1732
+ }
1733
+
1734
+ /*
1735
+ * call-seq:
1736
+ * iw.term_index_interval -> number
1737
+ *
1738
+ * Return the current value of term_index_interval
1739
+ */
1740
+ static VALUE
1741
+ frt_iw_get_index_interval(VALUE self)
1742
+ {
1743
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1744
+ return INT2FIX(iw->config.index_interval);
1745
+ }
1746
+
1747
+ /*
1748
+ * call-seq:
1749
+ * iw.term_index_interval = term_index_interval -> term_index_interval
1750
+ *
1751
+ * Set the term_index_interval parameter
1752
+ */
1753
+ static VALUE
1754
+ frt_iw_set_index_interval(VALUE self, VALUE rval)
1755
+ {
1756
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1757
+ iw->config.index_interval = FIX2INT(rval);
1758
+ return rval;
1759
+ }
1760
+
1761
+ /*
1762
+ * call-seq:
1763
+ * iw.doc_skip_interval -> number
1764
+ *
1765
+ * Return the current value of doc_skip_interval
1766
+ */
1767
+ static VALUE
1768
+ frt_iw_get_skip_interval(VALUE self)
1769
+ {
1770
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1771
+ return INT2FIX(iw->config.skip_interval);
1772
+ }
1773
+
1774
+ /*
1775
+ * call-seq:
1776
+ * iw.doc_skip_interval = doc_skip_interval -> doc_skip_interval
1777
+ *
1778
+ * Set the doc_skip_interval parameter
1779
+ */
1780
+ static VALUE
1781
+ frt_iw_set_skip_interval(VALUE self, VALUE rval)
1782
+ {
1783
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1784
+ iw->config.skip_interval = FIX2INT(rval);
1785
+ return rval;
1786
+ }
1787
+
1788
+ /*
1789
+ * call-seq:
1790
+ * iw.merge_factor -> number
1791
+ *
1792
+ * Return the current value of merge_factor
1793
+ */
1794
+ static VALUE
1795
+ frt_iw_get_merge_factor(VALUE self)
1796
+ {
1797
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1798
+ return INT2FIX(iw->config.merge_factor);
1799
+ }
1800
+
1801
+ /*
1802
+ * call-seq:
1803
+ * iw.merge_factor = merge_factor -> merge_factor
1804
+ *
1805
+ * Set the merge_factor parameter
1806
+ */
1807
+ static VALUE
1808
+ frt_iw_set_merge_factor(VALUE self, VALUE rval)
1809
+ {
1810
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1811
+ iw->config.merge_factor = FIX2INT(rval);
1812
+ return rval;
1813
+ }
1814
+
1815
+ /*
1816
+ * call-seq:
1817
+ * iw.max_buffered_docs -> number
1818
+ *
1819
+ * Return the current value of max_buffered_docs
1820
+ */
1821
+ static VALUE
1822
+ frt_iw_get_max_buffered_docs(VALUE self)
1823
+ {
1824
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1825
+ return INT2FIX(iw->config.max_buffered_docs);
1826
+ }
1827
+
1828
+ /*
1829
+ * call-seq:
1830
+ * iw.max_buffered_docs = max_buffered_docs -> max_buffered_docs
1831
+ *
1832
+ * Set the max_buffered_docs parameter
1833
+ */
1834
+ static VALUE
1835
+ frt_iw_set_max_buffered_docs(VALUE self, VALUE rval)
1836
+ {
1837
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1838
+ iw->config.max_buffered_docs = FIX2INT(rval);
1839
+ return rval;
1840
+ }
1841
+
1842
+ /*
1843
+ * call-seq:
1844
+ * iw.max_merge_docs -> number
1845
+ *
1846
+ * Return the current value of max_merge_docs
1847
+ */
1848
+ static VALUE
1849
+ frt_iw_get_max_merge_docs(VALUE self)
1850
+ {
1851
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1852
+ return INT2FIX(iw->config.max_merge_docs);
1853
+ }
1854
+
1855
+ /*
1856
+ * call-seq:
1857
+ * iw.max_merge_docs = max_merge_docs -> max_merge_docs
1858
+ *
1859
+ * Set the max_merge_docs parameter
1860
+ */
1861
+ static VALUE
1862
+ frt_iw_set_max_merge_docs(VALUE self, VALUE rval)
1863
+ {
1864
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1865
+ iw->config.max_merge_docs = FIX2INT(rval);
1866
+ return rval;
1867
+ }
1868
+
1869
+ /*
1870
+ * call-seq:
1871
+ * iw.max_field_length -> number
1872
+ *
1873
+ * Return the current value of max_field_length
1874
+ */
1875
+ static VALUE
1876
+ frt_iw_get_max_field_length(VALUE self)
1877
+ {
1878
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1879
+ return INT2FIX(iw->config.max_field_length);
1880
+ }
1881
+
1882
+ /*
1883
+ * call-seq:
1884
+ * iw.max_field_length = max_field_length -> max_field_length
1885
+ *
1886
+ * Set the max_field_length parameter
1887
+ */
1888
+ static VALUE
1889
+ frt_iw_set_max_field_length(VALUE self, VALUE rval)
1890
+ {
1891
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1892
+ iw->config.max_field_length = FIX2INT(rval);
1893
+ return rval;
1894
+ }
1895
+
1896
+ /*
1897
+ * call-seq:
1898
+ * iw.use_compound_file -> number
1899
+ *
1900
+ * Return the current value of use_compound_file
1901
+ */
1902
+ static VALUE
1903
+ frt_iw_get_use_compound_file(VALUE self)
1904
+ {
1905
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1906
+ return iw->config.use_compound_file ? Qtrue : Qfalse;
1907
+ }
1908
+
1909
+ /*
1910
+ * call-seq:
1911
+ * iw.use_compound_file = use_compound_file -> use_compound_file
1912
+ *
1913
+ * Set the use_compound_file parameter
1914
+ */
1915
+ static VALUE
1916
+ frt_iw_set_use_compound_file(VALUE self, VALUE rval)
1917
+ {
1918
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1919
+ iw->config.use_compound_file = RTEST(rval);
1920
+ return rval;
1921
+ }
1922
+
1923
+ /****************************************************************************
1924
+ *
1925
+ * LazyDoc Methods
1926
+ *
1927
+ ****************************************************************************/
1928
+
1929
+ static void
1930
+ frt_lzd_date_free(void *p)
1931
+ {
1932
+ lazy_doc_close((LazyDoc *)p);
1933
+ }
1934
+
1935
+ static VALUE
1936
+ frt_lazy_df_load(VALUE self, VALUE rkey, LazyDocField *lazy_df)
1937
+ {
1938
+ VALUE rdata = Qnil;
1939
+ if (lazy_df) {
1940
+ if (lazy_df->size == 1) {
1941
+ char *data = lazy_df_get_data(lazy_df, 0);
1942
+ rdata = rb_str_new(data, lazy_df->len);
1943
+ } else {
1944
+ int i;
1945
+ rdata = rb_ary_new2(lazy_df->size);
1946
+ for (i = 0; i < lazy_df->size; i++) {
1947
+ char *data = lazy_df_get_data(lazy_df, i);
1948
+ rb_ary_store(rdata, i, rb_str_new(data, lazy_df->data[i].length));
1949
+ }
1950
+ }
1951
+ rb_hash_aset(self, rkey, rdata);
1952
+ }
1953
+ return rdata;
1954
+ }
1955
+
1956
+ /*
1957
+ * call-seq:
1958
+ * lazy_doc.default(key) -> string
1959
+ *
1960
+ * This method is used internally to lazily load fields. You should never
1961
+ * really need to call it yourself.
1962
+ */
1963
+ static VALUE
1964
+ frt_lzd_default(VALUE self, VALUE rkey)
1965
+ {
1966
+ LazyDoc *lazy_doc = (LazyDoc *)DATA_PTR(rb_ivar_get(self, id_data));
1967
+ char *field = NULL;
1968
+ switch (TYPE(rkey)) {
1969
+ case T_STRING:
1970
+ field = rs2s(rkey);
1971
+ rkey = ID2SYM(rb_intern(field));
1972
+ break;
1973
+ case T_SYMBOL:
1974
+ field = frt_field(rkey);
1975
+ break;
1976
+ default:
1977
+ rb_raise(rb_eArgError,
1978
+ "%s cannot be a key to a field. Field keys must "
1979
+ " be symbols.", rs2s(rb_obj_as_string(rkey)));
1980
+ break;
1981
+ }
1982
+ return frt_lazy_df_load(self, rkey, h_get(lazy_doc->field_dict, field));
1983
+ }
1984
+
1985
+ /*
1986
+ * call-seq:
1987
+ * lazy_doc.fields -> array of available fields
1988
+ *
1989
+ * Returns the list of fields stored for this particular document. If you try
1990
+ * to access any of these fields in the document the field will be loaded.
1991
+ * If you try to access any other field, nil will be returned.
1992
+ */
1993
+ static VALUE
1994
+ frt_lzd_fields(VALUE self)
1995
+ {
1996
+ return rb_ivar_get(self, id_fields);
1997
+ }
1998
+
1999
+ /*
2000
+ * call-seq:
2001
+ * lazy_doc.load -> lazy_doc
2002
+ *
2003
+ * Load all unloaded fields in the document from the index.
2004
+ */
2005
+ static VALUE
2006
+ frt_lzd_load(VALUE self)
2007
+ {
2008
+ LazyDoc *lazy_doc = (LazyDoc *)DATA_PTR(rb_ivar_get(self, id_data));
2009
+ int i;
2010
+ for (i = 0; i < lazy_doc->size; i++) {
2011
+ LazyDocField *lazy_df = lazy_doc->fields[i];
2012
+ frt_lazy_df_load(self, ID2SYM(rb_intern(lazy_df->name)), lazy_df);
2013
+ }
2014
+ return self;
2015
+ }
2016
+
2017
+ VALUE
2018
+ frt_get_lazy_doc(LazyDoc *lazy_doc)
2019
+ {
2020
+ int i;
2021
+ VALUE rfields = rb_ary_new2(lazy_doc->size);
2022
+
2023
+ VALUE self, rdata;
2024
+ self = rb_hash_new();
2025
+ OBJSETUP(self, cLazyDoc, T_HASH);
2026
+
2027
+ rdata = Data_Wrap_Struct(cLazyDocData, NULL, &frt_lzd_date_free, lazy_doc);
2028
+ rb_ivar_set(self, id_data, rdata);
2029
+
2030
+ for (i = 0; i < lazy_doc->size; i++) {
2031
+ rb_ary_store(rfields, i, ID2SYM(rb_intern(lazy_doc->fields[i]->name)));
2032
+ }
2033
+ rb_ivar_set(self, id_fields, rfields);
2034
+
2035
+ return self;
2036
+ }
2037
+
2038
+ /****************************************************************************
2039
+ *
2040
+ * IndexReader Methods
2041
+ *
2042
+ ****************************************************************************/
2043
+
2044
+ void
2045
+ frt_ir_free(void *p)
2046
+ {
2047
+ object_del(p);
2048
+ ir_close((IndexReader *)p);
2049
+ }
2050
+
2051
+ void
2052
+ frt_ir_mark(void *p)
2053
+ {
2054
+ IndexReader *ir = (IndexReader *)p;
2055
+ frt_gc_mark(ir->store);
2056
+ }
2057
+
2058
+ static VALUE frt_ir_close(VALUE self);
2059
+
2060
+ void
2061
+ frt_mr_mark(void *p)
2062
+ {
2063
+ MultiReader *mr = (MultiReader *)p;
2064
+ int i;
2065
+ for (i = 0; i < mr->r_cnt; i++) {
2066
+ frt_gc_mark(mr->sub_readers[i]);
2067
+ }
2068
+ }
2069
+
2070
+ /*
2071
+ * call-seq:
2072
+ * IndexReader.new(dir) -> index_reader
2073
+ *
2074
+ * Create a new IndexReader. You can either pass a string path to a
2075
+ * file-system directory or an actual Ferret::Store::Directory object. For
2076
+ * example;
2077
+ *
2078
+ * dir = RAMDirectory.new()
2079
+ *    ir = IndexReader.new(dir)
2080
+ *
2081
+ * dir = FSDirectory.new("/path/to/index")
2082
+ *    ir = IndexReader.new(dir)
2083
+ *
2084
+ *    ir = IndexReader.new("/path/to/index")
2085
+ *
2086
+ * You can also create what used to be known as a MultiReader by passing an
2087
+ * array of IndexReader objects, Ferret::Store::Directory objects or
2088
+ * file-system paths;
2089
+ *
2090
+ *    ir = IndexReader.new([dir, dir2, dir3])
2091
+ *
2092
+ *    ir = IndexReader.new([reader1, reader2, reader3])
2093
+ *
2094
+ *    ir = IndexReader.new(["/path/to/index1", "/path/to/index2"])
2095
+ */
2096
+ static VALUE
2097
+ frt_ir_init(VALUE self, VALUE rdir)
2098
+ {
2099
+ Store *store = NULL;
2100
+ IndexReader *ir;
2101
+ int i;
2102
+ FieldInfos *fis;
2103
+ VALUE rfield_num_map = rb_hash_new();
2104
+
2105
+ if (TYPE(rdir) == T_ARRAY) {
2106
+ VALUE rdirs = rdir;
2107
+ const int reader_cnt = RARRAY_LEN(rdir);
2108
+ IndexReader **sub_readers = ALLOC_N(IndexReader *, reader_cnt);
2109
+ int i;
2110
+ for (i = 0; i < reader_cnt; i++) {
2111
+ rdir = RARRAY_PTR(rdirs)[i];
2112
+ switch (TYPE(rdir)) {
2113
+ case T_DATA:
2114
+ if (CLASS_OF(rdir) == cIndexReader) {
2115
+ Data_Get_Struct(rdir, IndexReader, sub_readers[i]);
2116
+ REF(sub_readers[i]);
2117
+ continue;
2118
+ } else if (RTEST(rb_obj_is_kind_of(rdir, cDirectory))) {
2119
+ store = DATA_PTR(rdir);
2120
+ } else {
2121
+ rb_raise(rb_eArgError, "A Multi-IndexReader can only "
2122
+ "be created from other IndexReaders, "
2123
+ "Directory objects or file-system paths. "
2124
+ "Not %s",
2125
+ rs2s(rb_obj_as_string(rdir)));
2126
+ }
2127
+ break;
2128
+ case T_STRING:
2129
+ frt_create_dir(rdir);
2130
+ store = open_fs_store(rs2s(rdir));
2131
+ DEREF(store);
2132
+ break;
2133
+ default:
2134
+ rb_raise(rb_eArgError, "%s isn't a valid directory "
2135
+ "argument. You should use either a String or "
2136
+ "a Directory",
2137
+ rs2s(rb_obj_as_string(rdir)));
2138
+ break;
2139
+ }
2140
+ sub_readers[i] = ir_open(store);
2141
+ }
2142
+ ir = mr_open(sub_readers, reader_cnt);
2143
+ Frt_Wrap_Struct(self, &frt_mr_mark, &frt_ir_free, ir);
2144
+ } else {
2145
+ switch (TYPE(rdir)) {
2146
+ case T_DATA:
2147
+ store = DATA_PTR(rdir);
2148
+ break;
2149
+ case T_STRING:
2150
+ frt_create_dir(rdir);
2151
+ store = open_fs_store(rs2s(rdir));
2152
+ DEREF(store);
2153
+ break;
2154
+ default:
2155
+ rb_raise(rb_eArgError, "%s isn't a valid directory argument. "
2156
+ "You should use either a String or a Directory",
2157
+ rs2s(rb_obj_as_string(rdir)));
2158
+ break;
2159
+ }
2160
+ ir = ir_open(store);
2161
+ Frt_Wrap_Struct(self, &frt_ir_mark, &frt_ir_free, ir);
2162
+ }
2163
+ object_add(ir, self);
2164
+
2165
+ fis = ir->fis;
2166
+ for (i = 0; i < fis->size; i++) {
2167
+ FieldInfo *fi = fis->fields[i];
2168
+ rb_hash_aset(rfield_num_map,
2169
+ ID2SYM(rb_intern(fi->name)),
2170
+ INT2FIX(fi->number));
2171
+ }
2172
+ rb_ivar_set(self, id_fld_num_map, rfield_num_map);
2173
+
2174
+ return self;
2175
+ }
2176
+
2177
+ /*
2178
+ * call-seq:
2179
+ * index_reader.set_norm(doc_id, field, val)
2180
+ *
2181
+ * Expert: change the boost value for a +field+ in document at +doc_id+.
2182
+ * +val+ should be an integer in the range 0..255 which corresponds to an
2183
+ * encoded float value.
2184
+ */
2185
+ static VALUE
2186
+ frt_ir_set_norm(VALUE self, VALUE rdoc_id, VALUE rfield, VALUE rval)
2187
+ {
2188
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2189
+ ir_set_norm(ir, FIX2INT(rdoc_id), frt_field(rfield), (uchar)NUM2CHR(rval));
2190
+ return self;
2191
+ }
2192
+
2193
+ /*
2194
+ * call-seq:
2195
+ * index_reader.norms(field) -> string
2196
+ *
2197
+ * Expert: Returns a string containing the norm values for a field. The
2198
+ * string length will be equal to the number of documents in the index and it
2199
+ * could have null bytes.
2200
+ */
2201
+ static VALUE
2202
+ frt_ir_norms(VALUE self, VALUE rfield)
2203
+ {
2204
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2205
+ uchar *norms;
2206
+ norms = ir_get_norms(ir, frt_field(rfield));
2207
+ if (norms) {
2208
+ return rb_str_new((char *)norms, ir->max_doc(ir));
2209
+ } else {
2210
+ return Qnil;
2211
+ }
2212
+ }
2213
+
2214
+ /*
2215
+ * call-seq:
2216
+ * index_reader.get_norms_into(field, buffer, offset) -> buffer
2217
+ *
2218
+ * Expert: Get the norm values into a string +buffer+ starting at +offset+.
2219
+ */
2220
+ static VALUE
2221
+ frt_ir_get_norms_into(VALUE self, VALUE rfield, VALUE rnorms, VALUE roffset)
2222
+ {
2223
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2224
+ int offset;
2225
+ offset = FIX2INT(roffset);
2226
+ Check_Type(rnorms, T_STRING);
2227
+ if (RSTRING_LEN(rnorms) < offset + ir->max_doc(ir)) {
2228
+ rb_raise(rb_eArgError, "supplied a string of length:%ld to "
2229
+ "IndexReader#get_norms_into but needed a string of length "
2230
+ "offset:%d + maxdoc:%d",
2231
+ RSTRING_LEN(rnorms), offset, ir->max_doc(ir));
2232
+ }
2233
+
2234
+ ir_get_norms_into(ir, frt_field(rfield),
2235
+ (uchar *)rs2s(rnorms) + offset);
2236
+ return rnorms;
2237
+ }
2238
+
2239
+ /*
2240
+ * call-seq:
2241
+ * index_reader.commit -> index_reader
2242
+ *
2243
+ * Commit any deletes made by this particular IndexReader to the index. This
2244
+ * will open a Commit lock.
2245
+ */
2246
+ static VALUE
2247
+ frt_ir_commit(VALUE self)
2248
+ {
2249
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2250
+ ir_commit(ir);
2251
+ return self;
2252
+ }
2253
+
2254
+ /*
2255
+ * call-seq:
2256
+ * index_reader.close -> index_reader
2257
+ *
2258
+ * Close the IndexReader. This method also commits any deletions made by this
2259
+ * IndexReader. This method will be called implicitly by the garbage
2260
+ * collector but you should call it explicitly to commit any changes as soon
2261
+ * as possible and to close any locks held by the object to prevent locking
2262
+ * errors.
2263
+ */
2264
+ static VALUE
2265
+ frt_ir_close(VALUE self)
2266
+ {
2267
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2268
+ object_del(ir);
2269
+ Frt_Unwrap_Struct(self);
2270
+ ir_close(ir);
2271
+ return self;
2272
+ }
2273
+
2274
+ /*
2275
+ * call-seq:
2276
+ * index_reader.has_deletions? -> bool
2277
+ *
2278
+ * Return true if the index has any deletions, either uncommitted by this
2279
+ * IndexReader or committed by any other IndexReader.
2280
+ */
2281
+ static VALUE
2282
+ frt_ir_has_deletions(VALUE self)
2283
+ {
2284
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2285
+ return ir->has_deletions(ir) ? Qtrue : Qfalse;
2286
+ }
2287
+
2288
+ /*
2289
+ * call-seq:
2290
+ * index_reader.delete(doc_id) -> index_reader
2291
+ *
2292
+ * Delete document referenced internally by document id +doc_id+. The
2293
+ * document_id is the number used to reference documents in the index and is
2294
+ * returned by search methods.
2295
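+ *
+ * For example, deleting a document and then committing the change:
+ *
+ *    index_reader.delete(doc_id)
+ *    index_reader.commit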
+ */
2296
+ static VALUE
2297
+ frt_ir_delete(VALUE self, VALUE rdoc_id)
2298
+ {
2299
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2300
+ ir_delete_doc(ir, FIX2INT(rdoc_id));
2301
+ return self;
2302
+ }
2303
+
2304
+ /*
2305
+ * call-seq:
2306
+ * index_reader.deleted?(doc_id) -> bool
2307
+ *
2308
+ * Returns true if the document at +doc_id+ has been deleted.
2309
+ */
2310
+ static VALUE
2311
+ frt_ir_is_deleted(VALUE self, VALUE rdoc_id)
2312
+ {
2313
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2314
+ return ir->is_deleted(ir, FIX2INT(rdoc_id)) ? Qtrue : Qfalse;
2315
+ }
2316
+
2317
+ /*
2318
+ * call-seq:
2319
+ * index_reader.max_doc -> number
2320
+ *
2321
+ * Returns 1 + the maximum document id in the index. It is the
2322
+ * document_id that will be used by the next document added to the index. If
2323
+ * there are no deletions, this number also refers to the number of documents
2324
+ * in the index.
2325
+ */
2326
+ static VALUE
2327
+ frt_ir_max_doc(VALUE self)
2328
+ {
2329
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2330
+ return INT2FIX(ir->max_doc(ir));
2331
+ }
2332
+
2333
+ /*
2334
+ * call-seq:
2335
+ * index_reader.num_docs -> number
2336
+ *
2337
+ * Returns the number of accessible (not deleted) documents in the index.
2338
+ * This will be equal to IndexReader#max_doc if there have been no documents
2339
+ * deleted from the index.
2340
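+ *
+ * For example:
+ *
+ *    index_reader.max_doc    # includes documents flagged as deleted
+ *    index_reader.num_docs   # excludes documents flagged as deleted
+ *    deleted = index_reader.max_doc - index_reader.num_docs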
+ */
2341
+ static VALUE
2342
+ frt_ir_num_docs(VALUE self)
2343
+ {
2344
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2345
+ return INT2FIX(ir->num_docs(ir));
2346
+ }
2347
+
2348
+ /*
2349
+ * call-seq:
2350
+ * index_reader.undelete_all -> index_reader
2351
+ *
2352
+ * Undelete all deleted documents in the index. This is kind of like a
2353
+ * rollback feature. Note that once an index is committed or a merge happens
2354
+ * during indexing, deletions will be committed and undelete_all will have no
2355
+ * effect on these documents.
2356
+ */
2357
+ static VALUE
2358
+ frt_ir_undelete_all(VALUE self)
2359
+ {
2360
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2361
+ ir_undelete_all(ir);
2362
+ return self;
2363
+ }
2364
+
2365
+ static VALUE
2366
+ frt_get_doc_range(IndexReader *ir, int pos, int len, int max)
2367
+ {
2368
+ VALUE ary;
2369
+ int i;
2370
+ max = min2(max, pos+len);
2371
+ len = max - pos;
2372
+ ary = rb_ary_new2(len);
2373
+ for (i = 0; i < len; i++) {
2374
+ rb_ary_store(ary, i, frt_get_lazy_doc(ir->get_lazy_doc(ir, i + pos)));
2375
+ }
2376
+ return ary;
2377
+ }
2378
+
2379
+ /*
2380
+ * call-seq:
2381
+ * index_reader.get_document(doc_id) -> LazyDoc
2382
+ * index_reader[doc_id] -> LazyDoc
2383
+ *
2384
+ * Retrieve a document from the index. See LazyDoc for more details on the
2385
+ * document returned. Documents are referenced internally by document ids
2386
+ * which are returned by the Searcher's search methods.
2387
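+ *
+ * A brief sketch (the field name is only illustrative):
+ *
+ *    doc = index_reader[0]          # a single LazyDoc
+ *    puts doc[:title]
+ *    docs = index_reader[0, 10]     # an array of up to 10 LazyDocs
+ *    docs = index_reader[0...10]    # ranges work too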
+ */
2388
+ static VALUE
2389
+ frt_ir_get_doc(int argc, VALUE *argv, VALUE self)
2390
+ {
2391
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2392
+ VALUE arg1, arg2;
2393
+ long pos, len;
2394
+ long max = ir->max_doc(ir);
2395
+ rb_scan_args(argc, argv, "11", &arg1, &arg2);
2396
+ if (argc == 1) {
2397
+ if (FIXNUM_P(arg1)) {
2398
+ pos = FIX2INT(arg1);
2399
+ pos = (pos < 0) ? (max + pos) : pos;
2400
+ if (pos < 0 || pos >= max) {
2401
+ rb_raise(rb_eArgError, ":%ld is out of range [%d..%ld] for "
2402
+ "IndexReader#[]", pos, 0, max);
2403
+ }
2404
+ return frt_get_lazy_doc(ir->get_lazy_doc(ir, pos));
2405
+ }
2406
+
2407
+ /* check if idx is Range */
2408
+ switch (rb_range_beg_len(arg1, &pos, &len, max, 0)) {
2409
+ case Qfalse:
2410
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for "
2411
+ "IndexReader.get_document(index)",
2412
+                      rs2s(rb_obj_as_string(arg1)));
2413
+ case Qnil:
2414
+ return Qnil;
2415
+ default:
2416
+ return frt_get_doc_range(ir, pos, len, max);
2417
+ }
2418
+ }
2419
+ else {
2420
+ pos = FIX2LONG(arg1);
2421
+ len = FIX2LONG(arg2);
2422
+ return frt_get_doc_range(ir, pos, len, max);
2423
+ }
2424
+ }
2425
+
2426
+ /*
2427
+ * call-seq:
2428
+ * index_reader.is_latest? -> bool
2429
+ *
2430
+ * Return true if the index version referenced by this IndexReader is the
2431
+ * latest version of the index. If it isn't you should close and reopen the
2432
+ * index to search the latest documents added to the index.
2433
+ */
2434
+ static VALUE
2435
+ frt_ir_is_latest(VALUE self)
2436
+ {
2437
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2438
+ return ir_is_latest(ir) ? Qtrue : Qfalse;
2439
+ }
2440
+
2441
+ /*
2442
+ * call-seq:
2443
+ * index_reader.term_vector(doc_id, field) -> TermVector
2444
+ *
2445
+ * Return the TermVector for the field +field+ in the document at +doc_id+ in
2446
+ * the index. Return nil if no such term_vector exists. See TermVector.
2447
+ */
2448
+ static VALUE
2449
+ frt_ir_term_vector(VALUE self, VALUE rdoc_id, VALUE rfield)
2450
+ {
2451
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2452
+ TermVector *tv;
2453
+ VALUE rtv;
2454
+ tv = ir->term_vector(ir, FIX2INT(rdoc_id), frt_field(rfield));
2455
+ if (tv) {
2456
+ rtv = frt_get_tv(tv);
2457
+ tv_destroy(tv);
2458
+ return rtv;
2459
+ }
2460
+ else {
2461
+ return Qnil;
2462
+ }
2463
+ }
2464
+
2465
+ static void
2466
+ frt_add_each_tv(void *key, void *value, void *rtvs)
2467
+ {
2468
+ rb_hash_aset((VALUE)rtvs, ID2SYM(rb_intern(key)), frt_get_tv(value));
2469
+ }
2470
+
2471
+ /*
2472
+ * call-seq:
2473
+ * index_reader.term_vectors(doc_id) -> hash of TermVector
2474
+ *
2475
+ * Return the TermVectors for the document at +doc_id+ in the index. The
2476
+ * value returned is a hash of the TermVectors for each field in the document
2477
+ * and they are referenced by field names (as symbols).
2478
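+ *
+ * For example (the field name is only illustrative):
+ *
+ *    tvs = index_reader.term_vectors(doc_id)
+ *    content_tv = tvs[:content]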
+ */
2479
+ static VALUE
2480
+ frt_ir_term_vectors(VALUE self, VALUE rdoc_id)
2481
+ {
2482
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2483
+ HashTable *tvs = ir->term_vectors(ir, FIX2INT(rdoc_id));
2484
+ VALUE rtvs = rb_hash_new();
2485
+ h_each(tvs, &frt_add_each_tv, (void *)rtvs);
2486
+ h_destroy(tvs);
2487
+
2488
+ return rtvs;
2489
+ }
2490
+
2491
+ /*
2492
+ * call-seq:
2493
+ * index_reader.term_docs -> TermDocEnum
2494
+ *
2495
+ * Builds a TermDocEnum (term-document enumerator) for the index. You can use
2496
+ * this object to iterate through the documents in which certain terms occur.
2497
+ * See TermDocEnum for more info.
2498
+ */
2499
+ static VALUE
2500
+ frt_ir_term_docs(VALUE self)
2501
+ {
2502
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2503
+ return frt_get_tde(self, ir->term_docs(ir));
2504
+ }
2505
+
2506
+ /*
2507
+ * call-seq:
2508
+ * index_reader.term_docs_for(field, term) -> TermDocEnum
2509
+ *
2510
+ * Builds a TermDocEnum to iterate through the documents that contain the
2511
+ * term +term+ in the field +field+. See TermDocEnum for more info.
2512
+ */
2513
+ static VALUE
2514
+ frt_ir_term_docs_for(VALUE self, VALUE rfield, VALUE rterm)
2515
+ {
2516
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2517
+ return frt_get_tde(self, ir_term_docs_for(ir,
2518
+ frt_field(rfield),
2519
+ StringValuePtr(rterm)));
2520
+ }
2521
+
2522
+ /*
2523
+ * call-seq:
2524
+ * index_reader.term_positions -> TermDocEnum
2525
+ *
2526
+ * Same as IndexReader#term_docs except the TermDocEnum will also allow you
2527
+ * to scan through the positions at which a term occurs. See TermDocEnum for
2528
+ * more info.
2529
+ */
2530
+ static VALUE
2531
+ frt_ir_term_positions(VALUE self)
2532
+ {
2533
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2534
+ return frt_get_tde(self, ir->term_positions(ir));
2535
+ }
2536
+
2537
+ /*
2538
+ * call-seq:
2539
+ * index_reader.term_positions_for(field, term) -> TermDocEnum
2540
+ *
2541
+ * Same as IndexReader#term_docs_for(field, term) except the TermDocEnum will
2542
+ * also allow you to scan through the positions at which a term occurs. See
2543
+ * TermDocEnum for more info.
2544
+ */
2545
+ static VALUE
2546
+ frt_ir_t_pos_for(VALUE self, VALUE rfield, VALUE rterm)
2547
+ {
2548
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2549
+ return frt_get_tde(self, ir_term_positions_for(ir,
2550
+ frt_field(rfield),
2551
+ StringValuePtr(rterm)));
2552
+ }
2553
+
2554
+ /*
2555
+ * call-seq:
2556
+ * index_reader.doc_freq(field, term) -> integer
2557
+ *
2558
+ * Return the number of documents in which the term +term+ appears in the
2559
+ * field +field+.
2560
+ */
2561
+ static VALUE
2562
+ frt_ir_doc_freq(VALUE self, VALUE rfield, VALUE rterm)
2563
+ {
2564
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2565
+ return INT2FIX(ir_doc_freq(ir,
2566
+ frt_field(rfield),
2567
+ StringValuePtr(rterm)));
2568
+ }
2569
+
2570
+ /*
2571
+ * call-seq:
2572
+ * index_reader.terms(field) -> TermEnum
2573
+ *
2574
+ * Returns a term enumerator which allows you to iterate through all the
2575
+ * terms in the field +field+ in the index.
2576
+ */
2577
+ static VALUE
2578
+ frt_ir_terms(VALUE self, VALUE rfield)
2579
+ {
2580
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2581
+ return frt_get_te(self, ir_terms(ir, frt_field(rfield)));
2582
+ }
2583
+
2584
+ /*
2585
+ * call-seq:
2586
+ * index_reader.terms_from(field, term) -> TermEnum
2587
+ *
2588
+ * Same as IndexReader#terms(field) except that it starts the enumerator off
2589
+ * at term +term+.
2590
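+ *
+ * A brief sketch (the field and starting term are only illustrative),
+ * assuming TermEnum#each then yields the remaining terms from that point:
+ *
+ *    te = index_reader.terms_from(:content, "fox")
+ *    te.each {|term, doc_freq| puts "#{term} occurred in #{doc_freq} docs"}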
+ */
2591
+ static VALUE
2592
+ frt_ir_terms_from(VALUE self, VALUE rfield, VALUE rterm)
2593
+ {
2594
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2595
+ return frt_get_te(self, ir_terms_from(ir,
2596
+ frt_field(rfield),
2597
+ StringValuePtr(rterm)));
2598
+ }
2599
+
2600
+ /*
2601
+ * call-seq:
2602
+ * index_reader.term_count(field) -> int
2603
+ *
2604
+ * Returns a count of the number of terms in the field +field+.
2605
+ */
2606
+ static VALUE
2607
+ frt_ir_term_count(VALUE self, VALUE rfield)
2608
+ {
2609
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2610
+ TermEnum *te = ir_terms(ir, frt_field(rfield));
2611
+ int count = 0;
2612
+ while (te->next(te)) {
2613
+ count++;
2614
+ }
2615
+ te->close(te);
2616
+ return INT2FIX(count);
2617
+ }
2618
+
2619
+ /*
2620
+ * call-seq:
2621
+ * index_reader.fields -> array of field-names
2622
+ *
2623
+ * Returns an array of field names in the index. This can be used to pass to
2624
+ * the QueryParser so that the QueryParser knows how to expand the "*"
2625
+ * wild-card to all fields in the index. A list of field names can also be
2626
+ * gathered from the FieldInfos object.
2627
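+ *
+ * A hedged example, assuming Ferret::QueryParser accepts a +:fields+ option:
+ *
+ *    qp = Ferret::QueryParser.new(:fields => index_reader.fields)
+ *    query = qp.parse("*:ferret")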
+ */
2628
+ static VALUE
2629
+ frt_ir_fields(VALUE self)
2630
+ {
2631
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2632
+ FieldInfos *fis = ir->fis;
2633
+ VALUE rfield_names = rb_ary_new();
2634
+ int i;
2635
+ for (i = 0; i < fis->size; i++) {
2636
+ rb_ary_push(rfield_names, ID2SYM(rb_intern(fis->fields[i]->name)));
2637
+ }
2638
+ return rfield_names;
2639
+ }
2640
+
2641
+ /*
2642
+ * call-seq:
2643
+ * index_reader.field_infos -> FieldInfos
2644
+ *
2645
+ * Get the FieldInfos object for this IndexReader.
2646
+ */
2647
+ static VALUE
2648
+ frt_ir_field_infos(VALUE self)
2649
+ {
2650
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2651
+ return frt_get_field_infos(ir->fis);
2652
+ }
2653
+
2654
+ /*
2655
+ * call-seq:
2656
+ * index_reader.tokenized_fields -> array of field-names
2657
+ *
2658
+ * Returns an array of field names of all of the tokenized fields in the
2659
+ * index. This can be used to pass to the QueryParser so that the QueryParser
2660
+ * knows how to expand the "*" wild-card to all fields in the index. A list
2661
+ * of field names can also be gathered from the FieldInfos object.
2662
+ */
2663
+ static VALUE
2664
+ frt_ir_tk_fields(VALUE self)
2665
+ {
2666
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2667
+ FieldInfos *fis = ir->fis;
2668
+ VALUE rfield_names = rb_ary_new();
2669
+ int i;
2670
+ for (i = 0; i < fis->size; i++) {
2671
+ if (!fi_is_tokenized(fis->fields[i])) continue;
2672
+ rb_ary_push(rfield_names, ID2SYM(rb_intern(fis->fields[i]->name)));
2673
+ }
2674
+ return rfield_names;
2675
+ }
2676
+
2677
+ /*
2678
+ * call-seq:
2679
+ * index_reader.version -> int
2680
+ *
2681
+ * Returns the current version of the index reader.
2682
+ */
2683
+ static VALUE
2684
+ frt_ir_version(VALUE self)
2685
+ {
2686
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2687
+ return ULL2NUM(ir->sis->version);
2688
+ }
2689
+
2690
+ /****************************************************************************
2691
+ *
2692
+ * Init Functions
2693
+ *
2694
+ ****************************************************************************/
2695
+
2696
+
2697
+ /*
2698
+ * Document-class: Ferret::Index::FieldInfo
2699
+ *
2700
+ * == Summary
2701
+ *
2702
+ * The FieldInfo class is the field descriptor for the index. It specifies
2703
+ * whether a field is compressed or not or whether it should be indexed and
2704
+ * tokenized. Every field has a name which must be a symbol. There are three
2705
+ * properties that you can set, +:store+, +:index+ and +:term_vector+. You
2706
+ * can also set the default +:boost+ for a field as well.
2707
+ *
2708
+ * == Properties
2709
+ *
2710
+ * === :store
2711
+ *
2712
+ * The +:store+ property allows you to specify how a field is stored. You can
2713
+ * leave a field unstored (+:no+), store it in its original format (+:yes+)
2714
+ * or store it in compressed format (+:compressed+). By default the field
2715
+ * is stored in its original format. If the field is large and it is stored
2716
+ * elsewhere where it is easily accessible you might want to leave it
2717
+ * unstored. This will keep the index size a lot smaller and make the
2718
+ * indexing process a lot faster. For example, you should probably leave the
2719
+ * +:content+ field unstored when indexing all the documents in your
2720
+ * file-system.
2721
+ *
2722
+ * === :index
2723
+ *
2724
+ * The +:index+ property allows you to specify how a field is indexed. A
2725
+ * field must be indexed to be searchable. However, a field doesn't need to
2726
+ * be indexed to be stored in the Ferret index. You may want to use the index
2727
+ * as a simple database and store things like images or MP3s in the index. By
2728
+ * default each field is indexed and tokenized (split into tokens) (+:yes+).
2729
+ * If you don't want to index the field use +:no+. If you want the field
2730
+ * indexed but not tokenized, use +:untokenized+. Do this for the fields you
2731
+ * wish to sort by. There are two other values for +:index+; +:omit_norms+
2732
+ * and +:untokenized_omit_norms+. These values correspond to +:yes+ and
2733
+ * +:untokenized+ respectively and are useful if you are not boosting any
2734
+ * fields and you'd like to speed up the index. The norms file is the file
2735
+ * which contains the boost values for each document for a particular field.
2736
+ *
2737
+ * === :term_vector
2738
+ *
2739
+ * See TermVector for a description of term-vectors. You can specify whether
2740
+ * or not you would like to store term-vectors. The available options are
2741
+ * +:no+, +:yes+, +:with_positions+, +:with_offsets+ and
2742
+ * +:with_positions_offsets+. Note that you need to store the positions to
2743
+ * associate offsets with individual terms in the term_vector.
2744
+ *
2745
+ * == Property Table
2746
+ *
2747
+ * Property Value Description
2748
+ * ------------------------------------------------------------------------
2749
+ * :store | :no | Don't store field
2750
+ * | |
2751
+ * | :yes (default) | Store field in its original
2752
+ * | | format. Use this value if you
2753
+ *              | |                         want to highlight matches
2754
+ * | | or print match excerpts a la
2755
+ * | | Google search.
2756
+ * | |
2757
+ * | :compressed | Store field in compressed
2758
+ * | | format.
2759
+ * -------------|-------------------------|------------------------------
2760
+ * :index | :no | Do not make this field
2761
+ * | | searchable.
2762
+ * | |
2763
+ * | :yes (default) | Make this field searchable and
2764
+ *              | |                         tokenize its contents.
2765
+ * | |
2766
+ * | :untokenized | Make this field searchable but
2767
+ * | | do not tokenize its contents.
2768
+ *              | |                         Use this value for fields you
2769
+ * | | wish to sort by.
2770
+ * | |
2771
+ * | :omit_norms | Same as :yes except omit the
2772
+ * | | norms file. The norms file can
2773
+ * | | be omitted if you don't boost
2774
+ * | | any fields and you don't need
2775
+ * | | scoring based on field length.
2776
+ * | |
2777
+ * | :untokenized_omit_norms | Same as :untokenized except omit
2778
+ * | | the norms file. Norms files can
2779
+ * | | be omitted if you don't boost
2780
+ * | | any fields and you don't need
2781
+ * | | scoring based on field length.
2782
+ * | |
2783
+ * -------------|-------------------------|------------------------------
2784
+ * :term_vector | :no | Don't store term-vectors
2785
+ * | |
2786
+ * | :yes | Store term-vectors without
2787
+ * | | storing positions or offsets.
2788
+ * | |
2789
+ * | :with_positions | Store term-vectors with
2790
+ * | | positions.
2791
+ * | |
2792
+ * | :with_offsets | Store term-vectors with
2793
+ * | | offsets.
2794
+ * | |
2795
+ * | :with_positions_offsets | Store term-vectors with
2796
+ * | (default) | positions and offsets.
2797
+ * -------------|-------------------------|------------------------------
2798
+ * :boost | Float | The boost property is used to
2799
+ * | | set the default boost for a
2800
+ *              | |                         field. This boost value will be
2801
+ * | | used for all instances of the
2802
+ * | | field in the index unless
2803
+ * | | otherwise specified when you
2804
+ * | | create the field. All values
2805
+ * | | should be positive.
2806
+ * | |
2807
+ *
2808
+ * == Examples
2809
+ *
2810
+ * fi = FieldInfo.new(:title, :index => :untokenized, :term_vector => :no,
2811
+ * :boost => 10.0)
2812
+ *
2813
+ * fi = FieldInfo.new(:content)
2814
+ *
2815
+ * fi = FieldInfo.new(:created_on, :index => :untokenized_omit_norms,
2816
+ * :term_vector => :no)
2817
+ *
2818
+ * fi = FieldInfo.new(:image, :store => :compressed, :index => :no,
2819
+ * :term_vector => :no)
2820
+ */
2821
+ static void
2822
+ Init_FieldInfo(void)
2823
+ {
2824
+ sym_store = ID2SYM(rb_intern("store"));
2825
+ sym_index = ID2SYM(rb_intern("index"));
2826
+ sym_term_vector = ID2SYM(rb_intern("term_vector"));
2827
+
2828
+ sym_compress = ID2SYM(rb_intern("compress"));
2829
+ sym_compressed = ID2SYM(rb_intern("compressed"));
2830
+
2831
+ sym_untokenized = ID2SYM(rb_intern("untokenized"));
2832
+ sym_omit_norms = ID2SYM(rb_intern("omit_norms"));
2833
+ sym_untokenized_omit_norms = ID2SYM(rb_intern("untokenized_omit_norms"));
2834
+
2835
+ sym_with_positions = ID2SYM(rb_intern("with_positions"));
2836
+ sym_with_offsets = ID2SYM(rb_intern("with_offsets"));
2837
+ sym_with_positions_offsets = ID2SYM(rb_intern("with_positions_offsets"));
2838
+
2839
+ cFieldInfo = rb_define_class_under(mIndex, "FieldInfo", rb_cObject);
2840
+ rb_define_alloc_func(cFieldInfo, frt_data_alloc);
2841
+
2842
+ rb_define_method(cFieldInfo, "initialize", frt_fi_init, -1);
2843
+ rb_define_method(cFieldInfo, "name", frt_fi_name, 0);
2844
+ rb_define_method(cFieldInfo, "stored?", frt_fi_is_stored, 0);
2845
+ rb_define_method(cFieldInfo, "compressed?", frt_fi_is_compressed, 0);
2846
+ rb_define_method(cFieldInfo, "indexed?", frt_fi_is_indexed, 0);
2847
+ rb_define_method(cFieldInfo, "tokenized?", frt_fi_is_tokenized, 0);
2848
+ rb_define_method(cFieldInfo, "omit_norms?", frt_fi_omit_norms, 0);
2849
+ rb_define_method(cFieldInfo, "store_term_vector?",
2850
+ frt_fi_store_term_vector, 0);
2851
+ rb_define_method(cFieldInfo, "store_positions?",
2852
+ frt_fi_store_positions, 0);
2853
+ rb_define_method(cFieldInfo, "store_offsets?",
2854
+ frt_fi_store_offsets, 0);
2855
+ rb_define_method(cFieldInfo, "has_norms?", frt_fi_has_norms, 0);
2856
+ rb_define_method(cFieldInfo, "boost", frt_fi_boost, 0);
2857
+ rb_define_method(cFieldInfo, "to_s", frt_fi_to_s, 0);
2858
+ }
2859
+
2860
+ /*
2861
+ * Document-class: Ferret::Index::FieldInfos
2862
+ *
2863
+ * == Summary
2864
+ *
2865
+ * The FieldInfos class holds all the field descriptors for an index. It is
2866
+ * this class that is used to create a new index using the
2867
+ * FieldInfos#create_index method. If you are happy with the default
2868
+ * properties for FieldInfo then you don't need to worry about this class.
2869
+ * IndexWriter can create the index for you. Otherwise you should set up the
2870
+ * index like in the example;
2871
+ *
2872
+ * == Example
2873
+ *
2874
+ * field_infos = FieldInfos.new(:term_vector => :no)
2875
+ *
2876
+ * field_infos.add_field(:title, :index => :untokenized, :term_vector => :no,
2877
+ * :boost => 10.0)
2878
+ *
2879
+ * field_infos.add_field(:content)
2880
+ *
2881
+ * field_infos.add_field(:created_on, :index => :untokenized_omit_norms,
2882
+ * :term_vector => :no)
2883
+ *
2884
+ * field_infos.add_field(:image, :store => :compressed, :index => :no,
2885
+ * :term_vector => :no)
2886
+ *
2887
+ * field_infos.create_index("/path/to/index")
2888
+ *
2889
+ * == Default Properties
2890
+ *
2891
+ * See FieldInfo for the available field property values.
2892
+ *
2893
+ * When you create the FieldInfos object you specify the default properties
2894
+ * for the fields. Often you'll specify all of the fields in the index before
2895
+ * you create the index so the default values won't come into play. However,
2896
+ * it is possible to continue to dynamically add fields as indexing goes
2897
+ * along. If you add a document to the index which has fields that the index
2898
+ * doesn't know about then the default properties are used for the new field.
2899
+ */
2900
+ static void
2901
+ Init_FieldInfos(void)
2902
+ {
2903
+ Init_FieldInfo();
2904
+
2905
+ cFieldInfos = rb_define_class_under(mIndex, "FieldInfos", rb_cObject);
2906
+ rb_define_alloc_func(cFieldInfos, frt_data_alloc);
2907
+
2908
+ rb_define_method(cFieldInfos, "initialize", frt_fis_init, -1);
2909
+ rb_define_method(cFieldInfos, "to_a", frt_fis_to_a, 0);
2910
+ rb_define_method(cFieldInfos, "[]", frt_fis_get, 1);
2911
+ rb_define_method(cFieldInfos, "add", frt_fis_add, 1);
2912
+ rb_define_method(cFieldInfos, "<<", frt_fis_add, 1);
2913
+ rb_define_method(cFieldInfos, "add_field", frt_fis_add_field, -1);
2914
+ rb_define_method(cFieldInfos, "each", frt_fis_each, 0);
2915
+ rb_define_method(cFieldInfos, "to_s", frt_fis_to_s, 0);
2916
+ rb_define_method(cFieldInfos, "size", frt_fis_size, 0);
2917
+ rb_define_method(cFieldInfos, "create_index",
2918
+ frt_fis_create_index, 1);
2919
+ rb_define_method(cFieldInfos, "fields", frt_fis_get_fields, 0);
2920
+ rb_define_method(cFieldInfos, "tokenized_fields", frt_fis_get_tk_fields, 0);
2921
+ }
2922
+
2923
+ /*
2924
+ * Document-class: Ferret::Index::TermEnum
2925
+ *
2926
+ * == Summary
2927
+ *
2928
+ * The TermEnum object is used to iterate through the terms in a field. To
2929
+ * get a TermEnum you need to use the IndexReader#terms(field) method.
2930
+ *
2931
+ * == Example
2932
+ *
2933
+ * te = index_reader.terms(:content)
2934
+ *
2935
+ * te.each {|term, doc_freq| puts "#{term} occurred #{doc_freq} times" }
2936
+ *
2937
+ * # or you could do it like this;
2938
+ * te = index_reader.terms(:content)
2939
+ *
2940
+ * while te.next?
2941
+ *      puts "#{te.term} occurred in #{te.doc_freq} documents in the index"
2942
+ * end
2943
+ */
2944
+ static void
2945
+ Init_TermEnum(void)
2946
+ {
2947
+ id_term = rb_intern("@term");
2948
+
2949
+ cTermEnum = rb_define_class_under(mIndex, "TermEnum", rb_cObject);
2950
+ rb_define_alloc_func(cTermEnum, frt_data_alloc);
2951
+
2952
+ rb_define_method(cTermEnum, "next?", frt_te_next, 0);
2953
+ rb_define_method(cTermEnum, "term", frt_te_term, 0);
2954
+ rb_define_method(cTermEnum, "doc_freq", frt_te_doc_freq, 0);
2955
+ rb_define_method(cTermEnum, "skip_to", frt_te_skip_to, 1);
2956
+ rb_define_method(cTermEnum, "each", frt_te_each, 0);
2957
+ rb_define_method(cTermEnum, "field=", frt_te_set_field, 1);
2958
+ rb_define_method(cTermEnum, "set_field",frt_te_set_field, 1);
2959
+ rb_define_method(cTermEnum, "to_json", frt_te_to_json, -1);
2960
+ }
2961
+
2962
+ /*
2963
+ * Document-class: Ferret::Index::TermDocEnum
2964
+ *
2965
+ * == Summary
2966
+ *
2967
+ * Use a TermDocEnum to iterate through the documents that contain a
2968
+ * particular term. You can also iterate through the positions at which the term
2969
+ * occurs in a document.
2970
+ *
2971
+ *
2972
+ * == Example
2973
+ *
2974
+ * tde = index_reader.term_docs_for(:content, "fox")
2975
+ *
2976
+ * tde.each do |doc_id, freq|
2977
+ * puts "fox appeared #{freq} times in document #{doc_id}:"
2978
+ * positions = []
2979
+ * tde.each_position {|pos| positions << pos}
2980
+ * puts " #{positions.join(', ')}"
2981
+ * end
2982
+ *
2983
+ * # or you can do it like this;
2984
+ * tde.seek(:title, "red")
2985
+ * while tde.next?
2986
+ * puts "red appeared #{tde.freq} times in document #{tde.doc}:"
2987
+ * positions = []
2988
+ * while pos = tde.next_position
2989
+ * positions << pos
2990
+ * end
2991
+ * puts " #{positions.join(', ')}"
2992
+ * end
2993
+ */
2994
+ static void
2995
+ Init_TermDocEnum(void)
2996
+ {
2997
+ id_fld_num_map = rb_intern("@field_num_map");
2998
+ id_field_num = rb_intern("@field_num");
2999
+
3000
+ cTermDocEnum = rb_define_class_under(mIndex, "TermDocEnum", rb_cObject);
3001
+ rb_define_alloc_func(cTermDocEnum, frt_data_alloc);
3002
+ rb_define_method(cTermDocEnum, "seek", frt_tde_seek, 2);
3003
+ rb_define_method(cTermDocEnum, "seek_term_enum", frt_tde_seek_te, 1);
3004
+ rb_define_method(cTermDocEnum, "doc", frt_tde_doc, 0);
3005
+ rb_define_method(cTermDocEnum, "freq", frt_tde_freq, 0);
3006
+ rb_define_method(cTermDocEnum, "next?", frt_tde_next, 0);
3007
+ rb_define_method(cTermDocEnum, "next_position", frt_tde_next_position, 0);
3008
+ rb_define_method(cTermDocEnum, "each", frt_tde_each, 0);
3009
+ rb_define_method(cTermDocEnum, "each_position", frt_tde_each_position, 0);
3010
+ rb_define_method(cTermDocEnum, "skip_to", frt_tde_skip_to, 1);
3011
+ rb_define_method(cTermDocEnum, "to_json", frt_tde_to_json, -1);
3012
+ }
3013
+
3014
+ /* rdochack
3015
+ cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
3016
+ */
3017
+
3018
+ /*
3019
+ * Document-class: Ferret::Index::TermVector::TVOffsets
3020
+ *
3021
+ * == Summary
3022
+ *
3023
+ * Holds the start and end byte-offsets of a term in a field. For example, if
3024
+ * the field was "the quick brown fox" then the start and end offsets of:
3025
+ *
3026
+ * ["the", "quick", "brown", "fox"]
3027
+ *
3028
+ * Would be:
3029
+ *
3030
+ * [(0,3), (4,9), (10,15), (16,19)]
3031
+ *
3032
+ * See the Analysis module for more information on setting the offsets.
3033
+ */
3034
+ static void
3035
+ Init_TVOffsets(void)
3036
+ {
3037
+ const char *tv_offsets_class = "TVOffsets";
3038
+ /* rdochack
3039
+ cTVOffsets = rb_define_class_under(cTermVector, "TVOffsets", rb_cObject);
3040
+ */
3041
+ cTVOffsets = rb_struct_define(tv_offsets_class, "start", "end", NULL);
3042
+ rb_set_class_path(cTVOffsets, cTermVector, tv_offsets_class);
3043
+ rb_const_set(mIndex, rb_intern(tv_offsets_class), cTVOffsets);
3044
+ }
3045
+
3046
+ /*
3047
+ * Document-class: Ferret::Index::TermVector::TVTerm
3048
+ *
3049
+ * == Summary
3050
+ *
3051
+ * The TVTerm class holds the term information for each term in a TermVector.
3052
+ * That is it holds the term's text and its positions in the document. You
3053
+ * can use those positions to reference the offsets for the term.
3054
+ *
3055
+ * == Example
3056
+ *
3057
+ *    tv = index_reader.term_vector(doc_id, :content)
3058
+ *    tv_term = tv.terms.find {|tvt| tvt.text == "fox"}
3059
+ * offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
3060
+ */
3061
+ static void
3062
+ Init_TVTerm(void)
3063
+ {
3064
+ const char *tv_term_class = "TVTerm";
3065
+ /* rdochack
3066
+ cTVTerm = rb_define_class_under(cTermVector, "TVTerm", rb_cObject);
3067
+ */
3068
+ cTVTerm = rb_struct_define(tv_term_class, "text", "positions", NULL);
3069
+ rb_set_class_path(cTVTerm, cTermVector, tv_term_class);
3070
+ rb_const_set(mIndex, rb_intern(tv_term_class), cTVTerm);
3071
+ }
3072
+
3073
+ /*
3074
+ * Document-class: Ferret::Index::TermVector
3075
+ *
3076
+ * == Summary
3077
+ *
3078
+ * TermVectors are most commonly used for creating search result excerpts and
3079
+ * highlighting search matches in results. This is all done internally so you
3080
+ * won't need to worry about the TermVector object. There are some other
3081
+ * reasons you may want to use the TermVector object, however. For example,
3082
+ * you may wish to see which terms are the most commonly occurring terms in a
3083
+ * document to implement a MoreLikeThis search.
3084
+ *
3085
+ * == Example
3086
+ *
3087
+ *    tv = index_reader.term_vector(doc_id, :content)
3088
+ *    tv_term = tv.terms.find {|tvt| tvt.text == "fox"}
3089
+ *
3090
+ * # get the term frequency
3091
+ * term_freq = tv_term.positions.size
3092
+ *
3093
+ * # get the offsets for a term
3094
+ * offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
3095
+ *
3096
+ * == Note
3097
+ *
3098
+ * +positions+ and +offsets+ can be +nil+ depending on what you set the
3099
+ * +:term_vector+ to when you set the FieldInfo object for the field. Note in
3100
+ * particular that you need to store both positions and offsets if you want
3101
+ * to associate offsets with particular terms.
3102
+ */
3103
+ static void
3104
+ Init_TermVector(void)
3105
+ {
3106
+ const char *tv_class = "TermVector";
3107
+ /* rdochack
3108
+ cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
3109
+ */
3110
+ cTermVector = rb_struct_define(tv_class,
3111
+ "field", "terms", "offsets", NULL);
3112
+ rb_set_class_path(cTermVector, mIndex, tv_class);
3113
+ rb_const_set(mIndex, rb_intern(tv_class), cTermVector);
3114
+
3115
+ Init_TVOffsets();
3116
+ Init_TVTerm();
3117
+ }
3118
+
3119
+ /*
3120
+ * Document-class: Ferret::Index::IndexWriter
3121
+ *
3122
+ * == Summary
3123
+ *
3124
+ * The IndexWriter is the class used to add documents to an index. You can
3125
+ * also delete documents from the index using this class. The indexing
3126
+ * process is highly customizable and the IndexWriter has the following
3127
+ * parameters;
3128
+ *
3129
+ * dir::                 This is a Ferret::Store::Directory object. You
3130
+ * should either pass a +:dir+ or a +:path+ when
3131
+ * creating an index.
3132
+ * path:: A string representing the path to the index
3133
+ * directory. If you are creating the index for the
3134
+ * first time the directory will be created if it's
3135
+ * missing. You should not choose a directory which
3136
+ * contains other files as they could be over-written.
3137
+ * To protect against this set +:create_if_missing+ to
3138
+ * false.
3139
+ * create_if_missing:: Default: true. Create the index if no index is
3140
+ * found in the specified directory. Otherwise, use
3141
+ * the existing index.
3142
+ * create:: Default: false. Creates the index, even if one
3143
+ * already exists. That means any existing index will
3144
+ * be deleted. It is probably better to use the
3145
+ * create_if_missing option so that the index is only
3146
+ * created the first time when it doesn't exist.
3147
+ * field_infos:: Default FieldInfos.new. The FieldInfos object to use
3148
+ * when creating a new index if +:create_if_missing+ or
3149
+ * +:create+ is set to true. If an existing index is
3150
+ * opened then this parameter is ignored.
3151
+ * analyzer:: Default: Ferret::Analysis::StandardAnalyzer.
3152
+ * Sets the default analyzer for the index. This is
3153
+ * used by both the IndexWriter and the QueryParser
3154
+ * to tokenize the input. The default is the
3155
+ * StandardAnalyzer.
3156
+ * chunk_size:: Default: 0x100000 or 1Mb. Memory performance tuning
3157
+ * parameter. Sets the default size of chunks of memory
3158
+ * malloced for use during indexing. You can usually
3159
+ * leave this parameter as is.
3160
+ * max_buffer_memory:: Default: 0x1000000 or 16Mb. Memory performance
3161
+ * tuning parameter. Sets the amount of memory to be
3162
+ * used by the indexing process. Set to a larger value
3163
+ * to increase indexing speed. Note that this only
3164
+ * includes memory used by the indexing process, not
3165
+ * the rest of your ruby application.
3166
+ * term_index_interval:: Default: 128. The skip interval between terms in the
3167
+ * term dictionary. A smaller value will possibly
3168
+ * increase search performance while also increasing
3169
+ *                            memory usage and negatively impacting
3170
+ * indexing performance.
3171
+ * doc_skip_interval:: Default: 16. The skip interval for document numbers
3172
+ * in the index. As with +:term_index_interval+ you
3173
+ * have a trade-off. A smaller number may increase
3174
+ * search performance while also increasing memory
3175
+ *                            usage and negatively impacting indexing
3176
+ * performance.
3177
+ * merge_factor:: Default: 10. This must never be less than 2.
3178
+ * Specifies the number of segments of a certain size
3179
+ * that must exist before they are merged. A larger
3180
+ * value will improve indexing performance while
3181
+ * slowing search performance.
3182
+ * max_buffered_docs:: Default: 10000. The maximum number of documents that
3183
+ * may be stored in memory before being written to the
3184
+ * index. If you have a lot of memory and are indexing
3185
+ * a large number of small documents (like products in
3186
+ * a product database for example) you may want to set
3187
+ * this to a much higher number (like
3188
+ * Ferret::FIX_INT_MAX). If you are worried about your
3189
+ *                            application crashing during the middle of indexing you
3190
+ * might set this to a smaller number so that the index
3191
+ * is committed more often. This is like having an
3192
+ * auto-save in a word processor application.
3193
+ * max_merge_docs:: Set this value to limit the number of documents that
3194
+ * go into a single segment. Use this to avoid
3195
+ * extremely long merge times during indexing which can
3196
+ * make your application seem unresponsive. This is
3197
+ * only necessary for very large indexes (millions of
3198
+ * documents).
3199
+ * max_field_length:: Default: 10000. The maximum number of terms added to
3200
+ * a single field. This can be useful to protect the
3201
+ * indexer when indexing documents from the web for
3202
+ * example. Usually the most important terms will occur
3203
+ * early on in a document so you can often safely
3204
+ * ignore the terms in a field after a certain number
3205
+ *                            of them. If you wanted to speed up indexing and save
3206
+ * space in your index you may only want to index the
3207
+ * first 1000 terms in a field. On the other hand, if
3208
+ * you want to be more thorough and you are indexing
3209
+ * documents from your file-system you may set this
3210
+ * parameter to Ferret::FIX_INT_MAX.
3211
+ * use_compound_file:: Default: true. Uses a compound file to store the
3212
+ * index. This prevents an error being raised for
3213
+ * having too many files open at the same time. The
3214
+ * default is true but performance is better if this is
3215
+ * set to false.
3216
+ *
3217
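+ * === Example
+ *
+ * A hedged creation sketch; the path, option values and document fields are
+ * only illustrative:
+ *
+ *    require 'ferret'
+ *    include Ferret::Index
+ *
+ *    iw = IndexWriter.new(:path => "/path/to/index",
+ *                         :create_if_missing => true,
+ *                         :use_compound_file => false)
+ *    iw << {:id => "doc1", :content => "the quick brown fox"}
+ *    iw.commit
+ *    iw.close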
+ *
3218
+ * === Deleting Documents
3219
+ *
3220
+ * Both IndexReader and IndexWriter allow you to delete documents. You should
3221
+ * use the IndexReader to delete documents by document id and IndexWriter to
3222
+ * delete documents by term, which we'll explain now. It is preferable to
3223
+ * delete documents from an index using IndexWriter for performance reasons.
3224
+ * To delete documents using the IndexWriter you should give each document in
3225
+ * the index a unique ID. If you are indexing documents from the file-system
3226
+ * this unique ID will be the full file path. If indexing documents from the
3227
+ * database you should use the primary key as the ID field. You can then
3228
+ * use the delete method to delete the document referenced by the ID. For example;
3229
+ *
3230
+ * index_writer.delete(:id, "/path/to/indexed/file")
3231
+ */
+ void
+ Init_IndexWriter(void)
+ {
+     id_boost = rb_intern("boost");
+
+     sym_create = ID2SYM(rb_intern("create"));
+     sym_create_if_missing = ID2SYM(rb_intern("create_if_missing"));
+     sym_field_infos = ID2SYM(rb_intern("field_infos"));
+
+     sym_chunk_size = ID2SYM(rb_intern("chunk_size"));
+     sym_max_buffer_memory = ID2SYM(rb_intern("max_buffer_memory"));
+     sym_index_interval = ID2SYM(rb_intern("term_index_interval"));
+     sym_skip_interval = ID2SYM(rb_intern("doc_skip_interval"));
+     sym_merge_factor = ID2SYM(rb_intern("merge_factor"));
+     sym_max_buffered_docs = ID2SYM(rb_intern("max_buffered_docs"));
+     sym_max_merge_docs = ID2SYM(rb_intern("max_merge_docs"));
+     sym_max_field_length = ID2SYM(rb_intern("max_field_length"));
+     sym_use_compound_file = ID2SYM(rb_intern("use_compound_file"));
+
+     cIndexWriter = rb_define_class_under(mIndex, "IndexWriter", rb_cObject);
+     rb_define_alloc_func(cIndexWriter, frt_data_alloc);
+
+     rb_define_const(cIndexWriter, "WRITE_LOCK_TIMEOUT", INT2FIX(1));
+     rb_define_const(cIndexWriter, "COMMIT_LOCK_TIMEOUT", INT2FIX(10));
+     rb_define_const(cIndexWriter, "WRITE_LOCK_NAME",
+                     rb_str_new2(WRITE_LOCK_NAME));
+     rb_define_const(cIndexWriter, "COMMIT_LOCK_NAME",
+                     rb_str_new2(COMMIT_LOCK_NAME));
+     rb_define_const(cIndexWriter, "DEFAULT_CHUNK_SIZE",
+                     INT2FIX(default_config.chunk_size));
+     rb_define_const(cIndexWriter, "DEFAULT_MAX_BUFFER_MEMORY",
+                     INT2FIX(default_config.max_buffer_memory));
+     rb_define_const(cIndexWriter, "DEFAULT_TERM_INDEX_INTERVAL",
+                     INT2FIX(default_config.index_interval));
+     rb_define_const(cIndexWriter, "DEFAULT_DOC_SKIP_INTERVAL",
+                     INT2FIX(default_config.skip_interval));
+     rb_define_const(cIndexWriter, "DEFAULT_MERGE_FACTOR",
+                     INT2FIX(default_config.merge_factor));
+     rb_define_const(cIndexWriter, "DEFAULT_MAX_BUFFERED_DOCS",
+                     INT2FIX(default_config.max_buffered_docs));
+     rb_define_const(cIndexWriter, "DEFAULT_MAX_MERGE_DOCS",
+                     INT2FIX(default_config.max_merge_docs));
+     rb_define_const(cIndexWriter, "DEFAULT_MAX_FIELD_LENGTH",
+                     INT2FIX(default_config.max_field_length));
+     rb_define_const(cIndexWriter, "DEFAULT_USE_COMPOUND_FILE",
+                     default_config.use_compound_file ? Qtrue : Qfalse);
+
+     rb_define_method(cIndexWriter, "initialize", frt_iw_init, -1);
+     rb_define_method(cIndexWriter, "doc_count", frt_iw_get_doc_count, 0);
+     rb_define_method(cIndexWriter, "close", frt_iw_close, 0);
+     rb_define_method(cIndexWriter, "add_document", frt_iw_add_doc, 1);
+     rb_define_method(cIndexWriter, "<<", frt_iw_add_doc, 1);
+     rb_define_method(cIndexWriter, "optimize", frt_iw_optimize, 0);
+     rb_define_method(cIndexWriter, "commit", frt_iw_commit, 0);
+     rb_define_method(cIndexWriter, "add_readers", frt_iw_add_readers, 1);
+     rb_define_method(cIndexWriter, "delete", frt_iw_delete, 2);
+     rb_define_method(cIndexWriter, "field_infos", frt_iw_field_infos, 0);
+     rb_define_method(cIndexWriter, "analyzer", frt_iw_get_analyzer, 0);
+     rb_define_method(cIndexWriter, "analyzer=", frt_iw_set_analyzer, 1);
+     rb_define_method(cIndexWriter, "version", frt_iw_version, 0);
+
+     rb_define_method(cIndexWriter, "chunk_size",
+                      frt_iw_get_chunk_size, 0);
+     rb_define_method(cIndexWriter, "chunk_size=",
+                      frt_iw_set_chunk_size, 1);
+
+     rb_define_method(cIndexWriter, "max_buffer_memory",
+                      frt_iw_get_max_buffer_memory, 0);
+     rb_define_method(cIndexWriter, "max_buffer_memory=",
+                      frt_iw_set_max_buffer_memory, 1);
+
+     rb_define_method(cIndexWriter, "term_index_interval",
+                      frt_iw_get_index_interval, 0);
+     rb_define_method(cIndexWriter, "term_index_interval=",
+                      frt_iw_set_index_interval, 1);
+
+     rb_define_method(cIndexWriter, "doc_skip_interval",
+                      frt_iw_get_skip_interval, 0);
+     rb_define_method(cIndexWriter, "doc_skip_interval=",
+                      frt_iw_set_skip_interval, 1);
+
+     rb_define_method(cIndexWriter, "merge_factor",
+                      frt_iw_get_merge_factor, 0);
+     rb_define_method(cIndexWriter, "merge_factor=",
+                      frt_iw_set_merge_factor, 1);
+
+     rb_define_method(cIndexWriter, "max_buffered_docs",
+                      frt_iw_get_max_buffered_docs, 0);
+     rb_define_method(cIndexWriter, "max_buffered_docs=",
+                      frt_iw_set_max_buffered_docs, 1);
+
+     rb_define_method(cIndexWriter, "max_merge_docs",
+                      frt_iw_get_max_merge_docs, 0);
+     rb_define_method(cIndexWriter, "max_merge_docs=",
+                      frt_iw_set_max_merge_docs, 1);
+
+     rb_define_method(cIndexWriter, "max_field_length",
+                      frt_iw_get_max_field_length, 0);
+     rb_define_method(cIndexWriter, "max_field_length=",
+                      frt_iw_set_max_field_length, 1);
+
+     rb_define_method(cIndexWriter, "use_compound_file",
+                      frt_iw_get_use_compound_file, 0);
+     rb_define_method(cIndexWriter, "use_compound_file=",
+                      frt_iw_set_use_compound_file, 1);
+
+ }
+
+ /*
+ * Document-class: Ferret::Index::LazyDoc
+ *
+ * == Summary
+ *
+ * When a document is retrieved from the index a LazyDoc is returned.
+ * Actually, LazyDoc is just a modified Hash object which lazily adds fields
+ * to itself when they are accessed. You should note that the keys method
+ * will return an empty array until you actually access one of the fields.
+ * To see what fields are available use LazyDoc#fields rather than
+ * LazyDoc#keys. To load all fields use the LazyDoc#load method.
+ *
+ * == Example
+ *
+ *   doc = index_reader[0]
+ *
+ *   doc.keys     #=> []
+ *   doc.values   #=> []
+ *   doc.fields   #=> [:title, :content]
+ *
+ *   title = doc[:title] #=> "the title"
+ *   doc.keys     #=> [:title]
+ *   doc.values   #=> ["the title"]
+ *   doc.fields   #=> [:title, :content]
+ *
+ *   doc.load
+ *   doc.keys     #=> [:title, :content]
+ *   doc.values   #=> ["the title", "the content"]
+ *   doc.fields   #=> [:title, :content]
+ */
+ void
+ Init_LazyDoc(void)
+ {
+     id_fields = rb_intern("@fields");
+
+     cLazyDoc = rb_define_class_under(mIndex, "LazyDoc", rb_cHash);
+     rb_define_method(cLazyDoc, "default", frt_lzd_default, 1);
+     rb_define_method(cLazyDoc, "load", frt_lzd_load, 0);
+     rb_define_method(cLazyDoc, "fields", frt_lzd_fields, 0);
+
+     cLazyDocData = rb_define_class_under(cLazyDoc, "LazyDocData", rb_cObject);
+     rb_define_alloc_func(cLazyDocData, frt_data_alloc);
+ }
+
+ /*
+ * Document-class: Ferret::Index::IndexReader
+ *
+ * == Summary
+ *
+ * IndexReader is used for reading data from the index. This class is usually
+ * only used directly for more advanced tasks like iterating through the
+ * terms in an index, accessing term-vectors or deleting documents by
+ * document id. It is also used internally by IndexSearcher.
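+ *
+ * == Example
+ *
+ * A minimal sketch of common IndexReader usage. The index path and the
+ * :title field below are illustrative assumptions, not fixed by the API;
+ * the methods used are those defined on IndexReader:
+ *
+ *   reader = IndexReader.new("/path/to/index")
+ *
+ *   reader.num_docs              #=> number of undeleted documents
+ *   doc = reader[0]              # fetch the first document as a LazyDoc
+ *   doc[:title]                  #=> "the title"
+ *
+ *   reader.delete(0)             # mark document 0 as deleted by doc id
+ *   reader.deleted?(0)           #=> true
+ *   reader.commit
+ *   reader.close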
+ */
+ void
+ Init_IndexReader(void)
+ {
+     cIndexReader = rb_define_class_under(mIndex, "IndexReader", rb_cObject);
+     rb_define_alloc_func(cIndexReader, frt_data_alloc);
+     rb_define_method(cIndexReader, "initialize", frt_ir_init, 1);
+     rb_define_method(cIndexReader, "set_norm", frt_ir_set_norm, 3);
+     rb_define_method(cIndexReader, "norms", frt_ir_norms, 1);
+     rb_define_method(cIndexReader, "get_norms_into", frt_ir_get_norms_into, 3);
+     rb_define_method(cIndexReader, "commit", frt_ir_commit, 0);
+     rb_define_method(cIndexReader, "close", frt_ir_close, 0);
+     rb_define_method(cIndexReader, "has_deletions?", frt_ir_has_deletions, 0);
+     rb_define_method(cIndexReader, "delete", frt_ir_delete, 1);
+     rb_define_method(cIndexReader, "deleted?", frt_ir_is_deleted, 1);
+     rb_define_method(cIndexReader, "max_doc", frt_ir_max_doc, 0);
+     rb_define_method(cIndexReader, "num_docs", frt_ir_num_docs, 0);
+     rb_define_method(cIndexReader, "undelete_all", frt_ir_undelete_all, 0);
+     rb_define_method(cIndexReader, "latest?", frt_ir_is_latest, 0);
+     rb_define_method(cIndexReader, "get_document", frt_ir_get_doc, -1);
+     rb_define_method(cIndexReader, "[]", frt_ir_get_doc, -1);
+     rb_define_method(cIndexReader, "term_vector", frt_ir_term_vector, 2);
+     rb_define_method(cIndexReader, "term_vectors", frt_ir_term_vectors, 1);
+     rb_define_method(cIndexReader, "term_docs", frt_ir_term_docs, 0);
+     rb_define_method(cIndexReader, "term_positions", frt_ir_term_positions, 0);
+     rb_define_method(cIndexReader, "term_docs_for", frt_ir_term_docs_for, 2);
+     rb_define_method(cIndexReader, "term_positions_for", frt_ir_t_pos_for, 2);
+     rb_define_method(cIndexReader, "doc_freq", frt_ir_doc_freq, 2);
+     rb_define_method(cIndexReader, "terms", frt_ir_terms, 1);
+     rb_define_method(cIndexReader, "terms_from", frt_ir_terms_from, 2);
+     rb_define_method(cIndexReader, "term_count", frt_ir_term_count, 1);
+     rb_define_method(cIndexReader, "fields", frt_ir_fields, 0);
+     rb_define_method(cIndexReader, "field_names", frt_ir_fields, 0);
+     rb_define_method(cIndexReader, "field_infos", frt_ir_field_infos, 0);
+     rb_define_method(cIndexReader, "tokenized_fields", frt_ir_tk_fields, 0);
+     rb_define_method(cIndexReader, "version", frt_ir_version, 0);
+ }
+
+ /* rdoc hack
+ extern VALUE mFerret = rb_define_module("Ferret");
+ */
+
+ /*
+ * Document-module: Ferret::Index
+ *
+ * == Summary
+ *
+ * The Index module contains all the classes used for adding to and
+ * retrieving from the index. The important classes to know about are:
+ *
+ * * FieldInfo
+ * * FieldInfos
+ * * IndexWriter
+ * * IndexReader
+ * * LazyDoc
+ *
+ * The other classes in this module are useful for more advanced uses such as
+ * building tag clouds, creating more-like-this queries, custom highlighting,
+ * etc. They are also useful for index browsers.
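+ *
+ * == Example
+ *
+ * A rough end-to-end sketch of how these classes fit together. The index
+ * path, field names and field values are illustrative only, and the :path
+ * and :create options follow common IndexWriter.new usage:
+ *
+ *   writer = IndexWriter.new(:path => "/path/to/index", :create => true)
+ *   writer << {:title => "the title", :content => "the content"}
+ *   writer.commit
+ *   writer.close
+ *
+ *   reader = IndexReader.new("/path/to/index")
+ *   doc = reader[0]              # a LazyDoc
+ *   doc[:title]                  #=> "the title"
+ *   reader.close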
+ */
+ void
+ Init_Index(void)
+ {
+     mIndex = rb_define_module_under(mFerret, "Index");
+
+     sym_boost = ID2SYM(rb_intern("boost"));
+     sym_analyzer = ID2SYM(rb_intern("analyzer"));
+     sym_close_dir = ID2SYM(rb_intern("close_dir"));
+
+     Init_TermVector();
+     Init_TermEnum();
+     Init_TermDocEnum();
+
+     Init_FieldInfos();
+
+     Init_LazyDoc();
+     Init_IndexWriter();
+     Init_IndexReader();
+ }