jk-ferret 0.11.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (228) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +90 -0
  4. data/RELEASE_CHANGES +137 -0
  5. data/RELEASE_NOTES +60 -0
  6. data/Rakefile +443 -0
  7. data/TODO +109 -0
  8. data/TUTORIAL +231 -0
  9. data/bin/ferret-browser +79 -0
  10. data/ext/BZLIB_blocksort.c +1094 -0
  11. data/ext/BZLIB_bzlib.c +1578 -0
  12. data/ext/BZLIB_compress.c +672 -0
  13. data/ext/BZLIB_crctable.c +104 -0
  14. data/ext/BZLIB_decompress.c +626 -0
  15. data/ext/BZLIB_huffman.c +205 -0
  16. data/ext/BZLIB_randtable.c +84 -0
  17. data/ext/STEMMER_api.c +66 -0
  18. data/ext/STEMMER_libstemmer.c +93 -0
  19. data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
  20. data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
  21. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  22. data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
  25. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  26. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  27. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
  29. data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
  30. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  31. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  32. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  33. data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
  34. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  35. data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
  36. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  37. data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
  38. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  39. data/ext/STEMMER_stem_UTF_8_german.c +509 -0
  40. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  41. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  42. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  43. data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
  44. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  45. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  46. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  47. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  48. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  49. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  50. data/ext/STEMMER_utilities.c +478 -0
  51. data/ext/analysis.c +1710 -0
  52. data/ext/analysis.h +266 -0
  53. data/ext/api.h +26 -0
  54. data/ext/array.c +125 -0
  55. data/ext/array.h +62 -0
  56. data/ext/bitvector.c +96 -0
  57. data/ext/bitvector.h +594 -0
  58. data/ext/bzlib.h +282 -0
  59. data/ext/bzlib_private.h +503 -0
  60. data/ext/compound_io.c +384 -0
  61. data/ext/config.h +52 -0
  62. data/ext/document.c +159 -0
  63. data/ext/document.h +63 -0
  64. data/ext/except.c +102 -0
  65. data/ext/except.h +176 -0
  66. data/ext/extconf.rb +15 -0
  67. data/ext/ferret.c +416 -0
  68. data/ext/ferret.h +94 -0
  69. data/ext/field_index.c +262 -0
  70. data/ext/field_index.h +52 -0
  71. data/ext/filter.c +157 -0
  72. data/ext/fs_store.c +493 -0
  73. data/ext/global.c +458 -0
  74. data/ext/global.h +302 -0
  75. data/ext/hash.c +524 -0
  76. data/ext/hash.h +515 -0
  77. data/ext/hashset.c +192 -0
  78. data/ext/hashset.h +215 -0
  79. data/ext/header.h +58 -0
  80. data/ext/helper.c +63 -0
  81. data/ext/helper.h +21 -0
  82. data/ext/index.c +6804 -0
  83. data/ext/index.h +935 -0
  84. data/ext/internal.h +1019 -0
  85. data/ext/lang.c +10 -0
  86. data/ext/lang.h +68 -0
  87. data/ext/libstemmer.h +79 -0
  88. data/ext/mempool.c +88 -0
  89. data/ext/mempool.h +43 -0
  90. data/ext/modules.h +190 -0
  91. data/ext/multimapper.c +351 -0
  92. data/ext/multimapper.h +60 -0
  93. data/ext/posh.c +1006 -0
  94. data/ext/posh.h +973 -0
  95. data/ext/priorityqueue.c +149 -0
  96. data/ext/priorityqueue.h +155 -0
  97. data/ext/q_boolean.c +1621 -0
  98. data/ext/q_const_score.c +162 -0
  99. data/ext/q_filtered_query.c +212 -0
  100. data/ext/q_fuzzy.c +280 -0
  101. data/ext/q_match_all.c +149 -0
  102. data/ext/q_multi_term.c +673 -0
  103. data/ext/q_parser.c +3103 -0
  104. data/ext/q_phrase.c +1206 -0
  105. data/ext/q_prefix.c +98 -0
  106. data/ext/q_range.c +682 -0
  107. data/ext/q_span.c +2390 -0
  108. data/ext/q_term.c +337 -0
  109. data/ext/q_wildcard.c +167 -0
  110. data/ext/r_analysis.c +2626 -0
  111. data/ext/r_index.c +3468 -0
  112. data/ext/r_qparser.c +635 -0
  113. data/ext/r_search.c +4490 -0
  114. data/ext/r_store.c +513 -0
  115. data/ext/r_utils.c +1131 -0
  116. data/ext/ram_store.c +476 -0
  117. data/ext/scanner.c +895 -0
  118. data/ext/scanner.h +36 -0
  119. data/ext/scanner_mb.c +6701 -0
  120. data/ext/scanner_utf8.c +4415 -0
  121. data/ext/search.c +1864 -0
  122. data/ext/search.h +953 -0
  123. data/ext/similarity.c +151 -0
  124. data/ext/similarity.h +89 -0
  125. data/ext/sort.c +786 -0
  126. data/ext/stem_ISO_8859_1_danish.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  128. data/ext/stem_ISO_8859_1_english.h +16 -0
  129. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  130. data/ext/stem_ISO_8859_1_french.h +16 -0
  131. data/ext/stem_ISO_8859_1_german.h +16 -0
  132. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  133. data/ext/stem_ISO_8859_1_italian.h +16 -0
  134. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  135. data/ext/stem_ISO_8859_1_porter.h +16 -0
  136. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  137. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  138. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  139. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  140. data/ext/stem_KOI8_R_russian.h +16 -0
  141. data/ext/stem_UTF_8_danish.h +16 -0
  142. data/ext/stem_UTF_8_dutch.h +16 -0
  143. data/ext/stem_UTF_8_english.h +16 -0
  144. data/ext/stem_UTF_8_finnish.h +16 -0
  145. data/ext/stem_UTF_8_french.h +16 -0
  146. data/ext/stem_UTF_8_german.h +16 -0
  147. data/ext/stem_UTF_8_hungarian.h +16 -0
  148. data/ext/stem_UTF_8_italian.h +16 -0
  149. data/ext/stem_UTF_8_norwegian.h +16 -0
  150. data/ext/stem_UTF_8_porter.h +16 -0
  151. data/ext/stem_UTF_8_portuguese.h +16 -0
  152. data/ext/stem_UTF_8_romanian.h +16 -0
  153. data/ext/stem_UTF_8_russian.h +16 -0
  154. data/ext/stem_UTF_8_spanish.h +16 -0
  155. data/ext/stem_UTF_8_swedish.h +16 -0
  156. data/ext/stem_UTF_8_turkish.h +16 -0
  157. data/ext/stopwords.c +410 -0
  158. data/ext/store.c +698 -0
  159. data/ext/store.h +799 -0
  160. data/ext/symbol.c +10 -0
  161. data/ext/symbol.h +23 -0
  162. data/ext/term_vectors.c +73 -0
  163. data/ext/threading.h +31 -0
  164. data/ext/win32.h +62 -0
  165. data/lib/ferret.rb +30 -0
  166. data/lib/ferret/browser.rb +246 -0
  167. data/lib/ferret/browser/s/global.js +192 -0
  168. data/lib/ferret/browser/s/style.css +148 -0
  169. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  170. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  171. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  172. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  173. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  174. data/lib/ferret/browser/views/layout.rhtml +22 -0
  175. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  176. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  177. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  178. data/lib/ferret/browser/webrick.rb +14 -0
  179. data/lib/ferret/document.rb +130 -0
  180. data/lib/ferret/field_infos.rb +44 -0
  181. data/lib/ferret/field_symbol.rb +87 -0
  182. data/lib/ferret/index.rb +973 -0
  183. data/lib/ferret/number_tools.rb +157 -0
  184. data/lib/ferret/version.rb +3 -0
  185. data/setup.rb +1555 -0
  186. data/test/long_running/largefile/tc_largefile.rb +46 -0
  187. data/test/test_all.rb +5 -0
  188. data/test/test_helper.rb +29 -0
  189. data/test/test_installed.rb +1 -0
  190. data/test/threading/number_to_spoken.rb +132 -0
  191. data/test/threading/thread_safety_index_test.rb +88 -0
  192. data/test/threading/thread_safety_read_write_test.rb +73 -0
  193. data/test/threading/thread_safety_test.rb +133 -0
  194. data/test/unit/analysis/tc_analyzer.rb +550 -0
  195. data/test/unit/analysis/tc_token_stream.rb +653 -0
  196. data/test/unit/index/tc_index.rb +867 -0
  197. data/test/unit/index/tc_index_reader.rb +699 -0
  198. data/test/unit/index/tc_index_writer.rb +447 -0
  199. data/test/unit/index/th_doc.rb +332 -0
  200. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  201. data/test/unit/search/tc_filter.rb +156 -0
  202. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  203. data/test/unit/search/tc_index_searcher.rb +67 -0
  204. data/test/unit/search/tc_multi_searcher.rb +128 -0
  205. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  206. data/test/unit/search/tc_search_and_sort.rb +179 -0
  207. data/test/unit/search/tc_sort.rb +49 -0
  208. data/test/unit/search/tc_sort_field.rb +27 -0
  209. data/test/unit/search/tc_spans.rb +190 -0
  210. data/test/unit/search/tm_searcher.rb +436 -0
  211. data/test/unit/store/tc_fs_store.rb +115 -0
  212. data/test/unit/store/tc_ram_store.rb +35 -0
  213. data/test/unit/store/tm_store.rb +34 -0
  214. data/test/unit/store/tm_store_lock.rb +68 -0
  215. data/test/unit/tc_document.rb +81 -0
  216. data/test/unit/tc_field_symbol.rb +26 -0
  217. data/test/unit/ts_analysis.rb +2 -0
  218. data/test/unit/ts_index.rb +2 -0
  219. data/test/unit/ts_largefile.rb +4 -0
  220. data/test/unit/ts_query_parser.rb +2 -0
  221. data/test/unit/ts_search.rb +2 -0
  222. data/test/unit/ts_store.rb +2 -0
  223. data/test/unit/ts_utils.rb +2 -0
  224. data/test/unit/utils/tc_bit_vector.rb +295 -0
  225. data/test/unit/utils/tc_number_tools.rb +117 -0
  226. data/test/unit/utils/tc_priority_queue.rb +106 -0
  227. data/test/utils/content_generator.rb +226 -0
  228. metadata +319 -0
data/ext/r_index.c ADDED
@@ -0,0 +1,3468 @@
1
+ #include "ferret.h"
2
+ #include "index.h"
3
+ #include <st.h>
4
+
5
+ VALUE mIndex;
6
+
7
+ VALUE cFieldInfo;
8
+ VALUE cFieldInfos;
9
+
10
+ VALUE cTVOffsets;
11
+ VALUE cTVTerm;
12
+ VALUE cTermVector;
13
+
14
+ VALUE cTermEnum;
15
+ VALUE cTermDocEnum;
16
+
17
+ VALUE cLazyDoc;
18
+ VALUE cLazyDocData;
19
+ VALUE cIndexWriter;
20
+ VALUE cIndexReader;
21
+
22
+ VALUE sym_analyzer;
23
+ static VALUE sym_close_dir;
24
+ static VALUE sym_create;
25
+ static VALUE sym_create_if_missing;
26
+
27
+ static VALUE sym_chunk_size;
28
+ static VALUE sym_max_buffer_memory;
29
+ static VALUE sym_index_interval;
30
+ static VALUE sym_skip_interval;
31
+ static VALUE sym_merge_factor;
32
+ static VALUE sym_max_buffered_docs;
33
+ static VALUE sym_max_merge_docs;
34
+ static VALUE sym_max_field_length;
35
+ static VALUE sym_use_compound_file;
36
+
37
+ static VALUE sym_boost;
38
+ static VALUE sym_field_infos;
39
+
40
+ static VALUE sym_store;
41
+ static VALUE sym_index;
42
+ static VALUE sym_term_vector;
43
+
44
+ static VALUE sym_compress;
45
+ static VALUE sym_compressed;
46
+
47
+ static VALUE sym_untokenized;
48
+ static VALUE sym_omit_norms;
49
+ static VALUE sym_untokenized_omit_norms;
50
+
51
+ static VALUE sym_with_positions;
52
+ static VALUE sym_with_offsets;
53
+ static VALUE sym_with_positions_offsets;
54
+
55
+ static Symbol fsym_content;
56
+
57
+ static ID id_term;
58
+ static ID id_fields;
59
+ static ID id_fld_num_map;
60
+ static ID id_field_num;
61
+ static ID id_boost;
62
+
63
+ extern void frb_set_term(VALUE rterm, Term *t);
64
+ extern Analyzer *frb_get_cwrapped_analyzer(VALUE ranalyzer);
65
+ extern VALUE frb_get_analyzer(Analyzer *a);
66
+
67
+ /****************************************************************************
68
+ *
69
+ * FieldInfo Methods
70
+ *
71
+ ****************************************************************************/
72
+
73
+ static void
74
+ frb_fi_free(void *p)
75
+ {
76
+ object_del(p);
77
+ fi_deref((FieldInfo *)p);
78
+ }
79
+
80
+ static void
81
+ frb_fi_get_params(VALUE roptions,
82
+ StoreValue *store,
83
+ IndexValue *index,
84
+ TermVectorValue *term_vector,
85
+ float *boost)
86
+ {
87
+ VALUE v;
88
+ Check_Type(roptions, T_HASH);
89
+ v = rb_hash_aref(roptions, sym_boost);
90
+ if (Qnil != v) {
91
+ *boost = (float)NUM2DBL(v);
92
+ } else {
93
+ *boost = 1.0f;
94
+ }
95
+ v = rb_hash_aref(roptions, sym_store);
96
+ if (Qnil != v) Check_Type(v, T_SYMBOL);
97
+ if (v == sym_no || v == sym_false || v == Qfalse) {
98
+ *store = STORE_NO;
99
+ } else if (v == sym_yes || v == sym_true || v == Qtrue) {
100
+ *store = STORE_YES;
101
+ } else if (v == sym_compress || v == sym_compressed) {
102
+ *store = STORE_COMPRESS;
103
+ } else if (v == Qnil) {
104
+ /* leave as default */
105
+ } else {
106
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for :store."
107
+ " Please choose from [:yes, :no, :compressed]",
108
+ rb_id2name(SYM2ID(v)));
109
+ }
110
+
111
+ v = rb_hash_aref(roptions, sym_index);
112
+ if (Qnil != v) Check_Type(v, T_SYMBOL);
113
+ if (v == sym_no || v == sym_false || v == Qfalse) {
114
+ *index = INDEX_NO;
115
+ } else if (v == sym_yes || v == sym_true || v == Qtrue) {
116
+ *index = INDEX_YES;
117
+ } else if (v == sym_untokenized) {
118
+ *index = INDEX_UNTOKENIZED;
119
+ } else if (v == sym_omit_norms) {
120
+ *index = INDEX_YES_OMIT_NORMS;
121
+ } else if (v == sym_untokenized_omit_norms) {
122
+ *index = INDEX_UNTOKENIZED_OMIT_NORMS;
123
+ } else if (v == Qnil) {
124
+ /* leave as default */
125
+ } else {
126
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for :index."
127
+ " Please choose from [:no, :yes, :untokenized, "
128
+ ":omit_norms, :untokenized_omit_norms]",
129
+ rb_id2name(SYM2ID(v)));
130
+ }
131
+
132
+ v = rb_hash_aref(roptions, sym_term_vector);
133
+ if (Qnil != v) Check_Type(v, T_SYMBOL);
134
+ if (v == sym_no || v == sym_false || v == Qfalse) {
135
+ *term_vector = TERM_VECTOR_NO;
136
+ } else if (v == sym_yes || v == sym_true || v == Qtrue) {
137
+ *term_vector = TERM_VECTOR_YES;
138
+ } else if (v == sym_with_positions) {
139
+ *term_vector = TERM_VECTOR_WITH_POSITIONS;
140
+ } else if (v == sym_with_offsets) {
141
+ *term_vector = TERM_VECTOR_WITH_OFFSETS;
142
+ } else if (v == sym_with_positions_offsets) {
143
+ *term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
144
+ } else if (v == Qnil) {
145
+ /* leave as default */
146
+ } else {
147
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for "
148
+ ":term_vector. Please choose from [:no, :yes, "
149
+ ":with_positions, :with_offsets, "
150
+ ":with_positions_offsets]",
151
+ rb_id2name(SYM2ID(v)));
152
+ }
153
+ }
154
+
155
+ static VALUE
156
+ frb_get_field_info(FieldInfo *fi)
157
+ {
158
+
159
+ VALUE rfi = Qnil;
160
+ if (fi) {
161
+ rfi = object_get(fi);
162
+ if (rfi == Qnil) {
163
+ rfi = Data_Wrap_Struct(cFieldInfo, NULL, &frb_fi_free, fi);
164
+ REF(fi);
165
+ object_add(fi, rfi);
166
+ }
167
+ }
168
+ return rfi;
169
+ }
170
+
171
+ /*
172
+ * call-seq:
173
+ * FieldInfo.new(name, options = {}) -> field_info
174
+ *
175
+ * Create a new FieldInfo object with the name +name+ and the properties
176
+ * specified in +options+. The available options are [:store, :index,
177
+ * :term_vector, :boost]. See the description of FieldInfo for more
178
+ * information on these properties.
179
+ */
180
+ static VALUE
181
+ frb_fi_init(int argc, VALUE *argv, VALUE self)
182
+ {
183
+ VALUE roptions, rname;
184
+ FieldInfo *fi;
185
+ StoreValue store = STORE_YES;
186
+ IndexValue index = INDEX_YES;
187
+ TermVectorValue term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
188
+ float boost = 1.0f;
189
+
190
+ rb_scan_args(argc, argv, "11", &rname, &roptions);
191
+ if (argc > 1) {
192
+ frb_fi_get_params(roptions, &store, &index, &term_vector, &boost);
193
+ }
194
+ fi = fi_new(frb_field(rname), store, index, term_vector);
195
+ fi->boost = boost;
196
+ Frt_Wrap_Struct(self, NULL, &frb_fi_free, fi);
197
+ object_add(fi, self);
198
+ return self;
199
+ }
200
+
201
+ /*
202
+ * call-seq:
203
+ * fi.name -> symbol
204
+ *
205
+ * Return the name of the field
206
+ */
207
+ static VALUE
208
+ frb_fi_name(VALUE self)
209
+ {
210
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
211
+ return ID2SYM(fi->name);
212
+ }
213
+
214
+ /*
215
+ * call-seq:
216
+ * fi.stored? -> bool
217
+ *
218
+ * Return true if the field is stored in the index.
219
+ */
220
+ static VALUE
221
+ frb_fi_is_stored(VALUE self)
222
+ {
223
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
224
+ return fi_is_stored(fi) ? Qtrue : Qfalse;
225
+ }
226
+
227
+ /*
228
+ * call-seq:
229
+ * fi.compressed? -> bool
230
+ *
231
+ * Return true if the field is stored in the index in compressed format.
232
+ */
233
+ static VALUE
234
+ frb_fi_is_compressed(VALUE self)
235
+ {
236
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
237
+ return fi_is_compressed(fi) ? Qtrue : Qfalse;
238
+ }
239
+
240
+ /*
241
+ * call-seq:
242
+ * fi.indexed? -> bool
243
+ *
244
+ * Return true if the field is indexed, ie searchable in the index.
245
+ */
246
+ static VALUE
247
+ frb_fi_is_indexed(VALUE self)
248
+ {
249
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
250
+ return fi_is_indexed(fi) ? Qtrue : Qfalse;
251
+ }
252
+
253
+ /*
254
+ * call-seq:
255
+ * fi.tokenized? -> bool
256
+ *
257
+ * Return true if the field is tokenized. Tokenizing is the process of
258
+ * breaking the field up into tokens. That is "the quick brown fox" becomes:
259
+ *
260
+ * ["the", "quick", "brown", "fox"]
261
+ *
262
+ * A field can only be tokenized if it is indexed.
263
+ */
264
+ static VALUE
265
+ frb_fi_is_tokenized(VALUE self)
266
+ {
267
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
268
+ return fi_is_tokenized(fi) ? Qtrue : Qfalse;
269
+ }
270
+
271
+ /*
272
+ * call-seq:
273
+ * fi.omit_norms? -> bool
274
+ *
275
+ * Return true if the field omits the norm file. The norm file is the file
276
+ * used to store the field boosts for an indexed field. If you do not boost
277
+ * any fields, and you can live without scoring based on field length then
278
+ * you can omit the norms file. This will give the index a slight performance
279
+ * boost and it will use less memory, especially for indexes which have a
280
+ * large number of documents.
281
+ */
282
+ static VALUE
283
+ frb_fi_omit_norms(VALUE self)
284
+ {
285
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
286
+ return fi_omit_norms(fi) ? Qtrue : Qfalse;
287
+ }
288
+
289
+ /*
290
+ * call-seq:
291
+ * fi.store_term_vector? -> bool
292
+ *
293
+ * Return true if the term-vectors are stored for this field.
294
+ */
295
+ static VALUE
296
+ frb_fi_store_term_vector(VALUE self)
297
+ {
298
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
299
+ return fi_store_term_vector(fi) ? Qtrue : Qfalse;
300
+ }
301
+
302
+ /*
303
+ * call-seq:
304
+ * fi.store_positions? -> bool
305
+ *
306
+ * Return true if positions are stored with the term-vectors for this field.
307
+ */
308
+ static VALUE
309
+ frb_fi_store_positions(VALUE self)
310
+ {
311
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
312
+ return fi_store_positions(fi) ? Qtrue : Qfalse;
313
+ }
314
+
315
+ /*
316
+ * call-seq:
317
+ * fi.store_offsets? -> bool
318
+ *
319
+ * Return true if offsets are stored with the term-vectors for this field.
320
+ */
321
+ static VALUE
322
+ frb_fi_store_offsets(VALUE self)
323
+ {
324
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
325
+ return fi_store_offsets(fi) ? Qtrue : Qfalse;
326
+ }
327
+
328
+ /*
329
+ * call-seq:
330
+ * fi.has_norms? -> bool
331
+ *
332
+ * Return true if this field has a norms file. This is the same as calling;
333
+ *
334
+ * fi.indexed? and not fi.omit_norms?
335
+ */
336
+ static VALUE
337
+ frb_fi_has_norms(VALUE self)
338
+ {
339
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
340
+ return fi_has_norms(fi) ? Qtrue : Qfalse;
341
+ }
342
+
343
+ /*
344
+ * call-seq:
345
+ * fi.boost -> boost
346
+ *
347
+ * Return the default boost for this field
348
+ */
349
+ static VALUE
350
+ frb_fi_boost(VALUE self)
351
+ {
352
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
353
+ return rb_float_new((double)fi->boost);
354
+ }
355
+
356
+ /*
357
+ * call-seq:
358
+ * fi.to_s -> string
359
+ *
360
+ * Return a string representation of the FieldInfo object.
361
+ */
362
+ static VALUE
363
+ frb_fi_to_s(VALUE self)
364
+ {
365
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
366
+ char *fi_s = fi_to_s(fi);
367
+ VALUE rfi_s = rb_str_new2(fi_s);
368
+ free(fi_s);
369
+ return rfi_s;
370
+ }
371
+
372
+ /****************************************************************************
373
+ *
374
+ * FieldInfos Methods
375
+ *
376
+ ****************************************************************************/
377
+
378
+ static void
379
+ frb_fis_free(void *p)
380
+ {
381
+ object_del(p);
382
+ fis_deref((FieldInfos *)p);
383
+ }
384
+
385
+ static void
386
+ frb_fis_mark(void *p)
387
+ {
388
+ int i;
389
+ FieldInfos *fis = (FieldInfos *)p;
390
+
391
+ for (i = 0; i < fis->size; i++) {
392
+ frb_gc_mark(fis->fields[i]);
393
+ }
394
+ }
395
+
396
+ static VALUE
397
+ frb_get_field_infos(FieldInfos *fis)
398
+ {
399
+
400
+ VALUE rfis = Qnil;
401
+ if (fis) {
402
+ rfis = object_get(fis);
403
+ if (rfis == Qnil) {
404
+ rfis = Data_Wrap_Struct(cFieldInfos, &frb_fis_mark, &frb_fis_free,
405
+ fis);
406
+ REF(fis);
407
+ object_add(fis, rfis);
408
+ }
409
+ }
410
+ return rfis;
411
+ }
412
+
413
+ /*
414
+ * call-seq:
415
+ * FieldInfos.new(defaults = {}) -> field_infos
416
+ *
417
+ * Create a new FieldInfos object which uses the default values for fields
418
+ * specified in the +default+ hash parameter. See FieldInfo for available
419
+ * property values.
420
+ */
421
+ static VALUE
422
+ frb_fis_init(int argc, VALUE *argv, VALUE self)
423
+ {
424
+ VALUE roptions;
425
+ FieldInfos *fis;
426
+ StoreValue store = STORE_YES;
427
+ IndexValue index = INDEX_YES;
428
+ TermVectorValue term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
429
+ float boost;
430
+
431
+ rb_scan_args(argc, argv, "01", &roptions);
432
+ if (argc > 0) {
433
+ frb_fi_get_params(roptions, &store, &index, &term_vector, &boost);
434
+ }
435
+ fis = fis_new(store, index, term_vector);
436
+ Frt_Wrap_Struct(self, &frb_fis_mark, &frb_fis_free, fis);
437
+ object_add(fis, self);
438
+ return self;
439
+ }
440
+
441
+ /*
442
+ * call-seq:
443
+ * fis.to_a -> array
444
+ *
445
+ * Return an array of the FieldInfo objects contained but this FieldInfos
446
+ * object.
447
+ */
448
+ static VALUE
449
+ frb_fis_to_a(VALUE self)
450
+ {
451
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
452
+ VALUE rary = rb_ary_new();
453
+ int i;
454
+
455
+ for (i = 0; i < fis->size; i++) {
456
+ rb_ary_push(rary, frb_get_field_info(fis->fields[i]));
457
+ }
458
+ return rary;
459
+ }
460
+
461
+ /*
462
+ * call-seq:
463
+ * fis[name] -> field_info
464
+ * fis[number] -> field_info
465
+ *
466
+ * Get the FieldInfo object. FieldInfo objects can be referenced by either
467
+ * their field-number of the field-name (which must be a symbol). For
468
+ * example;
469
+ *
470
+ * fi = fis[:name]
471
+ * fi = fis[2]
472
+ */
473
+ static VALUE
474
+ frb_fis_get(VALUE self, VALUE ridx)
475
+ {
476
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
477
+ VALUE rfi = Qnil;
478
+ switch (TYPE(ridx)) {
479
+ case T_FIXNUM: {
480
+ int index = FIX2INT(ridx);
481
+ if (index < 0) index += fis->size;
482
+ if (index < 0 || index >= fis->size) {
483
+ rb_raise(rb_eArgError, "index of %d is out of range (0..%d)\n",
484
+ index, fis->size - 1);
485
+ }
486
+ rfi = frb_get_field_info(fis->fields[index]);
487
+ break;
488
+ }
489
+ case T_SYMBOL:
490
+ case T_STRING:
491
+ rfi = frb_get_field_info(fis_get_field(fis, frb_field(ridx)));
492
+ break;
493
+ /*
494
+ case T_STRING:
495
+ rfi = frb_get_field_info(fis_get_field(fis, StringValuePtr(ridx)));
496
+ break;
497
+ */
498
+ default:
499
+ rb_raise(rb_eArgError, "Can't index FieldInfos with %s",
500
+ rs2s(rb_obj_as_string(ridx)));
501
+ break;
502
+ }
503
+ return rfi;
504
+ }
505
+
506
+ /*
507
+ * call-seq:
508
+ * fis << fi -> fis
509
+ * fis.add(fi) -> fis
510
+ *
511
+ * Add a FieldInfo object. Use the FieldInfos#add_field method where
512
+ * possible.
513
+ */
514
+ static VALUE
515
+ frb_fis_add(VALUE self, VALUE rfi)
516
+ {
517
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
518
+ FieldInfo *fi = (FieldInfo *)frb_rb_data_ptr(rfi);
519
+ fis_add_field(fis, fi);
520
+ REF(fi);
521
+ return self;
522
+ }
523
+
524
+ /*
525
+ * call-seq:
526
+ * fis.add_field(name, properties = {} -> fis
527
+ *
528
+ * Add a new field to the FieldInfos object. See FieldInfo for a description
529
+ * of the available properties.
530
+ */
531
+ static VALUE
532
+ frb_fis_add_field(int argc, VALUE *argv, VALUE self)
533
+ {
534
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
535
+ FieldInfo *fi;
536
+ StoreValue store = fis->store;
537
+ IndexValue index = fis->index;
538
+ TermVectorValue term_vector = fis->term_vector;
539
+ float boost = 1.0f;
540
+ VALUE rname, roptions;
541
+
542
+ rb_scan_args(argc, argv, "11", &rname, &roptions);
543
+ if (argc > 1) {
544
+ frb_fi_get_params(roptions, &store, &index, &term_vector, &boost);
545
+ }
546
+ fi = fi_new(frb_field(rname), store, index, term_vector);
547
+ fi->boost = boost;
548
+ fis_add_field(fis, fi);
549
+ return self;
550
+ }
551
+
552
+ /*
553
+ * call-seq:
554
+ * fis.each {|fi| do_something } -> fis
555
+ *
556
+ * Iterate through the FieldInfo objects.
557
+ */
558
+ static VALUE
559
+ frb_fis_each(VALUE self)
560
+ {
561
+ int i;
562
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
563
+
564
+ for (i = 0; i < fis->size; i++) {
565
+ rb_yield(frb_get_field_info(fis->fields[i]));
566
+ }
567
+ return self;
568
+ }
569
+
570
+ /*
571
+ * call-seq:
572
+ * fis.to_s -> string
573
+ *
574
+ * Return a string representation of the FieldInfos object.
575
+ */
576
+ static VALUE
577
+ frb_fis_to_s(VALUE self)
578
+ {
579
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
580
+ char *fis_s = fis_to_s(fis);
581
+ VALUE rfis_s = rb_str_new2(fis_s);
582
+ free(fis_s);
583
+ return rfis_s;
584
+ }
585
+
586
+ /*
587
+ * call-seq:
588
+ * fis.size -> int
589
+ *
590
+ * Return the number of fields in the FieldInfos object.
591
+ */
592
+ static VALUE
593
+ frb_fis_size(VALUE self)
594
+ {
595
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
596
+ return INT2FIX(fis->size);
597
+ }
598
+
599
+ /*
600
+ * call-seq:
601
+ * fis.create_index(dir) -> self
602
+ *
603
+ * Create a new index in the directory specified. The directory +dir+ can
604
+ * either be a string path representing a directory on the file-system or an
605
+ * actual directory object. Care should be taken when using this method. Any
606
+ * existing index (or other files for that matter) will be deleted from the
607
+ * directory and overwritten by the new index.
608
+ */
609
+ static VALUE
610
+ frb_fis_create_index(VALUE self, VALUE rdir)
611
+ {
612
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
613
+ Store *store = NULL;
614
+ if (TYPE(rdir) == T_DATA) {
615
+ store = DATA_PTR(rdir);
616
+ REF(store);
617
+ } else {
618
+ StringValue(rdir);
619
+ frb_create_dir(rdir);
620
+ store = open_fs_store(rs2s(rdir));
621
+ }
622
+ index_create(store, fis);
623
+ store_deref(store);
624
+ return self;
625
+ }
626
+
627
+ /*
628
+ * call-seq:
629
+ * fis.fields -> symbol array
630
+ * fis.field_names -> symbol array
631
+ *
632
+ * Return a list of the field names (as symbols) of all the fields in the
633
+ * index.
634
+ */
635
+ static VALUE
636
+ frb_fis_get_fields(VALUE self)
637
+ {
638
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
639
+ VALUE rfield_names = rb_ary_new();
640
+ int i;
641
+ for (i = 0; i < fis->size; i++) {
642
+ rb_ary_push(rfield_names, FSYM2SYM(fis->fields[i]->name));
643
+ }
644
+ return rfield_names;
645
+ }
646
+
647
+ /*
648
+ * call-seq:
649
+ * fis.tokenized_fields -> symbol array
650
+ *
651
+ * Return a list of the field names (as symbols) of all the tokenized fields
652
+ * in the index.
653
+ */
654
+ static VALUE
655
+ frb_fis_get_tk_fields(VALUE self)
656
+ {
657
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
658
+ VALUE rfield_names = rb_ary_new();
659
+ int i;
660
+ for (i = 0; i < fis->size; i++) {
661
+ if (!fi_is_tokenized(fis->fields[i])) continue;
662
+ rb_ary_push(rfield_names, FSYM2SYM(fis->fields[i]->name));
663
+ }
664
+ return rfield_names;
665
+ }
666
+
667
+ /****************************************************************************
668
+ *
669
+ * TermEnum Methods
670
+ *
671
+ ****************************************************************************/
672
+
673
+ static void
674
+ frb_te_free(void *p)
675
+ {
676
+ TermEnum *te = (TermEnum *)p;
677
+ te->close(te);
678
+ }
679
+
680
+ static VALUE
681
+ frb_te_get_set_term(VALUE self, const char *term)
682
+ {
683
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
684
+ VALUE str = term ? rb_str_new(term, te->curr_term_len) : Qnil;
685
+ rb_ivar_set(self, id_term, str);
686
+ return str;
687
+ }
688
+
689
+ static VALUE
690
+ frb_get_te(VALUE rir, TermEnum *te)
691
+ {
692
+ VALUE self = Qnil;
693
+ if (te != NULL) {
694
+ self = Data_Wrap_Struct(cTermEnum, NULL, &frb_te_free, te);
695
+ frb_te_get_set_term(self, te->curr_term);
696
+ rb_ivar_set(self, id_fld_num_map, rb_ivar_get(rir, id_fld_num_map));
697
+ }
698
+ return self;
699
+ }
700
+
701
+ /*
702
+ * call-seq:
703
+ * term_enum.next -> term_string
704
+ *
705
+ * Returns the next term in the enumeration or nil otherwise.
706
+ */
707
+ static VALUE
708
+ frb_te_next(VALUE self)
709
+ {
710
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
711
+ return frb_te_get_set_term(self, te->next(te));
712
+ }
713
+
714
+ /*
715
+ * call-seq:
716
+ * term_enum.term -> term_string
717
+ *
718
+ * Returns the current term pointed to by the enum. This method should only
719
+ * be called after a successful call to TermEnum#next.
720
+ */
721
+ static VALUE
722
+ frb_te_term(VALUE self)
723
+ {
724
+ return rb_ivar_get(self, id_term);
725
+ }
726
+
727
+ /*
728
+ * call-seq:
729
+ * term_enum.doc_freq -> integer
730
+ *
731
+ * Returns the document frequency of the current term pointed to by the enum.
732
+ * That is the number of documents that this term appears in. The method
733
+ * should only be called after a successful call to TermEnum#next.
734
+ */
735
+ static VALUE
736
+ frb_te_doc_freq(VALUE self)
737
+ {
738
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
739
+ return INT2FIX(te->curr_ti.doc_freq);
740
+ }
741
+
742
+ /*
743
+ * call-seq:
744
+ * term_enum.skip_to(target) -> term
745
+ *
746
+ * Skip to term +target+. This method can skip forwards or backwards. If you
747
+ * want to skip back to the start, pass the empty string "". That is;
748
+ *
749
+ * term_enum.skip_to("")
750
+ *
751
+ * Returns the first term greater than or equal to +target+
752
+ */
753
+ static VALUE
754
+ frb_te_skip_to(VALUE self, VALUE rterm)
755
+ {
756
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
757
+ return frb_te_get_set_term(self, te->skip_to(te, rs2s(rterm)));
758
+ }
759
+
760
+ /*
761
+ * call-seq:
762
+ * term_enum.each {|term, doc_freq| do_something() } -> term_count
763
+ *
764
+ * Iterates through all the terms in the field, yielding the term and the
765
+ * document frequency.
766
+ */
767
+ static VALUE
768
+ frb_te_each(VALUE self)
769
+ {
770
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
771
+ char *term;
772
+ int term_cnt = 0;
773
+ VALUE vals = rb_ary_new2(2);
774
+ rb_ary_store(vals, 0, Qnil);
775
+ rb_ary_store(vals, 1, Qnil);
776
+
777
+ /* each is being called so there will be no current term */
778
+ rb_ivar_set(self, id_term, Qnil);
779
+
780
+
781
+ while (NULL != (term = te->next(te))) {
782
+ term_cnt++;
783
+ RARRAY_PTR(vals)[0] = rb_str_new(term, te->curr_term_len);
784
+ RARRAY_PTR(vals)[1] = INT2FIX(te->curr_ti.doc_freq);
785
+ rb_yield(vals);
786
+ }
787
+ return INT2FIX(term_cnt);
788
+ }
789
+
790
+ /*
791
+ * call-seq:
792
+ * term_enum.set_field(field) -> self
793
+ *
794
+ * Set the field for the term_enum. The field value should be a symbol as
795
+ * usual. For example, to scan all title terms you'd do this;
796
+ *
797
+ * term_enum.set_field(:title).each do |term, doc_freq|
798
+ * do_something()
799
+ * end
800
+ */
801
+ static VALUE
802
+ frb_te_set_field(VALUE self, VALUE rfield)
803
+ {
804
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
805
+ int field_num = 0;
806
+ VALUE rfnum_map = rb_ivar_get(self, id_fld_num_map);
807
+ VALUE rfnum = rb_hash_aref(rfnum_map, rfield);
808
+ if (rfnum != Qnil) {
809
+ field_num = FIX2INT(rfnum);
810
+ rb_ivar_set(self, id_field_num, rfnum);
811
+ } else {
812
+ Check_Type(rfield, T_SYMBOL);
813
+ rb_raise(rb_eArgError, "field %s doesn't exist in the index",
814
+ frb_field(rfield));
815
+ }
816
+ te->set_field(te, field_num);
817
+
818
+ return self;
819
+ }
820
+
821
+ /*
822
+ * call-seq:
823
+ * term_enum.to_json() -> string
824
+ *
825
+ * Returns a JSON representation of the term enum. You can speed this up by
826
+ * having the method return arrays instead of objects, simply by passing an
827
+ * argument to the to_json method. For example;
828
+ *
829
+ * term_enum.to_json() #=>
830
+ * # [
831
+ * # {"term":"apple","frequency":12},
832
+ * # {"term":"banana","frequency":2},
833
+ * # {"term":"cantaloupe","frequency":12}
834
+ * # ]
835
+ *
836
+ * term_enum.to_json(:fast) #=>
837
+ * # [
838
+ * # ["apple",12],
839
+ * # ["banana",2],
840
+ * # ["cantaloupe",12]
841
+ * # ]
842
+ */
843
+ static VALUE
844
+ frb_te_to_json(int argc, VALUE *argv, VALUE self)
845
+ {
846
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
847
+ VALUE rjson;
848
+ char *json, *jp;
849
+ char *term;
850
+ int capa = 65536;
851
+ jp = json = ALLOC_N(char, capa);
852
+ *(jp++) = '[';
853
+
854
+ if (argc > 0) {
855
+ while (NULL != (term = te->next(te))) {
856
+ /* enough room for for term after converting " to '"' and frequency
857
+ * plus some extra for good measure */
858
+ *(jp++) = '[';
859
+ if (te->curr_term_len * 3 + (jp - json) + 100 > capa) {
860
+ capa <<= 1;
861
+ REALLOC_N(json, char, capa);
862
+ }
863
+ jp = json_concat_string(jp, term);
864
+ *(jp++) = ',';
865
+ sprintf(jp, "%d", te->curr_ti.doc_freq);
866
+ jp += strlen(jp);
867
+ *(jp++) = ']';
868
+ *(jp++) = ',';
869
+ }
870
+ }
871
+ else {
872
+ while (NULL != (term = te->next(te))) {
873
+ /* enough room for for term after converting " to '"' and frequency
874
+ * plus some extra for good measure */
875
+ if (te->curr_term_len * 3 + (jp - json) + 100 > capa) {
876
+ capa <<= 1;
877
+ REALLOC_N(json, char, capa);
878
+ }
879
+ *(jp++) = '{';
880
+ memcpy(jp, "\"term\":", 7);
881
+ jp += 7;
882
+ jp = json_concat_string(jp, term);
883
+ *(jp++) = ',';
884
+ memcpy(jp, "\"frequency\":", 12);
885
+ jp += 12;
886
+ sprintf(jp, "%d", te->curr_ti.doc_freq);
887
+ jp += strlen(jp);
888
+ *(jp++) = '}';
889
+ *(jp++) = ',';
890
+ }
891
+ }
892
+ if (*(jp-1) == ',') jp--;
893
+ *(jp++) = ']';
894
+ *jp = '\0';
895
+
896
+ rjson = rb_str_new2(json);
897
+ free(json);
898
+ return rjson;
899
+ }
900
+
901
+ /****************************************************************************
902
+ *
903
+ * TermDocEnum Methods
904
+ *
905
+ ****************************************************************************/
906
+
907
+ static void
908
+ frb_tde_free(void *p)
909
+ {
910
+ TermDocEnum *tde = (TermDocEnum *)p;
911
+ tde->close(tde);
912
+ }
913
+
914
+ static VALUE
915
+ frb_get_tde(VALUE rir, TermDocEnum *tde)
916
+ {
917
+ VALUE self = Data_Wrap_Struct(cTermDocEnum, NULL, &frb_tde_free, tde);
918
+ rb_ivar_set(self, id_fld_num_map, rb_ivar_get(rir, id_fld_num_map));
919
+ return self;
920
+ }
921
+
922
+ /*
923
+ * call-seq:
924
+ * term_doc_enum.seek(field, term) -> self
925
+ *
926
+ * Seek the term +term+ in the index for +field+. After you call this method
927
+ * you can call next or each to skip through the documents and positions of
928
+ * this particular term.
929
+ */
930
+ static VALUE
931
+ frb_tde_seek(VALUE self, VALUE rfield, VALUE rterm)
932
+ {
933
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
934
+ char *term;
935
+ VALUE rfnum_map = rb_ivar_get(self, id_fld_num_map);
936
+ VALUE rfnum = rb_hash_aref(rfnum_map, rfield);
937
+ int field_num = -1;
938
+ term = StringValuePtr(rterm);
939
+ if (rfnum != Qnil) {
940
+ field_num = FIX2INT(rfnum);
941
+ } else {
942
+ rb_raise(rb_eArgError, "field %s doesn't exist in the index",
943
+ frb_field(rfield));
944
+ }
945
+ tde->seek(tde, field_num, term);
946
+ return self;
947
+ }
948
+
949
+ /*
950
+ * call-seq:
951
+ * term_doc_enum.seek_term_enum(term_enum) -> self
952
+ *
953
+ * Seek the current term in +term_enum+. You could just use the standard seek
954
+ * method like this;
955
+ *
956
+ * term_doc_enum.seek(term_enum.term)
957
+ *
958
+ * However the +seek_term_enum+ method saves an index lookup so should offer
959
+ * a large performance improvement.
960
+ */
961
+ static VALUE
962
+ frb_tde_seek_te(VALUE self, VALUE rterm_enum)
963
+ {
964
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
965
+ TermEnum *te = (TermEnum *)frb_rb_data_ptr(rterm_enum);
966
+ tde->seek_te(tde, te);
967
+ return self;
968
+ }
969
+
970
+ /*
971
+ * call-seq:
972
+ * term_doc_enum.doc -> doc_id
973
+ *
974
+ * Returns the current document number pointed to by the +term_doc_enum+.
975
+ */
976
+ static VALUE
977
+ frb_tde_doc(VALUE self)
978
+ {
979
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
980
+ return INT2FIX(tde->doc_num(tde));
981
+ }
982
+
983
+ /*
984
+ * call-seq:
985
+ * term_doc_enum.doc -> doc_id
986
+ *
987
+ * Returns the frequency of the current document pointed to by the
988
+ * +term_doc_enum+.
989
+ */
990
+ static VALUE
991
+ frb_tde_freq(VALUE self)
992
+ {
993
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
994
+ return INT2FIX(tde->freq(tde));
995
+ }
996
+
997
+ /*
998
+ * call-seq:
999
+ * term_doc_enum.doc -> doc_id
1000
+ *
1001
+ * Move forward to the next document in the enumeration. Returns +true+ if
1002
+ * there is another document or +false+ otherwise.
1003
+ */
1004
+ static VALUE
1005
+ frb_tde_next(VALUE self)
1006
+ {
1007
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1008
+ return tde->next(tde) ? Qtrue : Qfalse;
1009
+ }
1010
+
1011
+ /*
1012
+ * call-seq:
1013
+ * term_doc_enum.doc -> doc_id
1014
+ *
1015
+ * Move forward to the next document in the enumeration. Returns +true+ if
1016
+ * there is another document or +false+ otherwise.
1017
+ */
1018
+ static VALUE
1019
+ frb_tde_next_position(VALUE self)
1020
+ {
1021
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1022
+ int pos;
1023
+ if (tde->next_position == NULL) {
1024
+ rb_raise(rb_eNotImpError, "to scan through positions you must create "
1025
+ "the TermDocEnum with Index#term_positions method rather "
1026
+ "than the Index#term_docs method");
1027
+ }
1028
+ pos = tde->next_position(tde);
1029
+ return pos >= 0 ? INT2FIX(pos) : Qnil;
1030
+ }
1031
+
1032
+ /*
1033
+ * call-seq:
1034
+ * term_doc_enum.each {|doc_id, freq| do_something() } -> doc_count
1035
+ *
1036
+ * Iterate through the documents and document frequencies in the
1037
+ * +term_doc_enum+.
1038
+ *
1039
+ * NOTE: this method can only be called once after each seek. If you need to
1040
+ * call +#each+ again then you should call +#seek+ again too.
1041
+ */
1042
+ static VALUE
1043
+ frb_tde_each(VALUE self)
1044
+ {
1045
+ int doc_cnt = 0;
1046
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1047
+ VALUE vals = rb_ary_new2(2);
1048
+ rb_ary_store(vals, 0, Qnil);
1049
+ rb_ary_store(vals, 1, Qnil);
1050
+
1051
+ while (tde->next(tde)) {
1052
+ doc_cnt++;
1053
+ RARRAY_PTR(vals)[0] = INT2FIX(tde->doc_num(tde));
1054
+ RARRAY_PTR(vals)[1] = INT2FIX(tde->freq(tde));
1055
+ rb_yield(vals);
1056
+
1057
+ }
1058
+ return INT2FIX(doc_cnt);
1059
+ }
1060
+
1061
+ /*
1062
+ * call-seq:
1063
+ * term_doc_enum.to_json() -> string
1064
+ *
1065
+ * Returns a json representation of the term doc enum. It will also add the
1066
+ * term positions if they are available. You can speed this up by having the
1067
+ * method return arrays instead of objects, simply by passing an argument to
1068
+ * the to_json method. For example;
1069
+ *
1070
+ * term_doc_enum.to_json() #=>
1071
+ * # [
1072
+ * # {"document":1,"frequency":12},
1073
+ * # {"document":11,"frequency":1},
1074
+ * # {"document":29,"frequency":120},
1075
+ * # {"document":30,"frequency":3}
1076
+ * # ]
1077
+ *
1078
+ * term_doc_enum.to_json(:fast) #=>
1079
+ * # [
1080
+ * # [1,12],
1081
+ * # [11,1],
1082
+ * # [29,120],
1083
+ * # [30,3]
1084
+ * # ]
1085
+ */
1086
+ static VALUE
1087
+ frb_tde_to_json(int argc, VALUE *argv, VALUE self)
1088
+ {
1089
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1090
+ VALUE rjson;
1091
+ char *json, *jp;
1092
+ int capa = 65536;
1093
+ char *format;
1094
+ char close = (argc > 0) ? ']' : '}';
1095
+ bool do_positions = tde->next_position != NULL;
1096
+ jp = json = ALLOC_N(char, capa);
1097
+ *(jp++) = '[';
1098
+
1099
+ if (do_positions) {
1100
+ if (argc == 0) {
1101
+ format = "{\"document\":%d,\"frequency\":%d,\"positions\":[";
1102
+ }
1103
+ else {
1104
+ format = "[%d,%d,[";
1105
+ }
1106
+ }
1107
+ else {
1108
+ if (argc == 0) {
1109
+ format = "{\"document\":%d,\"frequency\":%d},";
1110
+ }
1111
+ else {
1112
+ format = "[%d,%d],";
1113
+ }
1114
+ }
1115
+ while (tde->next(tde)) {
1116
+ /* 100 chars should be enough room for an extra entry */
1117
+ if ((jp - json) + 100 + tde->freq(tde) * 20 > capa) {
1118
+ capa <<= 1;
1119
+ REALLOC_N(json, char, capa);
1120
+ }
1121
+ sprintf(jp, format, tde->doc_num(tde), tde->freq(tde));
1122
+ jp += strlen(jp);
1123
+ if (do_positions) {
1124
+ int pos;
1125
+ while (0 <= (pos = tde->next_position(tde))) {
1126
+ sprintf(jp, "%d,", pos);
1127
+ jp += strlen(jp);
1128
+ }
1129
+ if (*(jp - 1) == ',') jp--;
1130
+ *(jp++) = ']';
1131
+ *(jp++) = close;
1132
+ *(jp++) = ',';
1133
+ }
1134
+ }
1135
+ if (*(jp - 1) == ',') jp--;
1136
+ *(jp++) = ']';
1137
+ *jp = '\0';
1138
+
1139
+ rjson = rb_str_new2(json);
1140
+ free(json);
1141
+ return rjson;
1142
+ }
1143
+
1144
+ /*
1145
+ * call-seq:
1146
+ * term_doc_enum.each_position {|pos| do_something } -> term_doc_enum
1147
+ *
1148
+ * Iterate through each of the positions occupied by the current term in the
1149
+ * current document. This can only be called once per document. It can be
1150
+ * used within the each method. For example, to print the terms documents and
1151
+ * positions;
1152
+ *
1153
+ * tde.each do |doc_id, freq|
1154
+ * puts "term appeared #{freq} times in document #{doc_id}:"
1155
+ * positions = []
1156
+ * tde.each_position {|pos| positions << pos}
1157
+ * puts " #{positions.join(', ')}"
1158
+ * end
1159
+ */
1160
+ static VALUE
1161
+ frb_tde_each_position(VALUE self)
1162
+ {
1163
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1164
+ int pos;
1165
+ if (tde->next_position == NULL) {
1166
+ rb_raise(rb_eNotImpError, "to scan through positions you must create "
1167
+ "the TermDocEnum with Index#term_positions method rather "
1168
+ "than the Index#term_docs method");
1169
+ }
1170
+ while (0 <= (pos = tde->next_position(tde))) {
1171
+ rb_yield(INT2FIX(pos));
1172
+ }
1173
+ return self;
1174
+ }
1175
+
1176
+ /*
1177
+ * call-seq:
1178
+ * term_doc_enum.skip_to(target) -> bool
1179
+ *
1180
+ * Skip to the required document number +target+ and return true if there is
1181
+ * a document >= +target+.
1182
+ */
1183
+ static VALUE
1184
+ frb_tde_skip_to(VALUE self, VALUE rtarget)
1185
+ {
1186
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
1187
+ return tde->skip_to(tde, FIX2INT(rtarget)) ? Qtrue : Qfalse;
1188
+ }
1189
+
1190
+ /****************************************************************************
1191
+ *
1192
+ * TVOffsets Methods
1193
+ *
1194
+ ****************************************************************************/
1195
+
1196
+ static VALUE
1197
+ frb_get_tv_offsets(Offset *offset)
1198
+ {
1199
+ return rb_struct_new(cTVOffsets,
1200
+ ULL2NUM((u64)offset->start),
1201
+ ULL2NUM((u64)offset->end),
1202
+ NULL);
1203
+ }
1204
+
1205
+ /****************************************************************************
1206
+ *
1207
+ * TVTerm Methods
1208
+ *
1209
+ ****************************************************************************/
1210
+
1211
+ static VALUE
1212
+ frb_get_tv_term(TVTerm *tv_term)
1213
+ {
1214
+ int i;
1215
+ const int freq = tv_term->freq;
1216
+ VALUE rtext;
1217
+ VALUE rpositions = Qnil;
1218
+ rtext = rb_str_new2(tv_term->text);
1219
+ if (tv_term->positions) {
1220
+ int *positions = tv_term->positions;
1221
+ rpositions = rb_ary_new2(freq);
1222
+ for (i = 0; i < freq; i++) {
1223
+ rb_ary_store(rpositions, i, INT2FIX(positions[i]));
1224
+ }
1225
+ }
1226
+ return rb_struct_new(cTVTerm, rtext, INT2FIX(freq), rpositions, NULL);
1227
+ }
1228
+
1229
+ /****************************************************************************
1230
+ *
1231
+ * TermVector Methods
1232
+ *
1233
+ ****************************************************************************/
1234
+
1235
+ static VALUE
1236
+ frb_get_tv(TermVector *tv)
1237
+ {
1238
+ int i;
1239
+ TVTerm *terms = tv->terms;
1240
+ const int t_cnt = tv->term_cnt;
1241
+ const int o_cnt = tv->offset_cnt;
1242
+ VALUE rfield, rterms;
1243
+ VALUE roffsets = Qnil;
1244
+ rfield = FSYM2SYM(tv->field);
1245
+
1246
+ rterms = rb_ary_new2(t_cnt);
1247
+ for (i = 0; i < t_cnt; i++) {
1248
+ rb_ary_store(rterms, i, frb_get_tv_term(&terms[i]));
1249
+ }
1250
+
1251
+ if (tv->offsets) {
1252
+ Offset *offsets = tv->offsets;
1253
+ roffsets = rb_ary_new2(o_cnt);
1254
+ for (i = 0; i < o_cnt; i++) {
1255
+ rb_ary_store(roffsets, i, frb_get_tv_offsets(&offsets[i]));
1256
+ }
1257
+ }
1258
+
1259
+ return rb_struct_new(cTermVector, rfield, rterms, roffsets, NULL);
1260
+ }
1261
+
1262
+ /****************************************************************************
1263
+ *
1264
+ * IndexWriter Methods
1265
+ *
1266
+ ****************************************************************************/
1267
+
1268
+ void
1269
+ frb_iw_free(void *p)
1270
+ {
1271
+ iw_close((IndexWriter *)p);
1272
+ }
1273
+
1274
+ void
1275
+ frb_iw_mark(void *p)
1276
+ {
1277
+ IndexWriter *iw = (IndexWriter *)p;
1278
+ frb_gc_mark(iw->analyzer);
1279
+ frb_gc_mark(iw->store);
1280
+ frb_gc_mark(iw->fis);
1281
+ }
1282
+
1283
+ /*
1284
+ * call-seq:
1285
+ * index_writer.close -> nil
1286
+ *
1287
+ * Close the IndexWriter. This will close and free all resources used
1288
+ * exclusively by the index writer. The garbage collector will do this
1289
+ * automatically if not called explicitly.
1290
+ */
1291
+ static VALUE
1292
+ frb_iw_close(VALUE self)
1293
+ {
1294
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1295
+ Frt_Unwrap_Struct(self);
1296
+ iw_close(iw);
1297
+ return Qnil;
1298
+ }
1299
+
1300
/* Copy the integer option :attr from roptions into config.attr when the
 * option is present and truthy. Expects rval, roptions and config to be in
 * scope at the point of use. */
#define SET_INT_ATTR(attr) \
    do {\
        rval = rb_hash_aref(roptions, sym_##attr);\
        if (RTEST(rval)) {\
            config.attr = FIX2INT(rval);\
        }\
    } while (0)
1305
+
1306
+ /*
1307
+ * call-seq:
1308
+ * IndexWriter.new(options = {}) -> index_writer
1309
+ *
1310
+ * Create a new IndexWriter. You should either pass a path or a directory to
1311
+ * this constructor. For example, here are three ways you can create an
1312
+ * IndexWriter;
1313
+ *
1314
+ * dir = RAMDirectory.new()
1315
+ * iw = IndexWriter.new(:dir => dir)
1316
+ *
1317
+ * dir = FSDirectory.new("/path/to/index")
1318
+ * iw = IndexWriter.new(:dir => dir)
1319
+ *
1320
+ * iw = IndexWriter.new(:path => "/path/to/index")
1321
+ *
1322
+ * See IndexWriter for more options.
1323
+ */
1324
+ static VALUE
1325
+ frb_iw_init(int argc, VALUE *argv, VALUE self)
1326
+ {
1327
+ VALUE roptions, rval;
1328
+ bool create = false;
1329
+ bool create_if_missing = true;
1330
+ Store *store = NULL;
1331
+ Analyzer *analyzer = NULL;
1332
+ IndexWriter *volatile iw = NULL;
1333
+ Config config = default_config;
1334
+
1335
+ rb_scan_args(argc, argv, "01", &roptions);
1336
+ if (argc > 0) {
1337
+ Check_Type(roptions, T_HASH);
1338
+
1339
+ if ((rval = rb_hash_aref(roptions, sym_dir)) != Qnil) {
1340
+ Check_Type(rval, T_DATA);
1341
+ store = DATA_PTR(rval);
1342
+ } else if ((rval = rb_hash_aref(roptions, sym_path)) != Qnil) {
1343
+ StringValue(rval);
1344
+ frb_create_dir(rval);
1345
+ store = open_fs_store(rs2s(rval));
1346
+ DEREF(store);
1347
+ }
1348
+
1349
+ /* Let ruby's garbage collector handle the closing of the store
1350
+ if (!close_dir) {
1351
+ close_dir = RTEST(rb_hash_aref(roptions, sym_close_dir));
1352
+ }
1353
+ */
1354
+ /* use_compound_file defaults to true */
1355
+ config.use_compound_file =
1356
+ (rb_hash_aref(roptions, sym_use_compound_file) == Qfalse)
1357
+ ? false
1358
+ : true;
1359
+
1360
+ if ((rval = rb_hash_aref(roptions, sym_analyzer)) != Qnil) {
1361
+ analyzer = frb_get_cwrapped_analyzer(rval);
1362
+ }
1363
+
1364
+ create = RTEST(rb_hash_aref(roptions, sym_create));
1365
+ if ((rval = rb_hash_aref(roptions, sym_create_if_missing)) != Qnil) {
1366
+ create_if_missing = RTEST(rval);
1367
+ }
1368
+ SET_INT_ATTR(chunk_size);
1369
+ SET_INT_ATTR(max_buffer_memory);
1370
+ SET_INT_ATTR(index_interval);
1371
+ SET_INT_ATTR(skip_interval);
1372
+ SET_INT_ATTR(merge_factor);
1373
+ SET_INT_ATTR(max_buffered_docs);
1374
+ SET_INT_ATTR(max_merge_docs);
1375
+ SET_INT_ATTR(max_field_length);
1376
+ }
1377
+ if (NULL == store) {
1378
+ store = open_ram_store();
1379
+ DEREF(store);
1380
+ }
1381
+ if (!create && create_if_missing && !store->exists(store, "segments")) {
1382
+ create = true;
1383
+ }
1384
+ if (create) {
1385
+ FieldInfos *fis;
1386
+ if ((rval = rb_hash_aref(roptions, sym_field_infos)) != Qnil) {
1387
+ Data_Get_Struct(rval, FieldInfos, fis);
1388
+ index_create(store, fis);
1389
+ } else {
1390
+ fis = fis_new(STORE_YES, INDEX_YES,
1391
+ TERM_VECTOR_WITH_POSITIONS_OFFSETS);
1392
+ index_create(store, fis);
1393
+ fis_deref(fis);
1394
+ }
1395
+ }
1396
+
1397
+ iw = iw_open(store, analyzer, &config);
1398
+
1399
+ Frt_Wrap_Struct(self, &frb_iw_mark, &frb_iw_free, iw);
1400
+
1401
+ if (rb_block_given_p()) {
1402
+ rb_yield(self);
1403
+ frb_iw_close(self);
1404
+ return Qnil;
1405
+ } else {
1406
+ return self;
1407
+ }
1408
+ }
1409
+
1410
+ /*
1411
+ * call-seq:
1412
+ * iw.doc_count -> number
1413
+ *
1414
+ * Returns the number of documents in the Index. Note that deletions won't be
1415
+ * taken into account until the IndexWriter has been committed.
1416
+ */
1417
+ static VALUE
1418
+ frb_iw_get_doc_count(VALUE self)
1419
+ {
1420
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1421
+ return INT2FIX(iw_doc_count(iw));
1422
+ }
1423
+
1424
+ static int
1425
+ frb_hash_to_doc_i(VALUE key, VALUE value, VALUE arg)
1426
+ {
1427
+ if (key == Qundef) {
1428
+ return ST_CONTINUE;
1429
+ } else {
1430
+ Document *doc = (Document *)arg;
1431
+ Symbol field = frb_field(key);
1432
+ VALUE val;
1433
+ DocField *df;
1434
+ if (NULL == (df = doc_get_field(doc, field))) {
1435
+ df = df_new(field);
1436
+ }
1437
+ if (rb_respond_to(value, id_boost)) {
1438
+ df->boost = (float)NUM2DBL(rb_funcall(value, id_boost, 0));
1439
+ }
1440
+ switch (TYPE(value)) {
1441
+ case T_ARRAY:
1442
+ {
1443
+ int i;
1444
+ df->destroy_data = true;
1445
+ for (i = 0; i < RARRAY_LEN(value); i++) {
1446
+ val = rb_obj_as_string(RARRAY_PTR(value)[i]);
1447
+ df_add_data_len(df, rstrdup(val), RSTRING_LEN(val));
1448
+ }
1449
+ }
1450
+ break;
1451
+ case T_STRING:
1452
+ df_add_data_len(df, rs2s(value), RSTRING_LEN(value));
1453
+ break;
1454
+ default:
1455
+ val = rb_obj_as_string(value);
1456
+ df->destroy_data = true;
1457
+ df_add_data_len(df, rstrdup(val), RSTRING_LEN(val));
1458
+ break;
1459
+ }
1460
+ doc_add_field(doc, df);
1461
+ }
1462
+ return ST_CONTINUE;
1463
+ }
1464
+
1465
+ static Document *
1466
+ frb_get_doc(VALUE rdoc)
1467
+ {
1468
+ VALUE val;
1469
+ Document *doc = doc_new();
1470
+ DocField *df;
1471
+
1472
+ if (rb_respond_to(rdoc, id_boost)) {
1473
+ doc->boost = (float)NUM2DBL(rb_funcall(rdoc, id_boost, 0));
1474
+ }
1475
+
1476
+ switch (TYPE(rdoc)) {
1477
+ case T_HASH:
1478
+ rb_hash_foreach(rdoc, frb_hash_to_doc_i, (VALUE)doc);
1479
+ break;
1480
+ case T_ARRAY:
1481
+ {
1482
+ int i;
1483
+ df = df_new(fsym_content);
1484
+ df->destroy_data = true;
1485
+ for (i = 0; i < RARRAY_LEN(rdoc); i++) {
1486
+ val = rb_obj_as_string(RARRAY_PTR(rdoc)[i]);
1487
+ df_add_data_len(df, rstrdup(val), RSTRING_LEN(val));
1488
+ }
1489
+ doc_add_field(doc, df);
1490
+ }
1491
+ break;
1492
+ case T_SYMBOL:
1493
+ /* TODO: clean up this ugly cast */
1494
+ df = df_add_data(df_new(fsym_content), (char *)rb_id2name(SYM2ID(rdoc)));
1495
+ doc_add_field(doc, df);
1496
+ break;
1497
+ case T_STRING:
1498
+ df = df_add_data_len(df_new(fsym_content), rs2s(rdoc),
1499
+ RSTRING_LEN(rdoc));
1500
+ doc_add_field(doc, df);
1501
+ break;
1502
+ default:
1503
+ val = rb_obj_as_string(rdoc);
1504
+ df = df_add_data_len(df_new(fsym_content), rstrdup(val),
1505
+ RSTRING_LEN(val));
1506
+ df->destroy_data = true;
1507
+ doc_add_field(doc, df);
1508
+ break;
1509
+ }
1510
+ return doc;
1511
+ }
1512
+
1513
+ /*
1514
+ * call-seq:
1515
+ * iw << document -> iw
1516
+ * iw.add_document(document) -> iw
1517
+ *
1518
+ * Add a document to the index. See Document. A document can also be a simple
1519
+ * hash object.
1520
+ */
1521
+ static VALUE
1522
+ frb_iw_add_doc(VALUE self, VALUE rdoc)
1523
+ {
1524
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1525
+ Document *doc = frb_get_doc(rdoc);
1526
+ iw_add_doc(iw, doc);
1527
+ doc_destroy(doc);
1528
+ return self;
1529
+ }
1530
+
1531
+ /*
1532
+ * call-seq:
1533
+ * iw.optimize -> iw
1534
+ *
1535
+ * Optimize the index for searching. This commits any unwritten data to the
1536
+ * index and optimizes the index into a single segment to improve search
1537
+ * performance. This is an expensive operation and should not be called too
1538
+ * often. The best time to call this is at the end of a long batch indexing
1539
+ * process. Note that calling the optimize method do not in any way effect
1540
+ * indexing speed (except for the time taken to complete the optimization
1541
+ * process).
1542
+ */
1543
+ static VALUE
1544
+ frb_iw_optimize(VALUE self)
1545
+ {
1546
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1547
+ iw_optimize(iw);
1548
+ return self;
1549
+ }
1550
+
1551
+ /*
1552
+ * call-seq:
1553
+ * iw.commit -> iw
1554
+ *
1555
+ * Explicitly commit any changes to the index that may be hanging around in
1556
+ * memory. You should call this method if you want to read the latest index
1557
+ * with an IndexWriter.
1558
+ */
1559
+ static VALUE
1560
+ frb_iw_commit(VALUE self)
1561
+ {
1562
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1563
+ iw_commit(iw);
1564
+ return self;
1565
+ }
1566
+
1567
+ /*
1568
+ * call-seq:
1569
+ * iw.add_readers(reader_array) -> iw
1570
+ *
1571
+ * Use this method to merge other indexes into the one being written by
1572
+ * IndexWriter. This is useful for parallel indexing. You can have several
1573
+ * indexing processes running in parallel, possibly even on different
1574
+ * machines. Then you can finish by merging all of the indexes into a single
1575
+ * index.
1576
+ */
1577
+ static VALUE
1578
+ frb_iw_add_readers(VALUE self, VALUE rreaders)
1579
+ {
1580
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1581
+ int i;
1582
+ IndexReader **irs;
1583
+ Check_Type(rreaders, T_ARRAY);
1584
+
1585
+ irs = ALLOC_N(IndexReader *, RARRAY_LEN(rreaders));
1586
+ i = RARRAY_LEN(rreaders);
1587
+ while (i-- > 0) {
1588
+ IndexReader *ir;
1589
+ Data_Get_Struct(RARRAY_PTR(rreaders)[i], IndexReader, ir);
1590
+ irs[i] = ir;
1591
+ }
1592
+ iw_add_readers(iw, irs, RARRAY_LEN(rreaders));
1593
+ free(irs);
1594
+ return self;
1595
+ }
1596
+
1597
+ /*
1598
+ * call-seq:
1599
+ * iw.delete(field, term) -> iw
1600
+ * iw.delete(field, terms) -> iw
1601
+ *
1602
+ * Delete all documents in the index with the given +term+ or +terms+ in the
1603
+ * field +field+. You should usually have a unique document id which you use
1604
+ * with this method, rather then deleting all documents with the word "the"
1605
+ * in them. There are of course exceptions to this rule. For example, you may
1606
+ * want to delete all documents with the term "viagra" when deleting spam.
1607
+ */
1608
+ static VALUE
1609
+ frb_iw_delete(VALUE self, VALUE rfield, VALUE rterm)
1610
+ {
1611
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1612
+ if (TYPE(rterm) == T_ARRAY) {
1613
+ const int term_cnt = RARRAY_LEN(rterm);
1614
+ int i;
1615
+ char **terms = ALLOC_N(char *, term_cnt);
1616
+ for (i = 0; i < term_cnt; i++) {
1617
+ terms[i] = StringValuePtr(RARRAY_PTR(rterm)[i]);
1618
+ }
1619
+ iw_delete_terms(iw, frb_field(rfield), terms, term_cnt);
1620
+ free(terms);
1621
+ } else {
1622
+ iw_delete_term(iw, frb_field(rfield), StringValuePtr(rterm));
1623
+ }
1624
+ return self;
1625
+ }
1626
+
1627
+ /*
1628
+ * call-seq:
1629
+ * index_writer.field_infos -> FieldInfos
1630
+ *
1631
+ * Get the FieldInfos object for this IndexWriter. This is useful if you need
1632
+ * to dynamically add new fields to the index with specific properties.
1633
+ */
1634
+ static VALUE
1635
+ frb_iw_field_infos(VALUE self)
1636
+ {
1637
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1638
+ return frb_get_field_infos(iw->fis);
1639
+ }
1640
+
1641
+ /*
1642
+ * call-seq:
1643
+ * index_writer.analyzer -> Analyzer
1644
+ *
1645
+ * Get the Analyzer for this IndexWriter. This is useful if you need
1646
+ * to use the same analyzer in a QueryParser.
1647
+ */
1648
+ static VALUE
1649
+ frb_iw_get_analyzer(VALUE self)
1650
+ {
1651
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1652
+ return frb_get_analyzer(iw->analyzer);
1653
+ }
1654
+
1655
+ /*
1656
+ * call-seq:
1657
+ * index_writer.analyzer -> Analyzer
1658
+ *
1659
+ * Set the Analyzer for this IndexWriter. This is useful if you need to
1660
+ * change the analyzer for a special document. It is risky though as the
1661
+ * same analyzer will be used for all documents during search.
1662
+ */
1663
+ static VALUE
1664
+ frb_iw_set_analyzer(VALUE self, VALUE ranalyzer)
1665
+ {
1666
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1667
+
1668
+ a_deref(iw->analyzer);
1669
+ iw->analyzer = frb_get_cwrapped_analyzer(ranalyzer);
1670
+ return ranalyzer;
1671
+ }
1672
+
1673
+ /*
1674
+ * call-seq:
1675
+ * index_writer.version -> int
1676
+ *
1677
+ * Returns the current version of the index writer.
1678
+ */
1679
+ static VALUE
1680
+ frb_iw_version(VALUE self)
1681
+ {
1682
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1683
+ return ULL2NUM(iw->sis->version);
1684
+ }
1685
+
1686
+ /*
1687
+ * call-seq:
1688
+ * iw.chunk_size -> number
1689
+ *
1690
+ * Return the current value of chunk_size
1691
+ */
1692
+ static VALUE
1693
+ frb_iw_get_chunk_size(VALUE self)
1694
+ {
1695
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1696
+ return INT2FIX(iw->config.chunk_size);
1697
+ }
1698
+
1699
+ /*
1700
+ * call-seq:
1701
+ * iw.chunk_size = chunk_size -> chunk_size
1702
+ *
1703
+ * Set the chunk_size parameter
1704
+ */
1705
+ static VALUE
1706
+ frb_iw_set_chunk_size(VALUE self, VALUE rval)
1707
+ {
1708
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1709
+ iw->config.chunk_size = FIX2INT(rval);
1710
+ return rval;
1711
+ }
1712
+
1713
+ /*
1714
+ * call-seq:
1715
+ * iw.max_buffer_memory -> number
1716
+ *
1717
+ * Return the current value of max_buffer_memory
1718
+ */
1719
+ static VALUE
1720
+ frb_iw_get_max_buffer_memory(VALUE self)
1721
+ {
1722
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1723
+ return INT2FIX(iw->config.max_buffer_memory);
1724
+ }
1725
+
1726
+ /*
1727
+ * call-seq:
1728
+ * iw.max_buffer_memory = max_buffer_memory -> max_buffer_memory
1729
+ *
1730
+ * Set the max_buffer_memory parameter
1731
+ */
1732
+ static VALUE
1733
+ frb_iw_set_max_buffer_memory(VALUE self, VALUE rval)
1734
+ {
1735
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1736
+ iw->config.max_buffer_memory = FIX2INT(rval);
1737
+ return rval;
1738
+ }
1739
+
1740
+ /*
1741
+ * call-seq:
1742
+ * iw.term_index_interval -> number
1743
+ *
1744
+ * Return the current value of term_index_interval
1745
+ */
1746
+ static VALUE
1747
+ frb_iw_get_index_interval(VALUE self)
1748
+ {
1749
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1750
+ return INT2FIX(iw->config.index_interval);
1751
+ }
1752
+
1753
+ /*
1754
+ * call-seq:
1755
+ * iw.term_index_interval = term_index_interval -> term_index_interval
1756
+ *
1757
+ * Set the term_index_interval parameter
1758
+ */
1759
+ static VALUE
1760
+ frb_iw_set_index_interval(VALUE self, VALUE rval)
1761
+ {
1762
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1763
+ iw->config.index_interval = FIX2INT(rval);
1764
+ return rval;
1765
+ }
1766
+
1767
+ /*
1768
+ * call-seq:
1769
+ * iw.doc_skip_interval -> number
1770
+ *
1771
+ * Return the current value of doc_skip_interval
1772
+ */
1773
+ static VALUE
1774
+ frb_iw_get_skip_interval(VALUE self)
1775
+ {
1776
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1777
+ return INT2FIX(iw->config.skip_interval);
1778
+ }
1779
+
1780
+ /*
1781
+ * call-seq:
1782
+ * iw.doc_skip_interval = doc_skip_interval -> doc_skip_interval
1783
+ *
1784
+ * Set the doc_skip_interval parameter
1785
+ */
1786
+ static VALUE
1787
+ frb_iw_set_skip_interval(VALUE self, VALUE rval)
1788
+ {
1789
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1790
+ iw->config.skip_interval = FIX2INT(rval);
1791
+ return rval;
1792
+ }
1793
+
1794
+ /*
1795
+ * call-seq:
1796
+ * iw.merge_factor -> number
1797
+ *
1798
+ * Return the current value of merge_factor
1799
+ */
1800
+ static VALUE
1801
+ frb_iw_get_merge_factor(VALUE self)
1802
+ {
1803
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1804
+ return INT2FIX(iw->config.merge_factor);
1805
+ }
1806
+
1807
+ /*
1808
+ * call-seq:
1809
+ * iw.merge_factor = merge_factor -> merge_factor
1810
+ *
1811
+ * Set the merge_factor parameter
1812
+ */
1813
+ static VALUE
1814
+ frb_iw_set_merge_factor(VALUE self, VALUE rval)
1815
+ {
1816
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1817
+ iw->config.merge_factor = FIX2INT(rval);
1818
+ return rval;
1819
+ }
1820
+
1821
+ /*
1822
+ * call-seq:
1823
+ * iw.max_buffered_docs -> number
1824
+ *
1825
+ * Return the current value of max_buffered_docs
1826
+ */
1827
+ static VALUE
1828
+ frb_iw_get_max_buffered_docs(VALUE self)
1829
+ {
1830
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1831
+ return INT2FIX(iw->config.max_buffered_docs);
1832
+ }
1833
+
1834
+ /*
1835
+ * call-seq:
1836
+ * iw.max_buffered_docs = max_buffered_docs -> max_buffered_docs
1837
+ *
1838
+ * Set the max_buffered_docs parameter
1839
+ */
1840
+ static VALUE
1841
+ frb_iw_set_max_buffered_docs(VALUE self, VALUE rval)
1842
+ {
1843
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1844
+ iw->config.max_buffered_docs = FIX2INT(rval);
1845
+ return rval;
1846
+ }
1847
+
1848
+ /*
1849
+ * call-seq:
1850
+ * iw.max_merge_docs -> number
1851
+ *
1852
+ * Return the current value of max_merge_docs
1853
+ */
1854
+ static VALUE
1855
+ frb_iw_get_max_merge_docs(VALUE self)
1856
+ {
1857
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1858
+ return INT2FIX(iw->config.max_merge_docs);
1859
+ }
1860
+
1861
+ /*
1862
+ * call-seq:
1863
+ * iw.max_merge_docs = max_merge_docs -> max_merge_docs
1864
+ *
1865
+ * Set the max_merge_docs parameter
1866
+ */
1867
+ static VALUE
1868
+ frb_iw_set_max_merge_docs(VALUE self, VALUE rval)
1869
+ {
1870
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1871
+ iw->config.max_merge_docs = FIX2INT(rval);
1872
+ return rval;
1873
+ }
1874
+
1875
+ /*
1876
+ * call-seq:
1877
+ * iw.max_field_length -> number
1878
+ *
1879
+ * Return the current value of max_field_length
1880
+ */
1881
+ static VALUE
1882
+ frb_iw_get_max_field_length(VALUE self)
1883
+ {
1884
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1885
+ return INT2FIX(iw->config.max_field_length);
1886
+ }
1887
+
1888
+ /*
1889
+ * call-seq:
1890
+ * iw.max_field_length = max_field_length -> max_field_length
1891
+ *
1892
+ * Set the max_field_length parameter
1893
+ */
1894
+ static VALUE
1895
+ frb_iw_set_max_field_length(VALUE self, VALUE rval)
1896
+ {
1897
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1898
+ iw->config.max_field_length = FIX2INT(rval);
1899
+ return rval;
1900
+ }
1901
+
1902
+ /*
1903
+ * call-seq:
1904
+ * iw.use_compound_file -> number
1905
+ *
1906
+ * Return the current value of use_compound_file
1907
+ */
1908
+ static VALUE
1909
+ frb_iw_get_use_compound_file(VALUE self)
1910
+ {
1911
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1912
+ return iw->config.use_compound_file ? Qtrue : Qfalse;
1913
+ }
1914
+
1915
+ /*
1916
+ * call-seq:
1917
+ * iw.use_compound_file = use_compound_file -> use_compound_file
1918
+ *
1919
+ * Set the use_compound_file parameter
1920
+ */
1921
+ static VALUE
1922
+ frb_iw_set_use_compound_file(VALUE self, VALUE rval)
1923
+ {
1924
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1925
+ iw->config.use_compound_file = RTEST(rval);
1926
+ return rval;
1927
+ }
1928
+
1929
+ /****************************************************************************
1930
+ *
1931
+ * LazyDoc Methods
1932
+ *
1933
+ ****************************************************************************/
1934
+
1935
+ static void
1936
+ frb_lzd_date_free(void *p)
1937
+ {
1938
+ lazy_doc_close((LazyDoc *)p);
1939
+ }
1940
+
1941
+ static VALUE
1942
+ frb_lazy_df_load(VALUE self, VALUE rkey, LazyDocField *lazy_df)
1943
+ {
1944
+ VALUE rdata = Qnil;
1945
+ if (lazy_df) {
1946
+ if (lazy_df->size == 1) {
1947
+ char *data = lazy_df_get_data(lazy_df, 0);
1948
+ rdata = rb_str_new(data, lazy_df->len);
1949
+ } else {
1950
+ int i;
1951
+ rdata = rb_ary_new2(lazy_df->size);
1952
+ for (i = 0; i < lazy_df->size; i++) {
1953
+ char *data = lazy_df_get_data(lazy_df, i);
1954
+ rb_ary_store(rdata, i, rb_str_new(data, lazy_df->data[i].length));
1955
+ }
1956
+ }
1957
+ rb_hash_aset(self, rkey, rdata);
1958
+ }
1959
+ return rdata;
1960
+ }
1961
+
1962
+ /*
1963
+ * call-seq:
1964
+ * lazy_doc.default(key) -> string
1965
+ *
1966
+ * This method is used internally to lazily load fields. You should never
1967
+ * really need to call it yourself.
1968
+ */
1969
+ static VALUE
1970
+ frb_lzd_default(VALUE self, VALUE rkey)
1971
+ {
1972
+ LazyDoc *lazy_doc = (LazyDoc *)DATA_PTR(rb_ivar_get(self, id_data));
1973
+ Symbol field = frb_field(rkey);
1974
+ VALUE rfield = FSYM2SYM(field);
1975
+
1976
+ return frb_lazy_df_load(self, rfield, lazy_doc_get(lazy_doc, field));
1977
+ }
1978
+
1979
+ /*
1980
+ * call-seq:
1981
+ * lazy_doc.fields -> array of available fields
1982
+ *
1983
+ * Returns the list of fields stored for this particular document. If you try
1984
+ * to access any of these fields in the document the field will be loaded.
1985
+ * Try to access any other field an nil will be returned.
1986
+ */
1987
+ static VALUE
1988
+ frb_lzd_fields(VALUE self)
1989
+ {
1990
+ return rb_ivar_get(self, id_fields);
1991
+ }
1992
+
1993
+ /*
1994
+ * call-seq:
1995
+ * lazy_doc.load -> lazy_doc
1996
+ *
1997
+ * Load all unloaded fields in the document from the index.
1998
+ */
1999
+ static VALUE
2000
+ frb_lzd_load(VALUE self)
2001
+ {
2002
+ LazyDoc *lazy_doc = (LazyDoc *)DATA_PTR(rb_ivar_get(self, id_data));
2003
+ int i;
2004
+ for (i = 0; i < lazy_doc->size; i++) {
2005
+ LazyDocField *lazy_df = lazy_doc->fields[i];
2006
+ frb_lazy_df_load(self, FSYM2SYM(lazy_df->name), lazy_df);
2007
+ }
2008
+ return self;
2009
+ }
2010
+
2011
+ VALUE
2012
+ frb_get_lazy_doc(LazyDoc *lazy_doc)
2013
+ {
2014
+ int i;
2015
+ VALUE rfields = rb_ary_new2(lazy_doc->size);
2016
+
2017
+ VALUE self, rdata;
2018
+ self = rb_hash_new();
2019
+ OBJSETUP(self, cLazyDoc, T_HASH);
2020
+
2021
+ rdata = Data_Wrap_Struct(cLazyDocData, NULL, &frb_lzd_date_free, lazy_doc);
2022
+ rb_ivar_set(self, id_data, rdata);
2023
+
2024
+ for (i = 0; i < lazy_doc->size; i++) {
2025
+ rb_ary_store(rfields, i, FSYM2SYM(lazy_doc->fields[i]->name));
2026
+ }
2027
+ rb_ivar_set(self, id_fields, rfields);
2028
+
2029
+ return self;
2030
+ }
2031
+
2032
+ /****************************************************************************
2033
+ *
2034
+ * IndexReader Methods
2035
+ *
2036
+ ****************************************************************************/
2037
+
2038
+ void
2039
+ frb_ir_free(void *p)
2040
+ {
2041
+ object_del(p);
2042
+ ir_close((IndexReader *)p);
2043
+ }
2044
+
2045
+ void
2046
+ frb_ir_mark(void *p)
2047
+ {
2048
+ IndexReader *ir = (IndexReader *)p;
2049
+ frb_gc_mark(ir->store);
2050
+ }
2051
+
2052
+ static VALUE frb_ir_close(VALUE self);
2053
+
2054
+ void
2055
+ frb_mr_mark(void *p)
2056
+ {
2057
+ MultiReader *mr = (MultiReader *)p;
2058
+ int i;
2059
+ for (i = 0; i < mr->r_cnt; i++) {
2060
+ frb_gc_mark(mr->sub_readers[i]);
2061
+ }
2062
+ }
2063
+
2064
+ /*
2065
+ * call-seq:
2066
+ * IndexReader.new(dir) -> index_reader
2067
+ *
2068
+ * Create a new IndexReader. You can either pass a string path to a
2069
+ * file-system directory or an actual Ferret::Store::Directory object. For
2070
+ * example;
2071
+ *
2072
+ * dir = RAMDirectory.new()
2073
+ * iw = IndexReader.new(dir)
2074
+ *
2075
+ * dir = FSDirectory.new("/path/to/index")
2076
+ * iw = IndexReader.new(dir)
2077
+ *
2078
+ * iw = IndexReader.new("/path/to/index")
2079
+ *
2080
+ * You can also create a what used to be known as a MultiReader by passing an
2081
+ * array of IndexReader objects, Ferret::Store::Directory objects or
2082
+ * file-system paths;
2083
+ *
2084
+ * iw = IndexReader.new([dir, dir2, dir3])
2085
+ *
2086
+ * iw = IndexReader.new([reader1, reader2, reader3])
2087
+ *
2088
+ * iw = IndexReader.new(["/path/to/index1", "/path/to/index2"])
2089
+ */
2090
+ static VALUE
2091
+ frb_ir_init(VALUE self, VALUE rdir)
2092
+ {
2093
+ Store *store = NULL;
2094
+ IndexReader *ir;
2095
+ int i;
2096
+ FieldInfos *fis;
2097
+ VALUE rfield_num_map = rb_hash_new();
2098
+
2099
+ if (TYPE(rdir) == T_ARRAY) {
2100
+ VALUE rdirs = rdir;
2101
+ const int reader_cnt = RARRAY_LEN(rdir);
2102
+ IndexReader **sub_readers = ALLOC_N(IndexReader *, reader_cnt);
2103
+ int i;
2104
+ for (i = 0; i < reader_cnt; i++) {
2105
+ rdir = RARRAY_PTR(rdirs)[i];
2106
+ switch (TYPE(rdir)) {
2107
+ case T_DATA:
2108
+ if (CLASS_OF(rdir) == cIndexReader) {
2109
+ Data_Get_Struct(rdir, IndexReader, sub_readers[i]);
2110
+ REF(sub_readers[i]);
2111
+ continue;
2112
+ } else if (RTEST(rb_obj_is_kind_of(rdir, cDirectory))) {
2113
+ store = DATA_PTR(rdir);
2114
+ } else {
2115
+ rb_raise(rb_eArgError, "A Multi-IndexReader can only "
2116
+ "be created from other IndexReaders, "
2117
+ "Directory objects or file-system paths. "
2118
+ "Not %s",
2119
+ rs2s(rb_obj_as_string(rdir)));
2120
+ }
2121
+ break;
2122
+ case T_STRING:
2123
+ frb_create_dir(rdir);
2124
+ store = open_fs_store(rs2s(rdir));
2125
+ DEREF(store);
2126
+ break;
2127
+ default:
2128
+ rb_raise(rb_eArgError, "%s isn't a valid directory "
2129
+ "argument. You should use either a String or "
2130
+ "a Directory",
2131
+ rs2s(rb_obj_as_string(rdir)));
2132
+ break;
2133
+ }
2134
+ sub_readers[i] = ir_open(store);
2135
+ }
2136
+ ir = mr_open(sub_readers, reader_cnt);
2137
+ Frt_Wrap_Struct(self, &frb_mr_mark, &frb_ir_free, ir);
2138
+ } else {
2139
+ switch (TYPE(rdir)) {
2140
+ case T_DATA:
2141
+ store = DATA_PTR(rdir);
2142
+ break;
2143
+ case T_STRING:
2144
+ frb_create_dir(rdir);
2145
+ store = open_fs_store(rs2s(rdir));
2146
+ DEREF(store);
2147
+ break;
2148
+ default:
2149
+ rb_raise(rb_eArgError, "%s isn't a valid directory argument. "
2150
+ "You should use either a String or a Directory",
2151
+ rs2s(rb_obj_as_string(rdir)));
2152
+ break;
2153
+ }
2154
+ ir = ir_open(store);
2155
+ Frt_Wrap_Struct(self, &frb_ir_mark, &frb_ir_free, ir);
2156
+ }
2157
+ object_add(ir, self);
2158
+
2159
+ fis = ir->fis;
2160
+ for (i = 0; i < fis->size; i++) {
2161
+ FieldInfo *fi = fis->fields[i];
2162
+ rb_hash_aset(rfield_num_map,
2163
+ FSYM2SYM(fi->name),
2164
+ INT2FIX(fi->number));
2165
+ }
2166
+ rb_ivar_set(self, id_fld_num_map, rfield_num_map);
2167
+
2168
+ return self;
2169
+ }
2170
+
2171
+ /*
2172
+ * call-seq:
2173
+ * index_reader.set_norm(doc_id, field, val)
2174
+ *
2175
+ * Expert: change the boost value for a +field+ in document at +doc_id+.
2176
+ * +val+ should be an integer in the range 0..255 which corresponds to an
2177
+ * encoded float value.
2178
+ */
2179
+ static VALUE
2180
+ frb_ir_set_norm(VALUE self, VALUE rdoc_id, VALUE rfield, VALUE rval)
2181
+ {
2182
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2183
+ ir_set_norm(ir, FIX2INT(rdoc_id), frb_field(rfield), (uchar)NUM2CHR(rval));
2184
+ return self;
2185
+ }
2186
+
2187
+ /*
2188
+ * call-seq:
2189
+ * index_reader.norms(field) -> string
2190
+ *
2191
+ * Expert: Returns a string containing the norm values for a field. The
2192
+ * string length will be equal to the number of documents in the index and it
2193
+ * could have null bytes.
2194
+ */
2195
+ static VALUE
2196
+ frb_ir_norms(VALUE self, VALUE rfield)
2197
+ {
2198
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2199
+ uchar *norms;
2200
+ norms = ir_get_norms(ir, frb_field(rfield));
2201
+ if (norms) {
2202
+ return rb_str_new((char *)norms, ir->max_doc(ir));
2203
+ } else {
2204
+ return Qnil;
2205
+ }
2206
+ }
2207
+
2208
+ /*
2209
+ * call-seq:
2210
+ * index_reader.get_norms_into(field, buffer, offset) -> buffer
2211
+ *
2212
+ * Expert: Get the norm values into a string +buffer+ starting at +offset+.
2213
+ */
2214
+ static VALUE
2215
+ frb_ir_get_norms_into(VALUE self, VALUE rfield, VALUE rnorms, VALUE roffset)
2216
+ {
2217
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2218
+ int offset;
2219
+ offset = FIX2INT(roffset);
2220
+ Check_Type(rnorms, T_STRING);
2221
+ if (RSTRING_LEN(rnorms) < offset + ir->max_doc(ir)) {
2222
+ rb_raise(rb_eArgError, "supplied a string of length:%d to "
2223
+ "IndexReader#get_norms_into but needed a string of length "
2224
+ "offset:%d + maxdoc:%d",
2225
+ RSTRING_LEN(rnorms), offset, ir->max_doc(ir));
2226
+ }
2227
+
2228
+ ir_get_norms_into(ir, frb_field(rfield),
2229
+ (uchar *)rs2s(rnorms) + offset);
2230
+ return rnorms;
2231
+ }
2232
+
2233
+ /*
2234
+ * call-seq:
2235
+ * index_reader.commit -> index_reader
2236
+ *
2237
+ * Commit any deletes made by this particular IndexReader to the index. This
2238
+ * will use open a Commit lock.
2239
+ */
2240
+ static VALUE
2241
+ frb_ir_commit(VALUE self)
2242
+ {
2243
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2244
+ ir_commit(ir);
2245
+ return self;
2246
+ }
2247
+
2248
+ /*
2249
+ * call-seq:
2250
+ * index_reader.close -> index_reader
2251
+ *
2252
+ * Close the IndexReader. This method also commits any deletions made by this
2253
+ * IndexReader. This method will be called explicitly by the garbage
2254
+ * collector but you should call it explicitly to commit any changes as soon
2255
+ * as possible and to close any locks held by the object to prevent locking
2256
+ * errors.
2257
+ */
2258
+ static VALUE
2259
+ frb_ir_close(VALUE self)
2260
+ {
2261
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2262
+ object_del(ir);
2263
+ Frt_Unwrap_Struct(self);
2264
+ ir_close(ir);
2265
+ return self;
2266
+ }
2267
+
2268
+ /*
2269
+ * call-seq:
2270
+ * index_reader.has_deletions? -> bool
2271
+ *
2272
+ * Return true if the index has any deletions, either uncommitted by this
2273
+ * IndexReader or committed by any other IndexReader.
2274
+ */
2275
+ static VALUE
2276
+ frb_ir_has_deletions(VALUE self)
2277
+ {
2278
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2279
+ return ir->has_deletions(ir) ? Qtrue : Qfalse;
2280
+ }
2281
+
2282
+ /*
2283
+ * call-seq:
2284
+ * index_reader.delete(doc_id) -> index_reader
2285
+ *
2286
+ * Delete document referenced internally by document id +doc_id+. The
2287
+ * document_id is the number used to reference documents in the index and is
2288
+ * returned by search methods.
2289
+ */
2290
+ static VALUE
2291
+ frb_ir_delete(VALUE self, VALUE rdoc_id)
2292
+ {
2293
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2294
+ ir_delete_doc(ir, FIX2INT(rdoc_id));
2295
+ return self;
2296
+ }
2297
+
2298
+ /*
2299
+ * call-seq:
2300
+ * index_reader.deleted?(doc_id) -> bool
2301
+ *
2302
+ * Returns true if the document at +doc_id+ has been deleted.
2303
+ */
2304
+ static VALUE
2305
+ frb_ir_is_deleted(VALUE self, VALUE rdoc_id)
2306
+ {
2307
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2308
+ return ir->is_deleted(ir, FIX2INT(rdoc_id)) ? Qtrue : Qfalse;
2309
+ }
2310
+
2311
+ /*
2312
+ * call-seq:
2313
+ * index_reader.max_doc -> number
2314
+ *
2315
+ * Returns 1 + the maximum document id in the index. It is the
2316
+ * document_id that will be used by the next document added to the index. If
2317
+ * there are no deletions, this number also refers to the number of documents
2318
+ * in the index.
2319
+ */
2320
+ static VALUE
2321
+ frb_ir_max_doc(VALUE self)
2322
+ {
2323
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2324
+ return INT2FIX(ir->max_doc(ir));
2325
+ }
2326
+
2327
+ /*
2328
+ * call-seq:
2329
+ * index_reader.num_docs -> number
2330
+ *
2331
+ * Returns the number of accessible (not deleted) documents in the index.
2332
+ * This will be equal to IndexReader#max_doc if there have been no documents
2333
+ * deleted from the index.
2334
+ */
2335
+ static VALUE
2336
+ frb_ir_num_docs(VALUE self)
2337
+ {
2338
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2339
+ return INT2FIX(ir->num_docs(ir));
2340
+ }
2341
+
2342
+ /*
2343
+ * call-seq:
2344
+ * index_reader.undelete_all -> index_reader
2345
+ *
2346
+ * Undelete all deleted documents in the index. This is kind of like a
2347
+ * rollback feature. Not that once an index is committed or a merge happens
2348
+ * during index, deletions will be committed and undelete_all will have no
2349
+ * effect on these documents.
2350
+ */
2351
+ static VALUE
2352
+ frb_ir_undelete_all(VALUE self)
2353
+ {
2354
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2355
+ ir_undelete_all(ir);
2356
+ return self;
2357
+ }
2358
+
2359
+ static VALUE
2360
+ frb_get_doc_range(IndexReader *ir, int pos, int len, int max)
2361
+ {
2362
+ VALUE ary;
2363
+ int i;
2364
+ max = min2(max, pos+len);
2365
+ len = max - pos;
2366
+ ary = rb_ary_new2(len);
2367
+ for (i = 0; i < len; i++) {
2368
+ rb_ary_store(ary, i, frb_get_lazy_doc(ir->get_lazy_doc(ir, i + pos)));
2369
+ }
2370
+ return ary;
2371
+ }
2372
+
2373
+ /*
2374
+ * call-seq:
2375
+ * index_reader.get_document(doc_id) -> LazyDoc
2376
+ * index_reader[doc_id] -> LazyDoc
2377
+ *
2378
+ * Retrieve a document from the index. See LazyDoc for more details on the
2379
+ * document returned. Documents are referenced internally by document ids
2380
+ * which are returned by the Searchers search methods.
2381
+ */
2382
+ static VALUE
2383
+ frb_ir_get_doc(int argc, VALUE *argv, VALUE self)
2384
+ {
2385
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2386
+ VALUE arg1, arg2;
2387
+ long pos, len;
2388
+ long max = ir->max_doc(ir);
2389
+ rb_scan_args(argc, argv, "11", &arg1, &arg2);
2390
+ if (argc == 1) {
2391
+ if (FIXNUM_P(arg1)) {
2392
+ pos = FIX2INT(arg1);
2393
+ pos = (pos < 0) ? (max + pos) : pos;
2394
+ if (pos < 0 || pos >= max) {
2395
+ rb_raise(rb_eArgError, "index %d is out of range [%d..%d] for "
2396
+ "IndexReader#[]", pos, 0, max, -1);
2397
+ }
2398
+ return frb_get_lazy_doc(ir->get_lazy_doc(ir, pos));
2399
+ }
2400
+
2401
+ /* check if idx is Range */
2402
+ /* FIXME: test this with dodgy values */
2403
+ switch (rb_range_beg_len(arg1, &pos, &len, max, 0)) {
2404
+ case Qfalse:
2405
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for "
2406
+ "IndexReader.get_document(index)",
2407
+ rb_id2name(SYM2ID(arg1)));
2408
+ case Qnil:
2409
+ return Qnil;
2410
+ default:
2411
+ return frb_get_doc_range(ir, pos, len, max);
2412
+ }
2413
+ }
2414
+ else {
2415
+ pos = FIX2LONG(arg1);
2416
+ len = FIX2LONG(arg2);
2417
+ return frb_get_doc_range(ir, pos, len, max);
2418
+ }
2419
+ }
2420
+
2421
+ /*
2422
+ * call-seq:
2423
+ * index_reader.is_latest? -> bool
2424
+ *
2425
+ * Return true if the index version referenced by this IndexReader is the
2426
+ * latest version of the index. If it isn't you should close and reopen the
2427
+ * index to search the latest documents added to the index.
2428
+ */
2429
+ static VALUE
2430
+ frb_ir_is_latest(VALUE self)
2431
+ {
2432
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2433
+ return ir_is_latest(ir) ? Qtrue : Qfalse;
2434
+ }
2435
+
2436
+ /*
2437
+ * call-seq:
2438
+ * index_reader.term_vector(doc_id, field) -> TermVector
2439
+ *
2440
+ * Return the TermVector for the field +field+ in the document at +doc_id+ in
2441
+ * the index. Return nil if no such term_vector exists. See TermVector.
2442
+ */
2443
+ static VALUE
2444
+ frb_ir_term_vector(VALUE self, VALUE rdoc_id, VALUE rfield)
2445
+ {
2446
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2447
+ TermVector *tv;
2448
+ VALUE rtv;
2449
+ tv = ir->term_vector(ir, FIX2INT(rdoc_id), frb_field(rfield));
2450
+ if (tv) {
2451
+ rtv = frb_get_tv(tv);
2452
+ tv_destroy(tv);
2453
+ return rtv;
2454
+ }
2455
+ else {
2456
+ return Qnil;
2457
+ }
2458
+ }
2459
+
2460
+ static void
2461
+ frb_add_each_tv(void *key, void *value, void *rtvs)
2462
+ {
2463
+ rb_hash_aset((VALUE)rtvs, ID2SYM((ID)key), frb_get_tv(value));
2464
+ }
2465
+
2466
+ /*
2467
+ * call-seq:
2468
+ * index_reader.term_vectors(doc_id) -> hash of TermVector
2469
+ *
2470
+ * Return the TermVectors for the document at +doc_id+ in the index. The
2471
+ * value returned is a hash of the TermVectors for each field in the document
2472
+ * and they are referenced by field names (as symbols).
2473
+ */
2474
+ static VALUE
2475
+ frb_ir_term_vectors(VALUE self, VALUE rdoc_id)
2476
+ {
2477
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2478
+ Hash *tvs = ir->term_vectors(ir, FIX2INT(rdoc_id));
2479
+ VALUE rtvs = rb_hash_new();
2480
+ h_each(tvs, &frb_add_each_tv, (void *)rtvs);
2481
+ h_destroy(tvs);
2482
+
2483
+ return rtvs;
2484
+ }
2485
+
2486
+ /*
2487
+ * call-seq:
2488
+ * index_reader.term_docs -> TermDocEnum
2489
+ *
2490
+ * Builds a TermDocEnum (term-document enumerator) for the index. You can use
2491
+ * this object to iterate through the documents in which certain terms occur.
2492
+ * See TermDocEnum for more info.
2493
+ */
2494
+ static VALUE
2495
+ frb_ir_term_docs(VALUE self)
2496
+ {
2497
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2498
+ return frb_get_tde(self, ir->term_docs(ir));
2499
+ }
2500
+
2501
+ /*
2502
+ * call-seq:
2503
+ * index_reader.term_docs_for(field, term) -> TermDocEnum
2504
+ *
2505
+ * Builds a TermDocEnum to iterate through the documents that contain the
2506
+ * term +term+ in the field +field+. See TermDocEnum for more info.
2507
+ */
2508
+ static VALUE
2509
+ frb_ir_term_docs_for(VALUE self, VALUE rfield, VALUE rterm)
2510
+ {
2511
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2512
+ return frb_get_tde(self, ir_term_docs_for(ir,
2513
+ frb_field(rfield),
2514
+ StringValuePtr(rterm)));
2515
+ }
2516
+
2517
+ /*
2518
+ * call-seq:
2519
+ * index_reader.term_positions -> TermDocEnum
2520
+ *
2521
+ * Same as IndexReader#term_docs except the TermDocEnum will also allow you
2522
+ * to scan through the positions at which a term occurs. See TermDocEnum for
2523
+ * more info.
2524
+ */
2525
+ static VALUE
2526
+ frb_ir_term_positions(VALUE self)
2527
+ {
2528
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2529
+ return frb_get_tde(self, ir->term_positions(ir));
2530
+ }
2531
+
2532
+ /*
2533
+ * call-seq:
2534
+ * index_reader.term_positions_for(field, term) -> TermDocEnum
2535
+ *
2536
+ * Same as IndexReader#term_docs_for(field, term) except the TermDocEnum will
2537
+ * also allow you to scan through the positions at which a term occurs. See
2538
+ * TermDocEnum for more info.
2539
+ */
2540
+ static VALUE
2541
+ frb_ir_t_pos_for(VALUE self, VALUE rfield, VALUE rterm)
2542
+ {
2543
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2544
+ return frb_get_tde(self, ir_term_positions_for(ir,
2545
+ frb_field(rfield),
2546
+ StringValuePtr(rterm)));
2547
+ }
2548
+
2549
+ /*
2550
+ * call-seq:
2551
+ * index_reader.doc_freq(field, term) -> integer
2552
+ *
2553
+ * Return the number of documents in which the term +term+ appears in the
2554
+ * field +field+.
2555
+ */
2556
+ static VALUE
2557
+ frb_ir_doc_freq(VALUE self, VALUE rfield, VALUE rterm)
2558
+ {
2559
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2560
+ return INT2FIX(ir_doc_freq(ir,
2561
+ frb_field(rfield),
2562
+ StringValuePtr(rterm)));
2563
+ }
2564
+
2565
+ /*
2566
+ * call-seq:
2567
+ * index_reader.terms(field) -> TermEnum
2568
+ *
2569
+ * Returns a term enumerator which allows you to iterate through all the
2570
+ * terms in the field +field+ in the index.
2571
+ */
2572
+ static VALUE
2573
+ frb_ir_terms(VALUE self, VALUE rfield)
2574
+ {
2575
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2576
+ return frb_get_te(self, ir_terms(ir, frb_field(rfield)));
2577
+ }
2578
+
2579
+ /*
2580
+ * call-seq:
2581
+ * index_reader.terms_from(field, term) -> TermEnum
2582
+ *
2583
+ * Same as IndexReader#terms(fields) except that it starts the enumerator off
2584
+ * at term +term+.
2585
+ */
2586
+ static VALUE
2587
+ frb_ir_terms_from(VALUE self, VALUE rfield, VALUE rterm)
2588
+ {
2589
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2590
+ return frb_get_te(self, ir_terms_from(ir,
2591
+ frb_field(rfield),
2592
+ StringValuePtr(rterm)));
2593
+ }
2594
+
2595
+ /*
2596
+ * call-seq:
2597
+ * index_reader.term_count(field) -> int
2598
+ *
2599
+ * Same return a count of the number of terms in the field
2600
+ */
2601
+ static VALUE
2602
+ frb_ir_term_count(VALUE self, VALUE rfield)
2603
+ {
2604
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2605
+ TermEnum *te = ir_terms(ir, frb_field(rfield));
2606
+ int count = 0;
2607
+ while (te->next(te)) {
2608
+ count++;
2609
+ }
2610
+ te->close(te);
2611
+ return INT2FIX(count);
2612
+ }
2613
+
2614
+ /*
2615
+ * call-seq:
2616
+ * index_reader.fields -> array of field-names
2617
+ *
2618
+ * Returns an array of field names in the index. This can be used to pass to
2619
+ * the QueryParser so that the QueryParser knows how to expand the "*"
2620
+ * wild-card to all fields in the index. A list of field names can also be
2621
+ * gathered from the FieldInfos object.
2622
+ */
2623
+ static VALUE
2624
+ frb_ir_fields(VALUE self)
2625
+ {
2626
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2627
+ FieldInfos *fis = ir->fis;
2628
+ VALUE rfield_names = rb_ary_new();
2629
+ int i;
2630
+ for (i = 0; i < fis->size; i++) {
2631
+ rb_ary_push(rfield_names, FSYM2SYM(fis->fields[i]->name));
2632
+ }
2633
+ return rfield_names;
2634
+ }
2635
+
2636
+ /*
2637
+ * call-seq:
2638
+ * index_reader.field_infos -> FieldInfos
2639
+ *
2640
+ * Get the FieldInfos object for this IndexReader.
2641
+ */
2642
+ static VALUE
2643
+ frb_ir_field_infos(VALUE self)
2644
+ {
2645
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2646
+ return frb_get_field_infos(ir->fis);
2647
+ }
2648
+
2649
+ /*
2650
+ * call-seq:
2651
+ * index_reader.tokenized_fields -> array of field-names
2652
+ *
2653
+ * Returns an array of field names of all of the tokenized fields in the
2654
+ * index. This can be used to pass to the QueryParser so that the QueryParser
2655
+ * knows how to expand the "*" wild-card to all fields in the index. A list
2656
+ * of field names can also be gathered from the FieldInfos object.
2657
+ */
2658
+ static VALUE
2659
+ frb_ir_tk_fields(VALUE self)
2660
+ {
2661
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2662
+ FieldInfos *fis = ir->fis;
2663
+ VALUE rfield_names = rb_ary_new();
2664
+ int i;
2665
+ for (i = 0; i < fis->size; i++) {
2666
+ if (!fi_is_tokenized(fis->fields[i])) continue;
2667
+ rb_ary_push(rfield_names, FSYM2SYM(fis->fields[i]->name));
2668
+ }
2669
+ return rfield_names;
2670
+ }
2671
+
2672
+ /*
2673
+ * call-seq:
2674
+ * index_reader.version -> int
2675
+ *
2676
+ * Returns the current version of the index reader.
2677
+ */
2678
+ static VALUE
2679
+ frb_ir_version(VALUE self)
2680
+ {
2681
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2682
+ return ULL2NUM(ir->sis->version);
2683
+ }
2684
+
2685
+ /****************************************************************************
2686
+ *
2687
+ * Init Functions
2688
+ *
2689
+ ****************************************************************************/
2690
+
2691
+
2692
+ /*
2693
+ * Document-class: Ferret::Index::FieldInfo
2694
+ *
2695
+ * == Summary
2696
+ *
2697
+ * The FieldInfo class is the field descriptor for the index. It specifies
2698
+ * whether a field is compressed or not or whether it should be indexed and
2699
+ * tokenized. Every field has a name which must be a symbol. There are three
2700
+ * properties that you can set, +:store+, +:index+ and +:term_vector+. You
2701
+ * can also set the default +:boost+ for a field as well.
2702
+ *
2703
+ * == Properties
2704
+ *
2705
+ * === :store
2706
+ *
2707
+ * The +:store+ property allows you to specify how a field is stored. You can
2708
+ * leave a field unstored (+:no+), store it in its original format (+:yes+)
2709
+ * or store it in compressed format (+:compressed+). By default the document
2710
+ * is stored in its original format. If the field is large and it is stored
2711
+ * elsewhere where it is easily accessible you might want to leave it
2712
+ * unstored. This will keep the index size a lot smaller and make the
2713
+ * indexing process a lot faster. For example, you should probably leave the
2714
+ * +:content+ field unstored when indexing all the documents in your
2715
+ * file-system.
2716
+ *
2717
+ * === :index
2718
+ *
2719
+ * The +:index+ property allows you to specify how a field is indexed. A
2720
+ * field must be indexed to be searchable. However, a field doesn't need to
2721
+ * be indexed to be stored in the Ferret index. You may want to use the index
2722
+ * as a simple database and store things like images or MP3s in the index. By
2723
+ * default each field is indexed and tokenized (split into tokens) (+:yes+).
2724
+ * If you don't want to index the field use +:no+. If you want the field
2725
+ * indexed but not tokenized, use +:untokenized+. Do this for the fields you
2726
+ * wish to sort by. There are two other values for +:index+; +:omit_norms+
2727
+ * and +:untokenized_omit_norms+. These values correspond to +:yes+ and
2728
+ * +:untokenized+ respectively and are useful if you are not boosting any
2729
+ * fields and you'd like to speed up the index. The norms file is the file
2730
+ * which contains the boost values for each document for a particular field.
2731
+ *
2732
+ * === :term_vector
2733
+ *
2734
+ * See TermVector for a description of term-vectors. You can specify whether
2735
+ * or not you would like to store term-vectors. The available options are
2736
+ * +:no+, +:yes+, +:with_positions+, +:with_offsets+ and
2737
+ * +:with_positions_offsets+. Note that you need to store the positions to
2738
+ * associate offsets with individual terms in the term_vector.
2739
+ *
2740
+ * == Property Table
2741
+ *
2742
+ * Property Value Description
2743
+ * ------------------------------------------------------------------------
2744
+ * :store | :no | Don't store field
2745
+ * | |
2746
+ * | :yes (default) | Store field in its original
2747
+ * | | format. Use this value if you
2748
+ * | | want to highlight matches.
2749
+ * | | or print match excerpts a la
2750
+ * | | Google search.
2751
+ * | |
2752
+ * | :compressed | Store field in compressed
2753
+ * | | format.
2754
+ * -------------|-------------------------|------------------------------
2755
+ * :index | :no | Do not make this field
2756
+ * | | searchable.
2757
+ * | |
2758
+ * | :yes (default) | Make this field searchable and
2759
+ * | | tokenize its contents.
2760
+ * | |
2761
+ * | :untokenized | Make this field searchable but
2762
+ * | | do not tokenize its contents.
2763
+ * | | use this value for fields you
2764
+ * | | wish to sort by.
2765
+ * | |
2766
+ * | :omit_norms | Same as :yes except omit the
2767
+ * | | norms file. The norms file can
2768
+ * | | be omitted if you don't boost
2769
+ * | | any fields and you don't need
2770
+ * | | scoring based on field length.
2771
+ * | |
2772
+ * | :untokenized_omit_norms | Same as :untokenized except omit
2773
+ * | | the norms file. Norms files can
2774
+ * | | be omitted if you don't boost
2775
+ * | | any fields and you don't need
2776
+ * | | scoring based on field length.
2777
+ * | |
2778
+ * -------------|-------------------------|------------------------------
2779
+ * :term_vector | :no | Don't store term-vectors
2780
+ * | |
2781
+ * | :yes | Store term-vectors without
2782
+ * | | storing positions or offsets.
2783
+ * | |
2784
+ * | :with_positions | Store term-vectors with
2785
+ * | | positions.
2786
+ * | |
2787
+ * | :with_offsets | Store term-vectors with
2788
+ * | | offsets.
2789
+ * | |
2790
+ * | :with_positions_offsets | Store term-vectors with
2791
+ * | (default) | positions and offsets.
2792
+ * -------------|-------------------------|------------------------------
2793
+ * :boost | Float | The boost property is used to
2794
+ * | | set the default boost for a
2795
+ * | | field. This boost value will
2796
+ * | | be used for all instances of the
2797
+ * | | field in the index unless
2798
+ * | | otherwise specified when you
2799
+ * | | create the field. All values
2800
+ * | | should be positive.
2801
+ * | |
2802
+ *
2803
+ * == Examples
2804
+ *
2805
+ * fi = FieldInfo.new(:title, :index => :untokenized, :term_vector => :no,
2806
+ * :boost => 10.0)
2807
+ *
2808
+ * fi = FieldInfo.new(:content)
2809
+ *
2810
+ * fi = FieldInfo.new(:created_on, :index => :untokenized_omit_norms,
2811
+ * :term_vector => :no)
2812
+ *
2813
+ * fi = FieldInfo.new(:image, :store => :compressed, :index => :no,
2814
+ * :term_vector => :no)
2815
+ */
2816
+ static void
2817
+ Init_FieldInfo(void)
2818
+ { /* Caches the option symbols in variables declared outside this function, then defines the FieldInfo class under mIndex and registers its methods. */
2819
+ sym_store = ID2SYM(rb_intern("store")); /* :store, :index and :term_vector appear as option keys in the FieldInfo.new examples */
2820
+ sym_index = ID2SYM(rb_intern("index"));
2821
+ sym_term_vector = ID2SYM(rb_intern("term_vector"));
2822
+
2823
+ sym_compress = ID2SYM(rb_intern("compress")); /* both :compress and :compressed are interned */
2824
+ sym_compressed = ID2SYM(rb_intern("compressed"));
2825
+
2826
+ sym_untokenized = ID2SYM(rb_intern("untokenized"));
2827
+ sym_omit_norms = ID2SYM(rb_intern("omit_norms"));
2828
+ sym_untokenized_omit_norms = ID2SYM(rb_intern("untokenized_omit_norms"));
2829
+
2830
+ sym_with_positions = ID2SYM(rb_intern("with_positions"));
2831
+ sym_with_offsets = ID2SYM(rb_intern("with_offsets"));
2832
+ sym_with_positions_offsets = ID2SYM(rb_intern("with_positions_offsets"));
2833
+
2834
+ cFieldInfo = rb_define_class_under(mIndex, "FieldInfo", rb_cObject);
2835
+ rb_define_alloc_func(cFieldInfo, frb_data_alloc);
2836
+
2837
+ rb_define_method(cFieldInfo, "initialize", frb_fi_init, -1); /* arity -1: the C function receives argc/argv */
2838
+ rb_define_method(cFieldInfo, "name", frb_fi_name, 0);
2839
+ rb_define_method(cFieldInfo, "stored?", frb_fi_is_stored, 0);
2840
+ rb_define_method(cFieldInfo, "compressed?", frb_fi_is_compressed, 0);
2841
+ rb_define_method(cFieldInfo, "indexed?", frb_fi_is_indexed, 0);
2842
+ rb_define_method(cFieldInfo, "tokenized?", frb_fi_is_tokenized, 0);
2843
+ rb_define_method(cFieldInfo, "omit_norms?", frb_fi_omit_norms, 0);
2844
+ rb_define_method(cFieldInfo, "store_term_vector?",
2845
+ frb_fi_store_term_vector, 0);
2846
+ rb_define_method(cFieldInfo, "store_positions?",
2847
+ frb_fi_store_positions, 0);
2848
+ rb_define_method(cFieldInfo, "store_offsets?",
2849
+ frb_fi_store_offsets, 0);
2850
+ rb_define_method(cFieldInfo, "has_norms?", frb_fi_has_norms, 0);
2851
+ rb_define_method(cFieldInfo, "boost", frb_fi_boost, 0);
2852
+ rb_define_method(cFieldInfo, "to_s", frb_fi_to_s, 0);
2853
+ }
2854
+
2855
+ /*
2856
+ * Document-class: Ferret::Index::FieldInfos
2857
+ *
2858
+ * == Summary
2859
+ *
2860
+ * The FieldInfos class holds all the field descriptors for an index. It is
2861
+ * this class that is used to create a new index using the
2862
+ * FieldInfos#create_index method. If you are happy with the default
2863
+ * properties for FieldInfo then you don't need to worry about this class.
2864
+ * IndexWriter can create the index for you. Otherwise you should set up the
2865
+ * index like in the example;
2866
+ *
2867
+ * == Example
2868
+ *
2869
+ * field_infos = FieldInfos.new(:term_vector => :no)
2870
+ *
2871
+ * field_infos.add_field(:title, :index => :untokenized, :term_vector => :no,
2872
+ * :boost => 10.0)
2873
+ *
2874
+ * field_infos.add_field(:content)
2875
+ *
2876
+ * field_infos.add_field(:created_on, :index => :untokenized_omit_norms,
2877
+ * :term_vector => :no)
2878
+ *
2879
+ * field_infos.add_field(:image, :store => :compressed, :index => :no,
2880
+ * :term_vector => :no)
2881
+ *
2882
+ * field_infos.create_index("/path/to/index")
2883
+ *
2884
+ * == Default Properties
2885
+ *
2886
+ * See FieldInfo for the available field property values.
2887
+ *
2888
+ * When you create the FieldInfos object you specify the default properties
2889
+ * for the fields. Often you'll specify all of the fields in the index before
2890
+ * you create the index so the default values won't come into play. However,
2891
+ * it is possible to continue to dynamically add fields as indexing goes
2892
+ * along. If you add a document to the index which has fields that the index
2893
+ * doesn't know about then the default properties are used for the new field.
2894
+ */
2895
+ static void
2896
+ Init_FieldInfos(void)
2897
+ { /* Registers FieldInfo first, then defines the FieldInfos class under mIndex and registers its methods. */
2898
+ Init_FieldInfo();
2899
+
2900
+ cFieldInfos = rb_define_class_under(mIndex, "FieldInfos", rb_cObject);
2901
+ rb_define_alloc_func(cFieldInfos, frb_data_alloc);
2902
+
2903
+ rb_define_method(cFieldInfos, "initialize", frb_fis_init, -1); /* arity -1: the C function receives argc/argv */
2904
+ rb_define_method(cFieldInfos, "to_a", frb_fis_to_a, 0);
2905
+ rb_define_method(cFieldInfos, "[]", frb_fis_get, 1);
2906
+ rb_define_method(cFieldInfos, "add", frb_fis_add, 1);
2907
+ rb_define_method(cFieldInfos, "<<", frb_fis_add, 1); /* bound to the same C function as "add" */
2908
+ rb_define_method(cFieldInfos, "add_field", frb_fis_add_field, -1);
2909
+ rb_define_method(cFieldInfos, "each", frb_fis_each, 0);
2910
+ rb_define_method(cFieldInfos, "to_s", frb_fis_to_s, 0);
2911
+ rb_define_method(cFieldInfos, "size", frb_fis_size, 0);
2912
+ rb_define_method(cFieldInfos, "create_index",
2913
+ frb_fis_create_index, 1);
2914
+ rb_define_method(cFieldInfos, "fields", frb_fis_get_fields, 0);
2915
+ rb_define_method(cFieldInfos, "tokenized_fields", frb_fis_get_tk_fields, 0);
2916
+ }
2917
+
2918
+ /*
2919
+ * Document-class: Ferret::Index::TermEnum
2920
+ *
2921
+ * == Summary
2922
+ *
2923
+ * The TermEnum object is used to iterate through the terms in a field. To
2924
+ * get a TermEnum you need to use the IndexReader#terms(field) method.
2925
+ *
2926
+ * == Example
2927
+ *
2928
+ * te = index_reader.terms(:content)
2929
+ *
2930
+ * te.each {|term, doc_freq| puts "#{term} occurred #{doc_freq} times" }
2931
+ *
2932
+ * # or you could do it like this;
2933
+ * te = index_reader.terms(:content)
2934
+ *
2935
+ * while te.next?
2936
+ * puts "#{te.term} occurred in #{te.doc_freq} documents in the index"
2937
+ * end
2938
+ */
2939
+ static void
2940
+ Init_TermEnum(void)
2941
+ { /* Defines the TermEnum class under mIndex and registers its methods. */
2942
+ id_term = rb_intern("@term"); /* ID of the "@term" instance-variable name; its use is outside this function */
2943
+
2944
+ cTermEnum = rb_define_class_under(mIndex, "TermEnum", rb_cObject);
2945
+ rb_define_alloc_func(cTermEnum, frb_data_alloc);
2946
+
2947
+ rb_define_method(cTermEnum, "next?", frb_te_next, 0);
2948
+ rb_define_method(cTermEnum, "term", frb_te_term, 0);
2949
+ rb_define_method(cTermEnum, "doc_freq", frb_te_doc_freq, 0);
2950
+ rb_define_method(cTermEnum, "skip_to", frb_te_skip_to, 1);
2951
+ rb_define_method(cTermEnum, "each", frb_te_each, 0);
2952
+ rb_define_method(cTermEnum, "field=", frb_te_set_field, 1);
2953
+ rb_define_method(cTermEnum, "set_field",frb_te_set_field, 1); /* bound to the same C function as "field=" */
2954
+ rb_define_method(cTermEnum, "to_json", frb_te_to_json, -1); /* arity -1: the C function receives argc/argv */
2955
+ }
2956
+
2957
+ /*
2958
+ * Document-class: Ferret::Index::TermDocEnum
2959
+ *
2960
+ * == Summary
2961
+ *
2962
+ * Use a TermDocEnum to iterate through the documents that contain a
2963
+ * particular term. You can also iterate through the positions at which the term
2964
+ * occurs in a document.
2965
+ *
2966
+ *
2967
+ * == Example
2968
+ *
2969
+ * tde = index_reader.term_docs_for(:content, "fox")
2970
+ *
2971
+ * tde.each do |doc_id, freq|
2972
+ * puts "fox appeared #{freq} times in document #{doc_id}:"
2973
+ * positions = []
2974
+ * tde.each_position {|pos| positions << pos}
2975
+ * puts " #{positions.join(', ')}"
2976
+ * end
2977
+ *
2978
+ * # or you can do it like this;
2979
+ * tde.seek(:title, "red")
2980
+ * while tde.next?
2981
+ * puts "red appeared #{tde.freq} times in document #{tde.doc}:"
2982
+ * positions = []
2983
+ * while pos = tde.next_position
2984
+ * positions << pos
2985
+ * end
2986
+ * puts " #{positions.join(', ')}"
2987
+ * end
2988
+ */
2989
+ static void
2990
+ Init_TermDocEnum(void)
2991
+ { /* Defines the TermDocEnum class under mIndex and registers its methods. */
2992
+ id_fld_num_map = rb_intern("@field_num_map"); /* IDs of instance-variable names; their use is outside this function */
2993
+ id_field_num = rb_intern("@field_num");
2994
+
2995
+ cTermDocEnum = rb_define_class_under(mIndex, "TermDocEnum", rb_cObject);
2996
+ rb_define_alloc_func(cTermDocEnum, frb_data_alloc);
2997
+ rb_define_method(cTermDocEnum, "seek", frb_tde_seek, 2);
2998
+ rb_define_method(cTermDocEnum, "seek_term_enum", frb_tde_seek_te, 1);
2999
+ rb_define_method(cTermDocEnum, "doc", frb_tde_doc, 0);
3000
+ rb_define_method(cTermDocEnum, "freq", frb_tde_freq, 0);
3001
+ rb_define_method(cTermDocEnum, "next?", frb_tde_next, 0);
3002
+ rb_define_method(cTermDocEnum, "next_position", frb_tde_next_position, 0);
3003
+ rb_define_method(cTermDocEnum, "each", frb_tde_each, 0);
3004
+ rb_define_method(cTermDocEnum, "each_position", frb_tde_each_position, 0);
3005
+ rb_define_method(cTermDocEnum, "skip_to", frb_tde_skip_to, 1);
3006
+ rb_define_method(cTermDocEnum, "to_json", frb_tde_to_json, -1); /* arity -1: the C function receives argc/argv */
3007
+ }
3008
+
3009
+ /* rdochack
3010
+ cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
3011
+ */
3012
+
3013
+ /*
3014
+ * Document-class: Ferret::Index::TermVector::TVOffsets
3015
+ *
3016
+ * == Summary
3017
+ *
3018
+ * Holds the start and end byte-offsets of a term in a field. For example, if
3019
+ * the field was "the quick brown fox" then the start and end offsets of:
3020
+ *
3021
+ * ["the", "quick", "brown", "fox"]
3022
+ *
3023
+ * Would be:
3024
+ *
3025
+ * [(0,3), (4,9), (10,15), (16,19)]
3026
+ *
3027
+ * See the Analysis module for more information on setting the offsets.
3028
+ */
3029
+ static void
3030
+ Init_TVOffsets(void)
3031
+ { /* Creates the TVOffsets Struct class with members "start" and "end". Uses cTermVector, so it must run after cTermVector is assigned. */
3032
+ const char *tv_offsets_class = "TVOffsets";
3033
+ /* rdochack
3034
+ cTVOffsets = rb_define_class_under(cTermVector, "TVOffsets", rb_cObject);
3035
+ */
3036
+ cTVOffsets = rb_struct_define(tv_offsets_class, "start", "end", NULL); /* NULL terminates the member-name list */
3037
+ rb_set_class_path(cTVOffsets, cTermVector, tv_offsets_class); /* class path is set under cTermVector */
3038
+ rb_const_set(mIndex, rb_intern(tv_offsets_class), cTVOffsets); /* NOTE(review): the constant is set on mIndex while the class path above is under cTermVector - confirm this is intended */
3039
+ }
3040
+
3041
+ /*
3042
+ * Document-class: Ferret::Index::TermVector::TVTerm
3043
+ *
3044
+ * == Summary
3045
+ *
3046
+ * The TVTerm class holds the term information for each term in a TermVector.
3047
+ * That is it holds the term's text and its positions in the document. You
3048
+ * can use those positions to reference the offsets for the term.
3049
+ *
3050
+ * == Example
3051
+ *
3052
+ * tv = index_reader.term_vector(doc_id, :content)
3053
+ * tv_term = tv.terms.find {|tvt| tvt.text == "fox"}
3054
+ * offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
3055
+ */
3056
+ static void
3057
+ Init_TVTerm(void)
3058
+ { /* Creates the TVTerm Struct class with members "text", "freq" and "positions". Uses cTermVector, so it must run after cTermVector is assigned. */
3059
+ const char *tv_term_class = "TVTerm";
3060
+ /* rdochack
3061
+ cTVTerm = rb_define_class_under(cTermVector, "TVTerm", rb_cObject);
3062
+ */
3063
+ cTVTerm = rb_struct_define(tv_term_class, "text", "freq", "positions", NULL); /* NULL terminates the member-name list */
3064
+ rb_set_class_path(cTVTerm, cTermVector, tv_term_class); /* class path is set under cTermVector */
3065
+ rb_const_set(mIndex, rb_intern(tv_term_class), cTVTerm); /* NOTE(review): the constant is set on mIndex while the class path above is under cTermVector - confirm this is intended */
3066
+ }
3067
+
3068
+ /*
3069
+ * Document-class: Ferret::Index::TermVector
3070
+ *
3071
+ * == Summary
3072
+ *
3073
+ * TermVectors are most commonly used for creating search result excerpts and
3074
+ * highlight search matches in results. This is all done internally so you
3075
+ * won't need to worry about the TermVector object. There are some other
3076
+ * reasons you may want to use the TermVectors object however. For example,
3077
+ * you may wish to see which terms are the most commonly occurring terms in a
3078
+ * document to implement a MoreLikeThis search.
3079
+ *
3080
+ * == Example
3081
+ *
3082
+ * tv = index_reader.term_vector(doc_id, :content)
3083
+ * tv_term = tv.terms.find {|tvt| tvt.text == "fox"}
3084
+ *
3085
+ * # get the term frequency
3086
+ * term_freq = tv_term.positions.size
3087
+ *
3088
+ * # get the offsets for a term
3089
+ * offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
3090
+ *
3091
+ * == Note
3092
+ *
3093
+ * +positions+ and +offsets+ can be +nil+ depending on what you set the
3094
+ * +:term_vector+ to when you set the FieldInfo object for the field. Note in
3095
+ * particular that you need to store both positions and offsets if you want
3096
+ * to associate offsets with particular terms.
3097
+ */
3098
+ static void
3099
+ Init_TermVector(void)
3100
+ { /* Creates the TermVector Struct class with members "field", "terms" and "offsets", then registers the TVOffsets and TVTerm structs. */
3101
+ const char *tv_class = "TermVector";
3102
+ /* rdochack
3103
+ cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
3104
+ */
3105
+ cTermVector = rb_struct_define(tv_class,
3106
+ "field", "terms", "offsets", NULL);
3107
+ rb_set_class_path(cTermVector, mIndex, tv_class);
3108
+ rb_const_set(mIndex, rb_intern(tv_class), cTermVector);
3109
+
3110
+ Init_TVOffsets(); /* called after cTermVector is assigned because both helpers pass cTermVector to rb_set_class_path */
3111
+ Init_TVTerm();
3112
+ }
3113
+
3114
+ /*
3115
+ * Document-class: Ferret::Index::IndexWriter
3116
+ *
3117
+ * == Summary
3118
+ *
3119
+ * The IndexWriter is the class used to add documents to an index. You can
3120
+ * also delete documents from the index using this class. The indexing
3121
+ * process is highly customizable and the IndexWriter has the following
3122
+ * parameters;
3123
+ *
3124
+ * dir:: This is a Ferret::Store::Directory object. You
3125
+ * should either pass a +:dir+ or a +:path+ when
3126
+ * creating an index.
3127
+ * path:: A string representing the path to the index
3128
+ * directory. If you are creating the index for the
3129
+ * first time the directory will be created if it's
3130
+ * missing. You should not choose a directory which
3131
+ * contains other files as they could be over-written.
3132
+ * To protect against this set +:create_if_missing+ to
3133
+ * false.
3134
+ * create_if_missing:: Default: true. Create the index if no index is
3135
+ * found in the specified directory. Otherwise, use
3136
+ * the existing index.
3137
+ * create:: Default: false. Creates the index, even if one
3138
+ * already exists. That means any existing index will
3139
+ * be deleted. It is probably better to use the
3140
+ * create_if_missing option so that the index is only
3141
+ * created the first time when it doesn't exist.
3142
+ * field_infos:: Default: FieldInfos.new. The FieldInfos object to use
3143
+ * when creating a new index if +:create_if_missing+ or
3144
+ * +:create+ is set to true. If an existing index is
3145
+ * opened then this parameter is ignored.
3146
+ * analyzer:: Default: Ferret::Analysis::StandardAnalyzer.
3147
+ * Sets the default analyzer for the index. This is
3148
+ * used by both the IndexWriter and the QueryParser
3149
+ * to tokenize the input. The default is the
3150
+ * StandardAnalyzer.
3151
+ * chunk_size:: Default: 0x100000 or 1Mb. Memory performance tuning
3152
+ * parameter. Sets the default size of chunks of memory
3153
+ * malloced for use during indexing. You can usually
3154
+ * leave this parameter as is.
3155
+ * max_buffer_memory:: Default: 0x1000000 or 16Mb. Memory performance
3156
+ * tuning parameter. Sets the amount of memory to be
3157
+ * used by the indexing process. Set to a larger value
3158
+ * to increase indexing speed. Note that this only
3159
+ * includes memory used by the indexing process, not
3160
+ * the rest of your ruby application.
3161
+ * term_index_interval:: Default: 128. The skip interval between terms in the
3162
+ * term dictionary. A smaller value will possibly
3163
+ * increase search performance while also increasing
3164
+ * memory usage and negatively impacting
3165
+ * indexing performance.
3166
+ * doc_skip_interval:: Default: 16. The skip interval for document numbers
3167
+ * in the index. As with +:term_index_interval+ you
3168
+ * have a trade-off. A smaller number may increase
3169
+ * search performance while also increasing memory
3170
+ * usage and negatively impacting indexing
3171
+ * performance.
3172
+ * merge_factor:: Default: 10. This must never be less than 2.
3173
+ * Specifies the number of segments of a certain size
3174
+ * that must exist before they are merged. A larger
3175
+ * value will improve indexing performance while
3176
+ * slowing search performance.
3177
+ * max_buffered_docs:: Default: 10000. The maximum number of documents that
3178
+ * may be stored in memory before being written to the
3179
+ * index. If you have a lot of memory and are indexing
3180
+ * a large number of small documents (like products in
3181
+ * a product database for example) you may want to set
3182
+ * this to a much higher number (like
3183
+ * Ferret::FIX_INT_MAX). If you are worried about your
3184
+ * application crashing in the middle of indexing you
3185
+ * might set this to a smaller number so that the index
3186
+ * is committed more often. This is like having an
3187
+ * auto-save in a word processor application.
3188
+ * max_merge_docs:: Set this value to limit the number of documents that
3189
+ * go into a single segment. Use this to avoid
3190
+ * extremely long merge times during indexing which can
3191
+ * make your application seem unresponsive. This is
3192
+ * only necessary for very large indexes (millions of
3193
+ * documents).
3194
+ * max_field_length:: Default: 10000. The maximum number of terms added to
3195
+ * a single field. This can be useful to protect the
3196
+ * indexer when indexing documents from the web for
3197
+ * example. Usually the most important terms will occur
3198
+ * early on in a document so you can often safely
3199
+ * ignore the terms in a field after a certain number
3200
+ * of them. If you wanted to speed up indexing and save
3201
+ * space in your index you may only want to index the
3202
+ * first 1000 terms in a field. On the other hand, if
3203
+ * you want to be more thorough and you are indexing
3204
+ * documents from your file-system you may set this
3205
+ * parameter to Ferret::FIX_INT_MAX.
3206
+ * use_compound_file:: Default: true. Uses a compound file to store the
3207
+ * index. This prevents an error being raised for
3208
+ * having too many files open at the same time. The
3209
+ * default is true but performance is better if this is
3210
+ * set to false.
3211
+ *
3212
+ *
3213
+ * === Deleting Documents
3214
+ *
3215
+ * Both IndexReader and IndexWriter allow you to delete documents. You should
3216
+ * use the IndexReader to delete documents by document id and IndexWriter to
3217
+ * delete documents by term which we'll explain now. It is preferable to
3218
+ * delete documents from an index using IndexWriter for performance reasons.
3219
+ * To delete documents using the IndexWriter you should give each document in
3220
+ * the index a unique ID. If you are indexing documents from the file-system
3221
+ * this unique ID will be the full file path. If indexing documents from the
3222
+ * database you should use the primary key as the ID field. You can then
3223
+ * use the delete method to delete a file referenced by the ID. For example;
3224
+ *
3225
+ * index_writer.delete(:id, "/path/to/indexed/file")
3226
+ */
3227
+ void
3228
+ Init_IndexWriter(void)
3229
+ { /* Caches the IndexWriter option symbols, defines the IndexWriter class under mIndex, and registers its constants and methods. */
3230
+ id_boost = rb_intern("boost");
3231
+
3232
+ sym_create = ID2SYM(rb_intern("create"));
3233
+ sym_create_if_missing = ID2SYM(rb_intern("create_if_missing"));
3234
+ sym_field_infos = ID2SYM(rb_intern("field_infos"));
3235
+
3236
+ sym_chunk_size = ID2SYM(rb_intern("chunk_size"));
3237
+ sym_max_buffer_memory = ID2SYM(rb_intern("max_buffer_memory"));
3238
+ sym_index_interval = ID2SYM(rb_intern("term_index_interval")); /* C variable name differs from the Ruby key :term_index_interval */
3239
+ sym_skip_interval = ID2SYM(rb_intern("doc_skip_interval")); /* C variable name differs from the Ruby key :doc_skip_interval */
3240
+ sym_merge_factor = ID2SYM(rb_intern("merge_factor"));
3241
+ sym_max_buffered_docs = ID2SYM(rb_intern("max_buffered_docs"));
3242
+ sym_max_merge_docs = ID2SYM(rb_intern("max_merge_docs"));
3243
+ sym_max_field_length = ID2SYM(rb_intern("max_field_length"));
3244
+ sym_use_compound_file = ID2SYM(rb_intern("use_compound_file"));
3245
+
3246
+ cIndexWriter = rb_define_class_under(mIndex, "IndexWriter", rb_cObject);
3247
+ rb_define_alloc_func(cIndexWriter, frb_data_alloc);
3248
+
3249
+ rb_define_const(cIndexWriter, "WRITE_LOCK_TIMEOUT", INT2FIX(1));
3250
+ rb_define_const(cIndexWriter, "COMMIT_LOCK_TIMEOUT", INT2FIX(10));
3251
+ rb_define_const(cIndexWriter, "WRITE_LOCK_NAME",
3252
+ rb_str_new2(WRITE_LOCK_NAME));
3253
+ rb_define_const(cIndexWriter, "COMMIT_LOCK_NAME",
3254
+ rb_str_new2(COMMIT_LOCK_NAME));
3255
+ rb_define_const(cIndexWriter, "DEFAULT_CHUNK_SIZE", /* the DEFAULT_* constants below are read from default_config */
3256
+ INT2FIX(default_config.chunk_size));
3257
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_BUFFER_MEMORY",
3258
+ INT2FIX(default_config.max_buffer_memory));
3259
+ rb_define_const(cIndexWriter, "DEFAULT_TERM_INDEX_INTERVAL",
3260
+ INT2FIX(default_config.index_interval));
3261
+ rb_define_const(cIndexWriter, "DEFAULT_DOC_SKIP_INTERVAL",
3262
+ INT2FIX(default_config.skip_interval));
3263
+ rb_define_const(cIndexWriter, "DEFAULT_MERGE_FACTOR",
3264
+ INT2FIX(default_config.merge_factor));
3265
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_BUFFERED_DOCS",
3266
+ INT2FIX(default_config.max_buffered_docs));
3267
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_MERGE_DOCS",
3268
+ INT2FIX(default_config.max_merge_docs));
3269
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_FIELD_LENGTH",
3270
+ INT2FIX(default_config.max_field_length));
3271
+ rb_define_const(cIndexWriter, "DEFAULT_USE_COMPOUND_FILE",
3272
+ default_config.use_compound_file ? Qtrue : Qfalse);
3273
+
3274
+ rb_define_method(cIndexWriter, "initialize", frb_iw_init, -1); /* arity -1: the C function receives argc/argv */
3275
+ rb_define_method(cIndexWriter, "doc_count", frb_iw_get_doc_count, 0);
3276
+ rb_define_method(cIndexWriter, "close", frb_iw_close, 0);
3277
+ rb_define_method(cIndexWriter, "add_document", frb_iw_add_doc, 1);
3278
+ rb_define_method(cIndexWriter, "<<", frb_iw_add_doc, 1); /* bound to the same C function as "add_document" */
3279
+ rb_define_method(cIndexWriter, "optimize", frb_iw_optimize, 0);
3280
+ rb_define_method(cIndexWriter, "commit", frb_iw_commit, 0);
3281
+ rb_define_method(cIndexWriter, "add_readers", frb_iw_add_readers, 1);
3282
+ rb_define_method(cIndexWriter, "delete", frb_iw_delete, 2);
3283
+ rb_define_method(cIndexWriter, "field_infos", frb_iw_field_infos, 0);
3284
+ rb_define_method(cIndexWriter, "analyzer", frb_iw_get_analyzer, 0);
3285
+ rb_define_method(cIndexWriter, "analyzer=", frb_iw_set_analyzer, 1);
3286
+ rb_define_method(cIndexWriter, "version", frb_iw_version, 0);
3287
+
3288
+ rb_define_method(cIndexWriter, "chunk_size", /* from here on: a reader and a "name=" writer for each tuning parameter */
3289
+ frb_iw_get_chunk_size, 0);
3290
+ rb_define_method(cIndexWriter, "chunk_size=",
3291
+ frb_iw_set_chunk_size, 1);
3292
+
3293
+ rb_define_method(cIndexWriter, "max_buffer_memory",
3294
+ frb_iw_get_max_buffer_memory, 0);
3295
+ rb_define_method(cIndexWriter, "max_buffer_memory=",
3296
+ frb_iw_set_max_buffer_memory, 1);
3297
+
3298
+ rb_define_method(cIndexWriter, "term_index_interval",
3299
+ frb_iw_get_index_interval, 0);
3300
+ rb_define_method(cIndexWriter, "term_index_interval=",
3301
+ frb_iw_set_index_interval, 1);
3302
+
3303
+ rb_define_method(cIndexWriter, "doc_skip_interval",
3304
+ frb_iw_get_skip_interval, 0);
3305
+ rb_define_method(cIndexWriter, "doc_skip_interval=",
3306
+ frb_iw_set_skip_interval, 1);
3307
+
3308
+ rb_define_method(cIndexWriter, "merge_factor",
3309
+ frb_iw_get_merge_factor, 0);
3310
+ rb_define_method(cIndexWriter, "merge_factor=",
3311
+ frb_iw_set_merge_factor, 1);
3312
+
3313
+ rb_define_method(cIndexWriter, "max_buffered_docs",
3314
+ frb_iw_get_max_buffered_docs, 0);
3315
+ rb_define_method(cIndexWriter, "max_buffered_docs=",
3316
+ frb_iw_set_max_buffered_docs, 1);
3317
+
3318
+ rb_define_method(cIndexWriter, "max_merge_docs",
3319
+ frb_iw_get_max_merge_docs, 0);
3320
+ rb_define_method(cIndexWriter, "max_merge_docs=",
3321
+ frb_iw_set_max_merge_docs, 1);
3322
+
3323
+ rb_define_method(cIndexWriter, "max_field_length",
3324
+ frb_iw_get_max_field_length, 0);
3325
+ rb_define_method(cIndexWriter, "max_field_length=",
3326
+ frb_iw_set_max_field_length, 1);
3327
+
3328
+ rb_define_method(cIndexWriter, "use_compound_file",
3329
+ frb_iw_get_use_compound_file, 0);
3330
+ rb_define_method(cIndexWriter, "use_compound_file=",
3331
+ frb_iw_set_use_compound_file, 1);
3332
+
3333
+ }
3334
+
3335
+ /*
3336
+ * Document-class: Ferret::Index::LazyDoc
3337
+ *
3338
+ * == Summary
3339
+ *
3340
+ * When a document is retrieved from the index a LazyDoc is returned.
3341
+ * Actually, LazyDoc is just a modified Hash object which lazily adds fields
3342
+ * to itself when they are accessed. You should note that the keys method
3343
+ * will return nothing until you actually access one of the fields. To see
3344
+ * what fields are available use LazyDoc#fields rather than LazyDoc#keys. To
3345
+ * load all fields use the LazyDoc#load method.
3346
+ *
3347
+ * == Example
3348
+ *
3349
+ * doc = index_reader[0]
3350
+ *
3351
+ * doc.keys #=> []
3352
+ * doc.values #=> []
3353
+ * doc.fields #=> [:title, :content]
3354
+ *
3355
+ * title = doc[:title] #=> "the title"
3356
+ * doc.keys #=> [:title]
3357
+ * doc.values #=> ["the title"]
3358
+ * doc.fields #=> [:title, :content]
3359
+ *
3360
+ * doc.load
3361
+ * doc.keys #=> [:title, :content]
3362
+ * doc.values #=> ["the title", "the content"]
3363
+ * doc.fields #=> [:title, :content]
3364
+ */
3365
+ void
3366
+ Init_LazyDoc(void)
3367
+ { /* Defines LazyDoc as a subclass of Hash under mIndex, plus the nested LazyDocData class. */
3368
+ id_fields = rb_intern("@fields"); /* ID of the "@fields" instance-variable name; its use is outside this function */
3369
+
3370
+
3371
+ cLazyDoc = rb_define_class_under(mIndex, "LazyDoc", rb_cHash);
3372
+ rb_define_method(cLazyDoc, "default", frb_lzd_default, 1); /* defines "default" on the Hash subclass; presumably the hook for lazy field loading - confirm in frb_lzd_default */
3373
+ rb_define_method(cLazyDoc, "load", frb_lzd_load, 0);
3374
+ rb_define_method(cLazyDoc, "fields", frb_lzd_fields, 0);
3375
+
3376
+ cLazyDocData = rb_define_class_under(cLazyDoc, "LazyDocData", rb_cObject); /* nested under cLazyDoc; only an allocator is registered here, no methods */
3377
+ rb_define_alloc_func(cLazyDocData, frb_data_alloc);
3378
+ }
3379
+
3380
+ /*
3381
+ * Document-class: Ferret::Index::IndexReader
3382
+ *
3383
+ * == Summary
3384
+ *
3385
+ * IndexReader is used for reading data from the index. This class is usually
3386
+ * used directly for more advanced tasks like iterating through terms in an
3387
+ * index, accessing term-vectors or deleting documents by document id. It is
3388
+ * also used internally by IndexSearcher.
3389
+ */
3390
+ void
3391
+ Init_IndexReader(void)
3392
+ { /* Defines the IndexReader class under mIndex and registers its methods. */
3393
+ cIndexReader = rb_define_class_under(mIndex, "IndexReader", rb_cObject);
3394
+ rb_define_alloc_func(cIndexReader, frb_data_alloc);
3395
+ rb_define_method(cIndexReader, "initialize", frb_ir_init, 1);
3396
+ rb_define_method(cIndexReader, "set_norm", frb_ir_set_norm, 3);
3397
+ rb_define_method(cIndexReader, "norms", frb_ir_norms, 1);
3398
+ rb_define_method(cIndexReader, "get_norms_into",frb_ir_get_norms_into, 3);
3399
+ rb_define_method(cIndexReader, "commit", frb_ir_commit, 0);
3400
+ rb_define_method(cIndexReader, "close", frb_ir_close, 0);
3401
+ rb_define_method(cIndexReader, "has_deletions?",frb_ir_has_deletions, 0);
3402
+ rb_define_method(cIndexReader, "delete", frb_ir_delete, 1);
3403
+ rb_define_method(cIndexReader, "deleted?", frb_ir_is_deleted, 1);
3404
+ rb_define_method(cIndexReader, "max_doc", frb_ir_max_doc, 0);
3405
+ rb_define_method(cIndexReader, "num_docs", frb_ir_num_docs, 0);
3406
+ rb_define_method(cIndexReader, "undelete_all", frb_ir_undelete_all, 0);
3407
+ rb_define_method(cIndexReader, "latest?", frb_ir_is_latest, 0);
3408
+ rb_define_method(cIndexReader, "get_document", frb_ir_get_doc, -1); /* arity -1: the C function receives argc/argv */
3409
+ rb_define_method(cIndexReader, "[]", frb_ir_get_doc, -1); /* bound to the same C function as "get_document" */
3410
+ rb_define_method(cIndexReader, "term_vector", frb_ir_term_vector, 2);
3411
+ rb_define_method(cIndexReader, "term_vectors", frb_ir_term_vectors, 1);
3412
+ rb_define_method(cIndexReader, "term_docs", frb_ir_term_docs, 0);
3413
+ rb_define_method(cIndexReader, "term_positions",frb_ir_term_positions, 0);
3414
+ rb_define_method(cIndexReader, "term_docs_for", frb_ir_term_docs_for, 2);
3415
+ rb_define_method(cIndexReader, "term_positions_for", frb_ir_t_pos_for, 2);
3416
+ rb_define_method(cIndexReader, "doc_freq", frb_ir_doc_freq, 2);
3417
+ rb_define_method(cIndexReader, "terms", frb_ir_terms, 1);
3418
+ rb_define_method(cIndexReader, "terms_from", frb_ir_terms_from, 2);
3419
+ rb_define_method(cIndexReader, "term_count", frb_ir_term_count, 1);
3420
+ rb_define_method(cIndexReader, "fields", frb_ir_fields, 0);
3421
+ rb_define_method(cIndexReader, "field_names", frb_ir_fields, 0); /* bound to the same C function as "fields" */
3422
+ rb_define_method(cIndexReader, "field_infos", frb_ir_field_infos, 0);
3423
+ rb_define_method(cIndexReader, "tokenized_fields", frb_ir_tk_fields, 0);
3424
+ rb_define_method(cIndexReader, "version", frb_ir_version, 0);
3425
+ }
3426
+
3427
+ /* rdoc hack
3428
+ extern VALUE mFerret = rb_define_module("Ferret");
3429
+ */
3430
+
3431
+ /*
3432
+ * Document-module: Ferret::Index
3433
+ *
3434
+ * == Summary
3435
+ *
3436
+ * The Index module contains all the classes used for adding to and
3437
+ * retrieving from the index. The important classes to know about are;
3438
+ *
3439
+ * * FieldInfo
3440
+ * * FieldInfos
3441
+ * * IndexWriter
3442
+ * * IndexReader
3443
+ * * LazyDoc
3444
+ *
3445
+ * The other classes in this module are useful for more advanced uses like
3446
+ * building tag clouds, creating more-like-this queries, custom highlighting
3447
+ * etc. They are also useful for index browsers.
3448
+ */
3449
+ void
3450
+ Init_Index(void)
3451
+ { /* Defines the Index module under mFerret, caches shared symbols, then runs the per-class Init_* functions. */
3452
+ mIndex = rb_define_module_under(mFerret, "Index"); /* must be assigned first: the Init_* functions below pass mIndex when defining their classes */
3453
+
3454
+ sym_boost = ID2SYM(rb_intern("boost"));
3455
+ sym_analyzer = ID2SYM(rb_intern("analyzer"));
3456
+ sym_close_dir = ID2SYM(rb_intern("close_dir"));
3457
+ fsym_content = I("content"); /* NOTE(review): I() is defined outside this view; presumably interns a field name - confirm */
3458
+
3459
+ Init_TermVector(); /* also registers TVOffsets and TVTerm */
3460
+ Init_TermEnum();
3461
+ Init_TermDocEnum();
3462
+
3463
+ Init_FieldInfos(); /* also registers FieldInfo */
3464
+
3465
+ Init_LazyDoc();
3466
+ Init_IndexWriter();
3467
+ Init_IndexReader();
3468
+ }