isomorfeus-ferret 0.12.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (222) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +612 -0
  3. data/README.md +44 -0
  4. data/ext/isomorfeus_ferret_ext/benchmark.c +223 -0
  5. data/ext/isomorfeus_ferret_ext/benchmark.h +45 -0
  6. data/ext/isomorfeus_ferret_ext/benchmarks_all.h +25 -0
  7. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +123 -0
  8. data/ext/isomorfeus_ferret_ext/bm_hash.c +118 -0
  9. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +40 -0
  10. data/ext/isomorfeus_ferret_ext/bm_store.c +93 -0
  11. data/ext/isomorfeus_ferret_ext/email.rl +21 -0
  12. data/ext/isomorfeus_ferret_ext/extconf.rb +5 -0
  13. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -0
  14. data/ext/isomorfeus_ferret_ext/frb_analysis.c +2577 -0
  15. data/ext/isomorfeus_ferret_ext/frb_index.c +3457 -0
  16. data/ext/isomorfeus_ferret_ext/frb_lang.c +9 -0
  17. data/ext/isomorfeus_ferret_ext/frb_lang.h +17 -0
  18. data/ext/isomorfeus_ferret_ext/frb_qparser.c +629 -0
  19. data/ext/isomorfeus_ferret_ext/frb_search.c +4460 -0
  20. data/ext/isomorfeus_ferret_ext/frb_store.c +515 -0
  21. data/ext/isomorfeus_ferret_ext/frb_threading.h +30 -0
  22. data/ext/isomorfeus_ferret_ext/frb_utils.c +1127 -0
  23. data/ext/isomorfeus_ferret_ext/frt_analysis.c +1644 -0
  24. data/ext/isomorfeus_ferret_ext/frt_analysis.h +247 -0
  25. data/ext/isomorfeus_ferret_ext/frt_array.c +124 -0
  26. data/ext/isomorfeus_ferret_ext/frt_array.h +54 -0
  27. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +95 -0
  28. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +586 -0
  29. data/ext/isomorfeus_ferret_ext/frt_compound_io.c +374 -0
  30. data/ext/isomorfeus_ferret_ext/frt_config.h +44 -0
  31. data/ext/isomorfeus_ferret_ext/frt_document.c +134 -0
  32. data/ext/isomorfeus_ferret_ext/frt_document.h +52 -0
  33. data/ext/isomorfeus_ferret_ext/frt_except.c +95 -0
  34. data/ext/isomorfeus_ferret_ext/frt_except.h +188 -0
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +233 -0
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +42 -0
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +157 -0
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +502 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +427 -0
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +290 -0
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +518 -0
  42. data/ext/isomorfeus_ferret_ext/frt_hash.h +466 -0
  43. data/ext/isomorfeus_ferret_ext/frt_hashset.c +191 -0
  44. data/ext/isomorfeus_ferret_ext/frt_hashset.h +206 -0
  45. data/ext/isomorfeus_ferret_ext/frt_helper.c +62 -0
  46. data/ext/isomorfeus_ferret_ext/frt_helper.h +13 -0
  47. data/ext/isomorfeus_ferret_ext/frt_ind.c +353 -0
  48. data/ext/isomorfeus_ferret_ext/frt_ind.h +54 -0
  49. data/ext/isomorfeus_ferret_ext/frt_index.c +6377 -0
  50. data/ext/isomorfeus_ferret_ext/frt_index.h +880 -0
  51. data/ext/isomorfeus_ferret_ext/frt_lang.c +104 -0
  52. data/ext/isomorfeus_ferret_ext/frt_lang.h +44 -0
  53. data/ext/isomorfeus_ferret_ext/frt_mempool.c +87 -0
  54. data/ext/isomorfeus_ferret_ext/frt_mempool.h +33 -0
  55. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +349 -0
  56. data/ext/isomorfeus_ferret_ext/frt_multimapper.h +52 -0
  57. data/ext/isomorfeus_ferret_ext/frt_posh.c +1006 -0
  58. data/ext/isomorfeus_ferret_ext/frt_posh.h +973 -0
  59. data/ext/isomorfeus_ferret_ext/frt_priorityqueue.c +147 -0
  60. data/ext/isomorfeus_ferret_ext/frt_priorityqueue.h +147 -0
  61. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +1612 -0
  62. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +157 -0
  63. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +209 -0
  64. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +281 -0
  65. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +147 -0
  66. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +672 -0
  67. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +3084 -0
  68. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +1182 -0
  69. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +98 -0
  70. data/ext/isomorfeus_ferret_ext/frt_q_range.c +665 -0
  71. data/ext/isomorfeus_ferret_ext/frt_q_span.c +2386 -0
  72. data/ext/isomorfeus_ferret_ext/frt_q_term.c +311 -0
  73. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +166 -0
  74. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +460 -0
  75. data/ext/isomorfeus_ferret_ext/frt_scanner.c +899 -0
  76. data/ext/isomorfeus_ferret_ext/frt_scanner.h +28 -0
  77. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +6705 -0
  78. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +4419 -0
  79. data/ext/isomorfeus_ferret_ext/frt_search.c +1824 -0
  80. data/ext/isomorfeus_ferret_ext/frt_search.h +924 -0
  81. data/ext/isomorfeus_ferret_ext/frt_similarity.c +150 -0
  82. data/ext/isomorfeus_ferret_ext/frt_similarity.h +79 -0
  83. data/ext/isomorfeus_ferret_ext/frt_sort.c +796 -0
  84. data/ext/isomorfeus_ferret_ext/frt_stopwords.c +395 -0
  85. data/ext/isomorfeus_ferret_ext/frt_store.c +680 -0
  86. data/ext/isomorfeus_ferret_ext/frt_store.h +789 -0
  87. data/ext/isomorfeus_ferret_ext/frt_term_vectors.c +72 -0
  88. data/ext/isomorfeus_ferret_ext/frt_threading.h +23 -0
  89. data/ext/isomorfeus_ferret_ext/frt_win32.h +54 -0
  90. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +409 -0
  91. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +95 -0
  92. data/ext/isomorfeus_ferret_ext/libstemmer.c +93 -0
  93. data/ext/isomorfeus_ferret_ext/libstemmer.h +73 -0
  94. data/ext/isomorfeus_ferret_ext/q_parser.y +1366 -0
  95. data/ext/isomorfeus_ferret_ext/scanner.h +28 -0
  96. data/ext/isomorfeus_ferret_ext/scanner.in +43 -0
  97. data/ext/isomorfeus_ferret_ext/scanner.rl +84 -0
  98. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +200 -0
  99. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +85 -0
  100. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +324 -0
  101. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +7 -0
  102. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +610 -0
  103. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +6 -0
  104. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +1104 -0
  105. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +6 -0
  106. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +749 -0
  107. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +7 -0
  108. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +1233 -0
  109. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +6 -0
  110. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +490 -0
  111. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +6 -0
  112. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +1217 -0
  113. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +7 -0
  114. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +1052 -0
  115. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +6 -0
  116. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +283 -0
  117. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +6 -0
  118. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +735 -0
  119. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +6 -0
  120. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +1003 -0
  121. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +7 -0
  122. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +1079 -0
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +6 -0
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +293 -0
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +6 -0
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +984 -0
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +6 -0
  128. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +686 -0
  129. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +6 -0
  130. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.c +325 -0
  131. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.h +6 -0
  132. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.c +620 -0
  133. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.h +6 -0
  134. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.c +1111 -0
  135. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.h +6 -0
  136. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.c +754 -0
  137. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.h +6 -0
  138. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.c +1242 -0
  139. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.h +6 -0
  140. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.c +495 -0
  141. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.h +6 -0
  142. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.c +1220 -0
  143. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.h +6 -0
  144. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.c +1059 -0
  145. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.h +6 -0
  146. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.c +285 -0
  147. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.h +6 -0
  148. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.c +741 -0
  149. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.h +6 -0
  150. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.c +1009 -0
  151. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.h +6 -0
  152. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.c +990 -0
  153. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.h +6 -0
  154. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.c +680 -0
  155. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.h +6 -0
  156. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.c +1083 -0
  157. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.h +6 -0
  158. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.c +294 -0
  159. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.h +6 -0
  160. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.c +2191 -0
  161. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.h +6 -0
  162. data/ext/isomorfeus_ferret_ext/stem_api.c +66 -0
  163. data/ext/isomorfeus_ferret_ext/stem_api.h +26 -0
  164. data/ext/isomorfeus_ferret_ext/stem_header.h +57 -0
  165. data/ext/isomorfeus_ferret_ext/stem_modules.h +190 -0
  166. data/ext/isomorfeus_ferret_ext/stem_modules.txt +50 -0
  167. data/ext/isomorfeus_ferret_ext/stem_utilities.c +478 -0
  168. data/ext/isomorfeus_ferret_ext/test.c +850 -0
  169. data/ext/isomorfeus_ferret_ext/test.h +416 -0
  170. data/ext/isomorfeus_ferret_ext/test_1710.c +63 -0
  171. data/ext/isomorfeus_ferret_ext/test_analysis.c +1221 -0
  172. data/ext/isomorfeus_ferret_ext/test_array.c +272 -0
  173. data/ext/isomorfeus_ferret_ext/test_bitvector.c +600 -0
  174. data/ext/isomorfeus_ferret_ext/test_compound_io.c +170 -0
  175. data/ext/isomorfeus_ferret_ext/test_document.c +156 -0
  176. data/ext/isomorfeus_ferret_ext/test_except.c +244 -0
  177. data/ext/isomorfeus_ferret_ext/test_fields.c +522 -0
  178. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +185 -0
  179. data/ext/isomorfeus_ferret_ext/test_filter.c +331 -0
  180. data/ext/isomorfeus_ferret_ext/test_fs_store.c +25 -0
  181. data/ext/isomorfeus_ferret_ext/test_global.c +299 -0
  182. data/ext/isomorfeus_ferret_ext/test_hash.c +485 -0
  183. data/ext/isomorfeus_ferret_ext/test_hashset.c +288 -0
  184. data/ext/isomorfeus_ferret_ext/test_helper.c +47 -0
  185. data/ext/isomorfeus_ferret_ext/test_highlighter.c +548 -0
  186. data/ext/isomorfeus_ferret_ext/test_index.c +2323 -0
  187. data/ext/isomorfeus_ferret_ext/test_lang.c +74 -0
  188. data/ext/isomorfeus_ferret_ext/test_mempool.c +102 -0
  189. data/ext/isomorfeus_ferret_ext/test_multimapper.c +64 -0
  190. data/ext/isomorfeus_ferret_ext/test_priorityqueue.c +213 -0
  191. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +84 -0
  192. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +61 -0
  193. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +241 -0
  194. data/ext/isomorfeus_ferret_ext/test_q_parser.c +464 -0
  195. data/ext/isomorfeus_ferret_ext/test_q_span.c +575 -0
  196. data/ext/isomorfeus_ferret_ext/test_ram_store.c +77 -0
  197. data/ext/isomorfeus_ferret_ext/test_search.c +1874 -0
  198. data/ext/isomorfeus_ferret_ext/test_segments.c +167 -0
  199. data/ext/isomorfeus_ferret_ext/test_similarity.c +25 -0
  200. data/ext/isomorfeus_ferret_ext/test_sort.c +333 -0
  201. data/ext/isomorfeus_ferret_ext/test_store.c +591 -0
  202. data/ext/isomorfeus_ferret_ext/test_store.h +3 -0
  203. data/ext/isomorfeus_ferret_ext/test_term.c +351 -0
  204. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +373 -0
  205. data/ext/isomorfeus_ferret_ext/test_test.c +83 -0
  206. data/ext/isomorfeus_ferret_ext/test_threading.c +188 -0
  207. data/ext/isomorfeus_ferret_ext/testhelper.c +561 -0
  208. data/ext/isomorfeus_ferret_ext/testhelper.h +25 -0
  209. data/ext/isomorfeus_ferret_ext/tests_all.h +87 -0
  210. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +1854 -0
  211. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +1999 -0
  212. data/ext/isomorfeus_ferret_ext/url.rl +27 -0
  213. data/ext/isomorfeus_ferret_ext/word_list.h +15156 -0
  214. data/lib/isomorfeus/ferret/document.rb +132 -0
  215. data/lib/isomorfeus/ferret/field_symbol.rb +85 -0
  216. data/lib/isomorfeus/ferret/index/field_infos.rb +48 -0
  217. data/lib/isomorfeus/ferret/index/index.rb +970 -0
  218. data/lib/isomorfeus/ferret/monitor.rb +323 -0
  219. data/lib/isomorfeus/ferret/stdlib_patches.rb +151 -0
  220. data/lib/isomorfeus/ferret/version.rb +5 -0
  221. data/lib/isomorfeus-ferret.rb +8 -0
  222. metadata +307 -0
@@ -0,0 +1,3457 @@
1
+ #include "frt_index.h"
2
+ #include "isomorfeus_ferret.h"
3
+ #include <ruby/st.h>
4
+
5
/* Ruby module handle (presumably the Index namespace -- confirm at Init) */
VALUE mIndex;

/* Ruby class handles used when wrapping FieldInfo / FieldInfos structs */
VALUE cFieldInfo;
VALUE cFieldInfos;

/* Ruby class handles (presumably term-vector related -- confirm at Init) */
VALUE cTVOffsets;
VALUE cTVTerm;
VALUE cTermVector;

/* Ruby class handles for the term enumerators */
VALUE cTermEnum;
VALUE cTermDocEnum;

/* Ruby class handles (presumably documents, writer and reader) */
VALUE cLazyDoc;
VALUE cLazyDocData;
VALUE cIndexWriter;
VALUE cIndexReader;

/* Option symbols; sym_analyzer has external linkage, the rest are private */
VALUE sym_analyzer;
static VALUE sym_close_dir;
static VALUE sym_create;
static VALUE sym_create_if_missing;

/* Option symbols (presumably index-writer configuration keys) */
static VALUE sym_chunk_size;
static VALUE sym_max_buffer_memory;
static VALUE sym_index_interval;
static VALUE sym_skip_interval;
static VALUE sym_merge_factor;
static VALUE sym_max_buffered_docs;
static VALUE sym_max_merge_docs;
static VALUE sym_max_field_length;
static VALUE sym_use_compound_file;

/* sym_boost is a hash key read by frb_fi_get_params() */
static VALUE sym_boost;
static VALUE sym_field_infos;

/* Hash keys read by frb_fi_get_params() */
static VALUE sym_store;
static VALUE sym_index;
static VALUE sym_term_vector;

/* Accepted values for the :index option */
static VALUE sym_untokenized;
static VALUE sym_omit_norms;
static VALUE sym_untokenized_omit_norms;

/* Accepted values for the :term_vector option */
static VALUE sym_with_positions;
static VALUE sym_with_offsets;
static VALUE sym_with_positions_offsets;

static FrtSymbol fsym_content;

/* Interned IDs; id_term and id_fld_num_map name instance variables */
static ID id_term;
static ID id_fields;
static ID id_fld_num_map;
static ID id_field_num;
static ID id_boost;

/* Helpers defined in other translation units of the extension */
extern void frb_set_term(VALUE rterm, FrtTerm *t);
extern FrtAnalyzer *frb_get_cwrapped_analyzer(VALUE ranalyzer);
extern VALUE frb_get_analyzer(FrtAnalyzer *a);
63
+
64
+ /****************************************************************************
65
+ *
66
+ * FieldInfo Methods
67
+ *
68
+ ****************************************************************************/
69
+
70
+ static void
71
+ frb_fi_free(void *p)
72
+ {
73
+ object_del(p);
74
+ frt_fi_deref((FrtFieldInfo *)p);
75
+ }
76
+
77
+ static void
78
+ frb_fi_get_params(VALUE roptions,
79
+ FrtStoreValue *store,
80
+ FrtIndexValue *index,
81
+ FrtTermVectorValue *term_vector,
82
+ float *boost)
83
+ {
84
+ VALUE v;
85
+ Check_Type(roptions, T_HASH);
86
+ v = rb_hash_aref(roptions, sym_boost);
87
+ if (Qnil != v) {
88
+ *boost = (float)NUM2DBL(v);
89
+ } else {
90
+ *boost = 1.0f;
91
+ }
92
+ v = rb_hash_aref(roptions, sym_store);
93
+ if (Qnil != v) Check_Type(v, T_SYMBOL);
94
+ if (v == sym_no || v == sym_false || v == Qfalse) {
95
+ *store = FRT_STORE_NO;
96
+ } else if (v == sym_yes || v == sym_true || v == Qtrue) {
97
+ *store = FRT_STORE_YES;
98
+ } else if (v == Qnil) {
99
+ /* leave as default */
100
+ } else {
101
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for :store."
102
+ " Please choose from [:yes, :no]",
103
+ rb_id2name(SYM2ID(v)));
104
+ }
105
+
106
+ v = rb_hash_aref(roptions, sym_index);
107
+ if (Qnil != v) Check_Type(v, T_SYMBOL);
108
+ if (v == sym_no || v == sym_false || v == Qfalse) {
109
+ *index = FRT_INDEX_NO;
110
+ } else if (v == sym_yes || v == sym_true || v == Qtrue) {
111
+ *index = FRT_INDEX_YES;
112
+ } else if (v == sym_untokenized) {
113
+ *index = FRT_INDEX_UNTOKENIZED;
114
+ } else if (v == sym_omit_norms) {
115
+ *index = FRT_INDEX_YES_OMIT_NORMS;
116
+ } else if (v == sym_untokenized_omit_norms) {
117
+ *index = FRT_INDEX_UNTOKENIZED_OMIT_NORMS;
118
+ } else if (v == Qnil) {
119
+ /* leave as default */
120
+ } else {
121
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for :index."
122
+ " Please choose from [:no, :yes, :untokenized, "
123
+ ":omit_norms, :untokenized_omit_norms]",
124
+ rb_id2name(SYM2ID(v)));
125
+ }
126
+
127
+ v = rb_hash_aref(roptions, sym_term_vector);
128
+ if (Qnil != v) Check_Type(v, T_SYMBOL);
129
+ if (v == sym_no || v == sym_false || v == Qfalse) {
130
+ *term_vector = FRT_TERM_VECTOR_NO;
131
+ } else if (v == sym_yes || v == sym_true || v == Qtrue) {
132
+ *term_vector = FRT_TERM_VECTOR_YES;
133
+ } else if (v == sym_with_positions) {
134
+ *term_vector = FRT_TERM_VECTOR_WITH_POSITIONS;
135
+ } else if (v == sym_with_offsets) {
136
+ *term_vector = FRT_TERM_VECTOR_WITH_OFFSETS;
137
+ } else if (v == sym_with_positions_offsets) {
138
+ *term_vector = FRT_TERM_VECTOR_WITH_POSITIONS_OFFSETS;
139
+ } else if (v == Qnil) {
140
+ /* leave as default */
141
+ } else {
142
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for "
143
+ ":term_vector. Please choose from [:no, :yes, "
144
+ ":with_positions, :with_offsets, "
145
+ ":with_positions_offsets]",
146
+ rb_id2name(SYM2ID(v)));
147
+ }
148
+ }
149
+
150
+ static VALUE
151
+ frb_get_field_info(FrtFieldInfo *fi)
152
+ {
153
+
154
+ VALUE rfi = Qnil;
155
+ if (fi) {
156
+ rfi = object_get(fi);
157
+ if (rfi == Qnil) {
158
+ rfi = Data_Wrap_Struct(cFieldInfo, NULL, &frb_fi_free, fi);
159
+ FRT_REF(fi);
160
+ object_add(fi, rfi);
161
+ }
162
+ }
163
+ return rfi;
164
+ }
165
+
166
+ /*
167
+ * call-seq:
168
+ * FieldInfo.new(name, options = {}) -> field_info
169
+ *
170
+ * Create a new FieldInfo object with the name +name+ and the properties
171
+ * specified in +options+. The available options are [:store, :index,
172
+ * :term_vector, :boost]. See the description of FieldInfo for more
173
+ * information on these properties.
174
+ */
175
+ static VALUE
176
+ frb_fi_init(int argc, VALUE *argv, VALUE self)
177
+ {
178
+ VALUE roptions, rname;
179
+ FrtFieldInfo *fi;
180
+ FrtStoreValue store = FRT_STORE_YES;
181
+ FrtIndexValue index = FRT_INDEX_YES;
182
+ FrtTermVectorValue term_vector = FRT_TERM_VECTOR_WITH_POSITIONS_OFFSETS;
183
+ float boost = 1.0f;
184
+
185
+ rb_scan_args(argc, argv, "11", &rname, &roptions);
186
+ if (argc > 1) {
187
+ frb_fi_get_params(roptions, &store, &index, &term_vector, &boost);
188
+ }
189
+ fi = frt_fi_new(frb_field(rname), store, index, term_vector);
190
+ fi->boost = boost;
191
+ Frt_Wrap_Struct(self, NULL, &frb_fi_free, fi);
192
+ object_add(fi, self);
193
+ return self;
194
+ }
195
+
196
+ /*
197
+ * call-seq:
198
+ * fi.name -> symbol
199
+ *
200
+ * Return the name of the field
201
+ */
202
+ static VALUE
203
+ frb_fi_name(VALUE self)
204
+ {
205
+ FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
206
+ return rb_str_new_cstr(rb_id2name(fi->name));
207
+ }
208
+
209
+ /*
210
+ * call-seq:
211
+ * fi.stored? -> bool
212
+ *
213
+ * Return true if the field is stored in the index.
214
+ */
215
+ static VALUE
216
+ frb_fi_is_stored(VALUE self)
217
+ {
218
+ FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
219
+ return fi_is_stored(fi) ? Qtrue : Qfalse;
220
+ }
221
+
222
+ /*
223
+ * call-seq:
224
+ * fi.indexed? -> bool
225
+ *
226
+ * Return true if the field is indexed, ie searchable in the index.
227
+ */
228
+ static VALUE
229
+ frb_fi_is_indexed(VALUE self)
230
+ {
231
+ FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
232
+ return fi_is_indexed(fi) ? Qtrue : Qfalse;
233
+ }
234
+
235
+ /*
236
+ * call-seq:
237
+ * fi.tokenized? -> bool
238
+ *
239
+ * Return true if the field is tokenized. Tokenizing is the process of
240
+ * breaking the field up into tokens. That is "the quick brown fox" becomes:
241
+ *
242
+ * ["the", "quick", "brown", "fox"]
243
+ *
244
+ * A field can only be tokenized if it is indexed.
245
+ */
246
+ static VALUE
247
+ frb_fi_is_tokenized(VALUE self)
248
+ {
249
+ FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
250
+ return fi_is_tokenized(fi) ? Qtrue : Qfalse;
251
+ }
252
+
253
+ /*
254
+ * call-seq:
255
+ * fi.omit_norms? -> bool
256
+ *
257
+ * Return true if the field omits the norm file. The norm file is the file
258
+ * used to store the field boosts for an indexed field. If you do not boost
259
+ * any fields, and you can live without scoring based on field length then
260
+ * you can omit the norms file. This will give the index a slight performance
261
+ * boost and it will use less memory, especially for indexes which have a
262
+ * large number of documents.
263
+ */
264
+ static VALUE
265
+ frb_fi_omit_norms(VALUE self)
266
+ {
267
+ FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
268
+ return fi_omit_norms(fi) ? Qtrue : Qfalse;
269
+ }
270
+
271
+ /*
272
+ * call-seq:
273
+ * fi.store_term_vector? -> bool
274
+ *
275
+ * Return true if the term-vectors are stored for this field.
276
+ */
277
+ static VALUE
278
+ frb_fi_store_term_vector(VALUE self)
279
+ {
280
+ FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
281
+ return fi_store_term_vector(fi) ? Qtrue : Qfalse;
282
+ }
283
+
284
+ /*
285
+ * call-seq:
286
+ * fi.store_positions? -> bool
287
+ *
288
+ * Return true if positions are stored with the term-vectors for this field.
289
+ */
290
+ static VALUE
291
+ frb_fi_store_positions(VALUE self)
292
+ {
293
+ FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
294
+ return fi_store_positions(fi) ? Qtrue : Qfalse;
295
+ }
296
+
297
+ /*
298
+ * call-seq:
299
+ * fi.store_offsets? -> bool
300
+ *
301
+ * Return true if offsets are stored with the term-vectors for this field.
302
+ */
303
+ static VALUE
304
+ frb_fi_store_offsets(VALUE self)
305
+ {
306
+ FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
307
+ return fi_store_offsets(fi) ? Qtrue : Qfalse;
308
+ }
309
+
310
+ /*
311
+ * call-seq:
312
+ * fi.has_norms? -> bool
313
+ *
314
+ * Return true if this field has a norms file. This is the same as calling;
315
+ *
316
+ * fi.indexed? and not fi.omit_norms?
317
+ */
318
+ static VALUE
319
+ frb_fi_has_norms(VALUE self)
320
+ {
321
+ FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
322
+ return fi_has_norms(fi) ? Qtrue : Qfalse;
323
+ }
324
+
325
+ /*
326
+ * call-seq:
327
+ * fi.boost -> boost
328
+ *
329
+ * Return the default boost for this field
330
+ */
331
+ static VALUE
332
+ frb_fi_boost(VALUE self)
333
+ {
334
+ FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
335
+ return rb_float_new((double)fi->boost);
336
+ }
337
+
338
+ /*
339
+ * call-seq:
340
+ * fi.to_s -> string
341
+ *
342
+ * Return a string representation of the FieldInfo object.
343
+ */
344
+ static VALUE
345
+ frb_fi_to_s(VALUE self)
346
+ {
347
+ FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
348
+ char *fi_s = frt_fi_to_s(fi);
349
+ VALUE rfi_s = rb_str_new2(fi_s);
350
+ free(fi_s);
351
+ return rfi_s;
352
+ }
353
+
354
+ /****************************************************************************
355
+ *
356
+ * FieldInfos Methods
357
+ *
358
+ ****************************************************************************/
359
+
360
+ static void
361
+ frb_fis_free(void *p)
362
+ {
363
+ object_del(p);
364
+ frt_fis_deref((FrtFieldInfos *)p);
365
+ }
366
+
367
+ static void
368
+ frb_fis_mark(void *p)
369
+ {
370
+ int i;
371
+ FrtFieldInfos *fis = (FrtFieldInfos *)p;
372
+
373
+ for (i = 0; i < fis->size; i++) {
374
+ frb_gc_mark(fis->fields[i]);
375
+ }
376
+ }
377
+
378
+ static VALUE
379
+ frb_get_field_infos(FrtFieldInfos *fis)
380
+ {
381
+
382
+ VALUE rfis = Qnil;
383
+ if (fis) {
384
+ rfis = object_get(fis);
385
+ if (rfis == Qnil) {
386
+ rfis = Data_Wrap_Struct(cFieldInfos, &frb_fis_mark, &frb_fis_free,
387
+ fis);
388
+ FRT_REF(fis);
389
+ object_add(fis, rfis);
390
+ }
391
+ }
392
+ return rfis;
393
+ }
394
+
395
+ /*
396
+ * call-seq:
397
+ * FieldInfos.new(defaults = {}) -> field_infos
398
+ *
399
+ * Create a new FieldInfos object which uses the default values for fields
400
+ * specified in the +default+ hash parameter. See FieldInfo for available
401
+ * property values.
402
+ */
403
+ static VALUE
404
+ frb_fis_init(int argc, VALUE *argv, VALUE self)
405
+ {
406
+ VALUE roptions;
407
+ FrtFieldInfos *fis;
408
+ FrtStoreValue store = FRT_STORE_YES;
409
+ FrtIndexValue index = FRT_INDEX_YES;
410
+ FrtTermVectorValue term_vector = FRT_TERM_VECTOR_WITH_POSITIONS_OFFSETS;
411
+ float boost;
412
+
413
+ rb_scan_args(argc, argv, "01", &roptions);
414
+ if (argc > 0) {
415
+ frb_fi_get_params(roptions, &store, &index, &term_vector, &boost);
416
+ }
417
+ fis = frt_fis_new(store, index, term_vector);
418
+ Frt_Wrap_Struct(self, &frb_fis_mark, &frb_fis_free, fis);
419
+ object_add(fis, self);
420
+ return self;
421
+ }
422
+
423
+ /*
424
+ * call-seq:
425
+ * fis.to_a -> array
426
+ *
427
+ * Return an array of the FieldInfo objects contained but this FieldInfos
428
+ * object.
429
+ */
430
+ static VALUE
431
+ frb_fis_to_a(VALUE self)
432
+ {
433
+ FrtFieldInfos *fis = (FrtFieldInfos *)DATA_PTR(self);
434
+ VALUE rary = rb_ary_new();
435
+ int i;
436
+
437
+ for (i = 0; i < fis->size; i++) {
438
+ rb_ary_push(rary, frb_get_field_info(fis->fields[i]));
439
+ }
440
+ return rary;
441
+ }
442
+
443
+ /*
444
+ * call-seq:
445
+ * fis[name] -> field_info
446
+ * fis[number] -> field_info
447
+ *
448
+ * Get the FieldInfo object. FieldInfo objects can be referenced by either
449
+ * their field-number of the field-name (which must be a symbol). For
450
+ * example;
451
+ *
452
+ * fi = fis[:name]
453
+ * fi = fis[2]
454
+ */
455
+ static VALUE
456
+ frb_fis_get(VALUE self, VALUE ridx)
457
+ {
458
+ FrtFieldInfos *fis = (FrtFieldInfos *)DATA_PTR(self);
459
+ VALUE rfi = Qnil;
460
+ switch (TYPE(ridx)) {
461
+ case T_FIXNUM: {
462
+ int index = FIX2INT(ridx);
463
+ if (index < 0) index += fis->size;
464
+ if (index < 0 || index >= fis->size) {
465
+ rb_raise(rb_eArgError, "index of %d is out of range (0..%d)\n",
466
+ index, fis->size - 1);
467
+ }
468
+ rfi = frb_get_field_info(fis->fields[index]);
469
+ break;
470
+ }
471
+ case T_SYMBOL:
472
+ case T_STRING:
473
+ rfi = frb_get_field_info(frt_fis_get_field(fis, frb_field(ridx)));
474
+ break;
475
+ /*
476
+ case T_STRING:
477
+ rfi = frb_get_field_info(frt_fis_get_field(fis, StringValuePtr(ridx)));
478
+ break;
479
+ */
480
+ default:
481
+ rb_raise(rb_eArgError, "Can't index FieldInfos with %s",
482
+ rs2s(rb_obj_as_string(ridx)));
483
+ break;
484
+ }
485
+ return rfi;
486
+ }
487
+
488
+ /*
489
+ * call-seq:
490
+ * fis << fi -> fis
491
+ * fis.add(fi) -> fis
492
+ *
493
+ * Add a FieldInfo object. Use the FieldInfos#add_field method where
494
+ * possible.
495
+ */
496
+ static VALUE
497
+ frb_fis_add(VALUE self, VALUE rfi)
498
+ {
499
+ FrtFieldInfos *fis = (FrtFieldInfos *)DATA_PTR(self);
500
+ FrtFieldInfo *fi = (FrtFieldInfo *)frb_rb_data_ptr(rfi);
501
+ frt_fis_add_field(fis, fi);
502
+ FRT_REF(fi);
503
+ return self;
504
+ }
505
+
506
+ /*
507
+ * call-seq:
508
+ * fis.add_field(name, properties = {} -> fis
509
+ *
510
+ * Add a new field to the FieldInfos object. See FieldInfo for a description
511
+ * of the available properties.
512
+ */
513
+ static VALUE
514
+ frb_fis_add_field(int argc, VALUE *argv, VALUE self)
515
+ {
516
+ FrtFieldInfos *fis = (FrtFieldInfos *)DATA_PTR(self);
517
+ FrtFieldInfo *fi;
518
+ FrtStoreValue store = fis->store;
519
+ FrtIndexValue index = fis->index;
520
+ FrtTermVectorValue term_vector = fis->term_vector;
521
+ float boost = 1.0f;
522
+ VALUE rname, roptions;
523
+
524
+ rb_scan_args(argc, argv, "11", &rname, &roptions);
525
+ if (argc > 1) {
526
+ frb_fi_get_params(roptions, &store, &index, &term_vector, &boost);
527
+ }
528
+ fi = frt_fi_new(frb_field(rname), store, index, term_vector);
529
+ fi->boost = boost;
530
+ frt_fis_add_field(fis, fi);
531
+ return self;
532
+ }
533
+
534
+ /*
535
+ * call-seq:
536
+ * fis.each {|fi| do_something } -> fis
537
+ *
538
+ * Iterate through the FieldInfo objects.
539
+ */
540
+ static VALUE
541
+ frb_fis_each(VALUE self)
542
+ {
543
+ int i;
544
+ FrtFieldInfos *fis = (FrtFieldInfos *)DATA_PTR(self);
545
+
546
+ for (i = 0; i < fis->size; i++) {
547
+ rb_yield(frb_get_field_info(fis->fields[i]));
548
+ }
549
+ return self;
550
+ }
551
+
552
+ /*
553
+ * call-seq:
554
+ * fis.to_s -> string
555
+ *
556
+ * Return a string representation of the FieldInfos object.
557
+ */
558
+ static VALUE
559
+ frb_fis_to_s(VALUE self)
560
+ {
561
+ FrtFieldInfos *fis = (FrtFieldInfos *)DATA_PTR(self);
562
+ char *fis_s = frt_fis_to_s(fis);
563
+ VALUE rfis_s = rb_str_new2(fis_s);
564
+ free(fis_s);
565
+ return rfis_s;
566
+ }
567
+
568
+ /*
569
+ * call-seq:
570
+ * fis.size -> int
571
+ *
572
+ * Return the number of fields in the FieldInfos object.
573
+ */
574
+ static VALUE
575
+ frb_fis_size(VALUE self)
576
+ {
577
+ FrtFieldInfos *fis = (FrtFieldInfos *)DATA_PTR(self);
578
+ return INT2FIX(fis->size);
579
+ }
580
+
581
+ /*
582
+ * call-seq:
583
+ * fis.create_index(dir) -> self
584
+ *
585
+ * Create a new index in the directory specified. The directory +dir+ can
586
+ * either be a string path representing a directory on the file-system or an
587
+ * actual directory object. Care should be taken when using this method. Any
588
+ * existing index (or other files for that matter) will be deleted from the
589
+ * directory and overwritten by the new index.
590
+ */
591
+ static VALUE
592
+ frb_fis_create_index(VALUE self, VALUE rdir)
593
+ {
594
+ FrtFieldInfos *fis = (FrtFieldInfos *)DATA_PTR(self);
595
+ FrtStore *store = NULL;
596
+ if (TYPE(rdir) == T_DATA) {
597
+ store = DATA_PTR(rdir);
598
+ FRT_REF(store);
599
+ } else {
600
+ StringValue(rdir);
601
+ frb_create_dir(rdir);
602
+ store = frt_open_fs_store(rs2s(rdir));
603
+ }
604
+ frt_index_create(store, fis);
605
+ frt_store_deref(store);
606
+ return self;
607
+ }
608
+
609
+ /*
610
+ * call-seq:
611
+ * fis.fields -> symbol array
612
+ * fis.field_names -> symbol array
613
+ *
614
+ * Return a list of the field names (as symbols) of all the fields in the
615
+ * index.
616
+ */
617
+ static VALUE
618
+ frb_fis_get_fields(VALUE self)
619
+ {
620
+ FrtFieldInfos *fis = (FrtFieldInfos *)DATA_PTR(self);
621
+ VALUE rfield_names = rb_ary_new();
622
+ int i;
623
+ for (i = 0; i < fis->size; i++) {
624
+ rb_ary_push(rfield_names, ID2SYM(fis->fields[i]->name));
625
+ }
626
+ return rfield_names;
627
+ }
628
+
629
+ /*
630
+ * call-seq:
631
+ * fis.tokenized_fields -> symbol array
632
+ *
633
+ * Return a list of the field names (as symbols) of all the tokenized fields
634
+ * in the index.
635
+ */
636
+ static VALUE
637
+ frb_fis_get_tk_fields(VALUE self)
638
+ {
639
+ FrtFieldInfos *fis = (FrtFieldInfos *)DATA_PTR(self);
640
+ VALUE rfield_names = rb_ary_new();
641
+ int i;
642
+ for (i = 0; i < fis->size; i++) {
643
+ if (!fi_is_tokenized(fis->fields[i])) continue;
644
+ rb_ary_push(rfield_names, rb_str_new_cstr(rb_id2name(fis->fields[i]->name)));
645
+ }
646
+ return rfield_names;
647
+ }
648
+
649
+ /****************************************************************************
650
+ *
651
+ * TermEnum Methods
652
+ *
653
+ ****************************************************************************/
654
+
655
+ static void
656
+ frb_te_free(void *p)
657
+ {
658
+ FrtTermEnum *te = (FrtTermEnum *)p;
659
+ te->close(te);
660
+ }
661
+
662
+ static VALUE
663
+ frb_te_get_set_term(VALUE self, const char *term)
664
+ {
665
+ FrtTermEnum *te = (FrtTermEnum *)DATA_PTR(self);
666
+ VALUE str = term ? rb_str_new(term, te->curr_term_len) : Qnil;
667
+ rb_ivar_set(self, id_term, str);
668
+ return str;
669
+ }
670
+
671
+ static VALUE
672
+ frb_get_te(VALUE rir, FrtTermEnum *te)
673
+ {
674
+ VALUE self = Qnil;
675
+ if (te != NULL) {
676
+ self = Data_Wrap_Struct(cTermEnum, NULL, &frb_te_free, te);
677
+ frb_te_get_set_term(self, te->curr_term);
678
+ rb_ivar_set(self, id_fld_num_map, rb_ivar_get(rir, id_fld_num_map));
679
+ }
680
+ return self;
681
+ }
682
+
683
+ /*
684
+ * call-seq:
685
+ * term_enum.next -> term_string
686
+ *
687
+ * Returns the next term in the enumeration or nil otherwise.
688
+ */
689
+ static VALUE
690
+ frb_te_next(VALUE self)
691
+ {
692
+ FrtTermEnum *te = (FrtTermEnum *)DATA_PTR(self);
693
+ return frb_te_get_set_term(self, te->next(te));
694
+ }
695
+
696
+ /*
697
+ * call-seq:
698
+ * term_enum.term -> term_string
699
+ *
700
+ * Returns the current term pointed to by the enum. This method should only
701
+ * be called after a successful call to TermEnum#next.
702
+ */
703
+ static VALUE
704
+ frb_te_term(VALUE self)
705
+ {
706
+ return rb_ivar_get(self, id_term);
707
+ }
708
+
709
+ /*
710
+ * call-seq:
711
+ * term_enum.doc_freq -> integer
712
+ *
713
+ * Returns the document frequency of the current term pointed to by the enum.
714
+ * That is the number of documents that this term appears in. The method
715
+ * should only be called after a successful call to TermEnum#next.
716
+ */
717
+ static VALUE
718
+ frb_te_doc_freq(VALUE self)
719
+ {
720
+ FrtTermEnum *te = (FrtTermEnum *)DATA_PTR(self);
721
+ return INT2FIX(te->curr_ti.doc_freq);
722
+ }
723
+
724
+ /*
725
+ * call-seq:
726
+ * term_enum.skip_to(target) -> term
727
+ *
728
+ * Skip to term +target+. This method can skip forwards or backwards. If you
729
+ * want to skip back to the start, pass the empty string "". That is;
730
+ *
731
+ * term_enum.skip_to("")
732
+ *
733
+ * Returns the first term greater than or equal to +target+
734
+ */
735
+ static VALUE
736
+ frb_te_skip_to(VALUE self, VALUE rterm)
737
+ {
738
+ FrtTermEnum *te = (FrtTermEnum *)DATA_PTR(self);
739
+ return frb_te_get_set_term(self, te->skip_to(te, rs2s(rterm)));
740
+ }
741
+
742
+ /*
743
+ * call-seq:
744
+ * term_enum.each {|term, doc_freq| do_something() } -> term_count
745
+ *
746
+ * Iterates through all the terms in the field, yielding the term and the
747
+ * document frequency.
748
+ */
749
+ static VALUE
750
+ frb_te_each(VALUE self)
751
+ {
752
+ FrtTermEnum *te = (FrtTermEnum *)DATA_PTR(self);
753
+ char *term;
754
+ int term_cnt = 0;
755
+ VALUE vals = rb_ary_new2(2);
756
+ rb_ary_store(vals, 0, Qnil);
757
+ rb_ary_store(vals, 1, Qnil);
758
+
759
+ /* each is being called so there will be no current term */
760
+ rb_ivar_set(self, id_term, Qnil);
761
+
762
+
763
+ while (NULL != (term = te->next(te))) {
764
+ term_cnt++;
765
+ RARRAY_PTR(vals)[0] = rb_str_new(term, te->curr_term_len);
766
+ RARRAY_PTR(vals)[1] = INT2FIX(te->curr_ti.doc_freq);
767
+ rb_yield(vals);
768
+ }
769
+ return INT2FIX(term_cnt);
770
+ }
771
+
772
+ /*
773
+ * call-seq:
774
+ * term_enum.set_field(field) -> self
775
+ *
776
+ * Set the field for the term_enum. The field value should be a symbol as
777
+ * usual. For example, to scan all title terms you'd do this;
778
+ *
779
+ * term_enum.set_field(:title).each do |term, doc_freq|
780
+ * do_something()
781
+ * end
782
+ */
783
+ static VALUE
784
+ frb_te_set_field(VALUE self, VALUE rfield)
785
+ {
786
+ FrtTermEnum *te = (FrtTermEnum *)DATA_PTR(self);
787
+ int field_num = 0;
788
+ VALUE rfnum_map = rb_ivar_get(self, id_fld_num_map);
789
+ VALUE rfnum = rb_hash_aref(rfnum_map, rfield);
790
+ if (rfnum != Qnil) {
791
+ field_num = FIX2INT(rfnum);
792
+ rb_ivar_set(self, id_field_num, rfnum);
793
+ } else {
794
+ Check_Type(rfield, T_SYMBOL);
795
+ rb_raise(rb_eArgError, "field %s doesn't exist in the index",
796
+ rb_id2name(frb_field(rfield)));
797
+ }
798
+ te->set_field(te, field_num);
799
+
800
+ return self;
801
+ }
802
+
803
+ /*
804
+ * call-seq:
805
+ * term_enum.to_json() -> string
806
+ *
807
+ * Returns a JSON representation of the term enum. You can speed this up by
808
+ * having the method return arrays instead of objects, simply by passing an
809
+ * argument to the to_json method. For example;
810
+ *
811
+ * term_enum.to_json() #=>
812
+ * # [
813
+ * # {"term":"apple","frequency":12},
814
+ * # {"term":"banana","frequency":2},
815
+ * # {"term":"cantaloupe","frequency":12}
816
+ * # ]
817
+ *
818
+ * term_enum.to_json(:fast) #=>
819
+ * # [
820
+ * # ["apple",12],
821
+ * # ["banana",2],
822
+ * # ["cantaloupe",12]
823
+ * # ]
824
+ */
825
+ static VALUE
826
+ frb_te_to_json(int argc, VALUE *argv, VALUE self)
827
+ {
828
+ FrtTermEnum *te = (FrtTermEnum *)DATA_PTR(self);
829
+ VALUE rjson;
830
+ char *json, *jp;
831
+ char *term;
832
+ int capa = 65536;
833
+ jp = json = FRT_ALLOC_N(char, capa);
834
+ *(jp++) = '[';
835
+
836
+ if (argc > 0) {
837
+ while (NULL != (term = te->next(te))) {
838
+ /* enough room for for term after converting " to '"' and frequency
839
+ * plus some extra for good measure */
840
+ *(jp++) = '[';
841
+ if (te->curr_term_len * 3 + (jp - json) + 100 > capa) {
842
+ capa <<= 1;
843
+ FRT_REALLOC_N(json, char, capa);
844
+ }
845
+ jp = json_concat_string(jp, term);
846
+ *(jp++) = ',';
847
+ sprintf(jp, "%d", te->curr_ti.doc_freq);
848
+ jp += strlen(jp);
849
+ *(jp++) = ']';
850
+ *(jp++) = ',';
851
+ }
852
+ }
853
+ else {
854
+ while (NULL != (term = te->next(te))) {
855
+ /* enough room for for term after converting " to '"' and frequency
856
+ * plus some extra for good measure */
857
+ if (te->curr_term_len * 3 + (jp - json) + 100 > capa) {
858
+ capa <<= 1;
859
+ FRT_REALLOC_N(json, char, capa);
860
+ }
861
+ *(jp++) = '{';
862
+ memcpy(jp, "\"term\":", 7);
863
+ jp += 7;
864
+ jp = json_concat_string(jp, term);
865
+ *(jp++) = ',';
866
+ memcpy(jp, "\"frequency\":", 12);
867
+ jp += 12;
868
+ sprintf(jp, "%d", te->curr_ti.doc_freq);
869
+ jp += strlen(jp);
870
+ *(jp++) = '}';
871
+ *(jp++) = ',';
872
+ }
873
+ }
874
+ if (*(jp-1) == ',') jp--;
875
+ *(jp++) = ']';
876
+ *jp = '\0';
877
+
878
+ rjson = rb_str_new2(json);
879
+ free(json);
880
+ return rjson;
881
+ }
882
+
883
+ /****************************************************************************
884
+ *
885
+ * TermDocEnum Methods
886
+ *
887
+ ****************************************************************************/
888
+
889
+ static void
890
+ frb_tde_free(void *p)
891
+ {
892
+ FrtTermDocEnum *tde = (FrtTermDocEnum *)p;
893
+ tde->close(tde);
894
+ }
895
+
896
+ static VALUE
897
+ frb_get_tde(VALUE rir, FrtTermDocEnum *tde)
898
+ {
899
+ VALUE self = Data_Wrap_Struct(cTermDocEnum, NULL, &frb_tde_free, tde);
900
+ rb_ivar_set(self, id_fld_num_map, rb_ivar_get(rir, id_fld_num_map));
901
+ return self;
902
+ }
903
+
904
+ /*
905
+ * call-seq:
906
+ * term_doc_enum.seek(field, term) -> self
907
+ *
908
+ * Seek the term +term+ in the index for +field+. After you call this method
909
+ * you can call next or each to skip through the documents and positions of
910
+ * this particular term.
911
+ */
912
+ static VALUE
913
+ frb_tde_seek(VALUE self, VALUE rfield, VALUE rterm)
914
+ {
915
+ FrtTermDocEnum *tde = (FrtTermDocEnum *)DATA_PTR(self);
916
+ char *term;
917
+ VALUE rfnum_map = rb_ivar_get(self, id_fld_num_map);
918
+ VALUE rfnum = rb_hash_aref(rfnum_map, rfield);
919
+ int field_num = -1;
920
+ term = StringValuePtr(rterm);
921
+ if (rfnum != Qnil) {
922
+ field_num = FIX2INT(rfnum);
923
+ } else {
924
+ rb_raise(rb_eArgError, "field %s doesn't exist in the index",
925
+ rb_id2name(frb_field(rfield)));
926
+ }
927
+ tde->seek(tde, field_num, term);
928
+ return self;
929
+ }
930
+
931
+ /*
932
+ * call-seq:
933
+ * term_doc_enum.seek_term_enum(term_enum) -> self
934
+ *
935
+ * Seek the current term in +term_enum+. You could just use the standard seek
936
+ * method like this;
937
+ *
938
+ * term_doc_enum.seek(term_enum.term)
939
+ *
940
+ * However the +seek_term_enum+ method saves an index lookup so should offer
941
+ * a large performance improvement.
942
+ */
943
+ static VALUE
944
+ frb_tde_seek_te(VALUE self, VALUE rterm_enum)
945
+ {
946
+ FrtTermDocEnum *tde = (FrtTermDocEnum *)DATA_PTR(self);
947
+ FrtTermEnum *te = (FrtTermEnum *)frb_rb_data_ptr(rterm_enum);
948
+ tde->seek_te(tde, te);
949
+ return self;
950
+ }
951
+
952
+ /*
953
+ * call-seq:
954
+ * term_doc_enum.doc -> doc_id
955
+ *
956
+ * Returns the current document number pointed to by the +term_doc_enum+.
957
+ */
958
+ static VALUE
959
+ frb_tde_doc(VALUE self)
960
+ {
961
+ FrtTermDocEnum *tde = (FrtTermDocEnum *)DATA_PTR(self);
962
+ return INT2FIX(tde->doc_num(tde));
963
+ }
964
+
965
+ /*
966
+ * call-seq:
967
+ * term_doc_enum.freq -> integer
968
+ *
969
+ * Returns the frequency of the current document pointed to by the
970
+ * +term_doc_enum+.
971
+ */
972
+ static VALUE
973
+ frb_tde_freq(VALUE self)
974
+ {
975
+ FrtTermDocEnum *tde = (FrtTermDocEnum *)DATA_PTR(self);
976
+ return INT2FIX(tde->freq(tde));
977
+ }
978
+
979
+ /*
980
+ * call-seq:
981
+ * term_doc_enum.next -> bool
982
+ *
983
+ * Move forward to the next document in the enumeration. Returns +true+ if
984
+ * there is another document or +false+ otherwise.
985
+ */
986
+ static VALUE
987
+ frb_tde_next(VALUE self)
988
+ {
989
+ FrtTermDocEnum *tde = (FrtTermDocEnum *)DATA_PTR(self);
990
+ return tde->next(tde) ? Qtrue : Qfalse;
991
+ }
992
+
993
+ /*
994
+ * call-seq:
995
+ * term_doc_enum.next_position -> integer or nil
996
+ *
997
+ * Move forward to the next position of the current term in the current
998
+ * document. Returns the position, or +nil+ if there are no more positions.
999
+ */
1000
+ static VALUE
1001
+ frb_tde_next_position(VALUE self)
1002
+ {
1003
+ FrtTermDocEnum *tde = (FrtTermDocEnum *)DATA_PTR(self);
1004
+ int pos;
1005
+ if (tde->next_position == NULL) {
1006
+ rb_raise(rb_eNotImpError, "to scan through positions you must create "
1007
+ "the TermDocEnum with Index#term_positions method rather "
1008
+ "than the Index#term_docs method");
1009
+ }
1010
+ pos = tde->next_position(tde);
1011
+ return pos >= 0 ? INT2FIX(pos) : Qnil;
1012
+ }
1013
+
1014
+ /*
1015
+ * call-seq:
1016
+ * term_doc_enum.each {|doc_id, freq| do_something() } -> doc_count
1017
+ *
1018
+ * Iterate through the documents and document frequencies in the
1019
+ * +term_doc_enum+.
1020
+ *
1021
+ * NOTE: this method can only be called once after each seek. If you need to
1022
+ * call +#each+ again then you should call +#seek+ again too.
1023
+ */
1024
+ static VALUE
1025
+ frb_tde_each(VALUE self)
1026
+ {
1027
+ int doc_cnt = 0;
1028
+ FrtTermDocEnum *tde = (FrtTermDocEnum *)DATA_PTR(self);
1029
+ VALUE vals = rb_ary_new2(2);
1030
+ rb_ary_store(vals, 0, Qnil);
1031
+ rb_ary_store(vals, 1, Qnil);
1032
+
1033
+ while (tde->next(tde)) {
1034
+ doc_cnt++;
1035
+ RARRAY_PTR(vals)[0] = INT2FIX(tde->doc_num(tde));
1036
+ RARRAY_PTR(vals)[1] = INT2FIX(tde->freq(tde));
1037
+ rb_yield(vals);
1038
+
1039
+ }
1040
+ return INT2FIX(doc_cnt);
1041
+ }
1042
+
1043
+ /*
1044
+ * call-seq:
1045
+ * term_doc_enum.to_json() -> string
1046
+ *
1047
+ * Returns a json representation of the term doc enum. It will also add the
1048
+ * term positions if they are available. You can speed this up by having the
1049
+ * method return arrays instead of objects, simply by passing an argument to
1050
+ * the to_json method. For example;
1051
+ *
1052
+ * term_doc_enum.to_json() #=>
1053
+ * # [
1054
+ * # {"document":1,"frequency":12},
1055
+ * # {"document":11,"frequency":1},
1056
+ * # {"document":29,"frequency":120},
1057
+ * # {"document":30,"frequency":3}
1058
+ * # ]
1059
+ *
1060
+ * term_doc_enum.to_json(:fast) #=>
1061
+ * # [
1062
+ * # [1,12],
1063
+ * # [11,1],
1064
+ * # [29,120],
1065
+ * # [30,3]
1066
+ * # ]
1067
+ */
1068
+ static VALUE
1069
+ frb_tde_to_json(int argc, VALUE *argv, VALUE self)
1070
+ {
1071
+ FrtTermDocEnum *tde = (FrtTermDocEnum *)DATA_PTR(self);
1072
+ VALUE rjson;
1073
+ char *json, *jp;
1074
+ int capa = 65536;
1075
+ const char *format;
1076
+ char close = (argc > 0) ? ']' : '}';
1077
+ bool do_positions = tde->next_position != NULL;
1078
+ jp = json = FRT_ALLOC_N(char, capa);
1079
+ *(jp++) = '[';
1080
+
1081
+ if (do_positions) {
1082
+ if (argc == 0) {
1083
+ format = "{\"document\":%d,\"frequency\":%d,\"positions\":[";
1084
+ }
1085
+ else {
1086
+ format = "[%d,%d,[";
1087
+ }
1088
+ }
1089
+ else {
1090
+ if (argc == 0) {
1091
+ format = "{\"document\":%d,\"frequency\":%d},";
1092
+ }
1093
+ else {
1094
+ format = "[%d,%d],";
1095
+ }
1096
+ }
1097
+ while (tde->next(tde)) {
1098
+ /* 100 chars should be enough room for an extra entry */
1099
+ if ((jp - json) + 100 + tde->freq(tde) * 20 > capa) {
1100
+ capa <<= 1;
1101
+ FRT_REALLOC_N(json, char, capa);
1102
+ }
1103
+ sprintf(jp, format, tde->doc_num(tde), tde->freq(tde));
1104
+ jp += strlen(jp);
1105
+ if (do_positions) {
1106
+ int pos;
1107
+ while (0 <= (pos = tde->next_position(tde))) {
1108
+ sprintf(jp, "%d,", pos);
1109
+ jp += strlen(jp);
1110
+ }
1111
+ if (*(jp - 1) == ',') jp--;
1112
+ *(jp++) = ']';
1113
+ *(jp++) = close;
1114
+ *(jp++) = ',';
1115
+ }
1116
+ }
1117
+ if (*(jp - 1) == ',') jp--;
1118
+ *(jp++) = ']';
1119
+ *jp = '\0';
1120
+
1121
+ rjson = rb_str_new2(json);
1122
+ free(json);
1123
+ return rjson;
1124
+ }
1125
+
1126
+ /*
1127
+ * call-seq:
1128
+ * term_doc_enum.each_position {|pos| do_something } -> term_doc_enum
1129
+ *
1130
+ * Iterate through each of the positions occupied by the current term in the
1131
+ * current document. This can only be called once per document. It can be
1132
+ * used within the each method. For example, to print the terms documents and
1133
+ * positions;
1134
+ *
1135
+ * tde.each do |doc_id, freq|
1136
+ * puts "term appeared #{freq} times in document #{doc_id}:"
1137
+ * positions = []
1138
+ * tde.each_position {|pos| positions << pos}
1139
+ * puts " #{positions.join(', ')}"
1140
+ * end
1141
+ */
1142
+ static VALUE
1143
+ frb_tde_each_position(VALUE self)
1144
+ {
1145
+ FrtTermDocEnum *tde = (FrtTermDocEnum *)DATA_PTR(self);
1146
+ int pos;
1147
+ if (tde->next_position == NULL) {
1148
+ rb_raise(rb_eNotImpError, "to scan through positions you must create "
1149
+ "the TermDocEnum with Index#term_positions method rather "
1150
+ "than the Index#term_docs method");
1151
+ }
1152
+ while (0 <= (pos = tde->next_position(tde))) {
1153
+ rb_yield(INT2FIX(pos));
1154
+ }
1155
+ return self;
1156
+ }
1157
+
1158
+ /*
1159
+ * call-seq:
1160
+ * term_doc_enum.skip_to(target) -> bool
1161
+ *
1162
+ * Skip to the required document number +target+ and return true if there is
1163
+ * a document >= +target+.
1164
+ */
1165
+ static VALUE
1166
+ frb_tde_skip_to(VALUE self, VALUE rtarget)
1167
+ {
1168
+ FrtTermDocEnum *tde = (FrtTermDocEnum *)DATA_PTR(self);
1169
+ return tde->skip_to(tde, FIX2INT(rtarget)) ? Qtrue : Qfalse;
1170
+ }
1171
+
1172
+ /****************************************************************************
1173
+ *
1174
+ * TVOffsets Methods
1175
+ *
1176
+ ****************************************************************************/
1177
+
1178
+ static VALUE
1179
+ frb_get_tv_offsets(FrtOffset *offset)
1180
+ {
1181
+ return rb_struct_new(cTVOffsets,
1182
+ ULL2NUM((frt_u64)offset->start),
1183
+ ULL2NUM((frt_u64)offset->end),
1184
+ NULL);
1185
+ }
1186
+
1187
+ /****************************************************************************
1188
+ *
1189
+ * TVTerm Methods
1190
+ *
1191
+ ****************************************************************************/
1192
+
1193
+ static VALUE
1194
+ frb_get_tv_term(FrtTVTerm *tv_term)
1195
+ {
1196
+ int i;
1197
+ const int freq = tv_term->freq;
1198
+ VALUE rtext;
1199
+ VALUE rpositions = Qnil;
1200
+ rtext = rb_str_new2(tv_term->text);
1201
+ if (tv_term->positions) {
1202
+ int *positions = tv_term->positions;
1203
+ rpositions = rb_ary_new2(freq);
1204
+ for (i = 0; i < freq; i++) {
1205
+ rb_ary_store(rpositions, i, INT2FIX(positions[i]));
1206
+ }
1207
+ }
1208
+ return rb_struct_new(cTVTerm, rtext, INT2FIX(freq), rpositions, NULL);
1209
+ }
1210
+
1211
+ /****************************************************************************
1212
+ *
1213
+ * TermVector Methods
1214
+ *
1215
+ ****************************************************************************/
1216
+
1217
+ static VALUE
1218
+ frb_get_tv(FrtTermVector *tv)
1219
+ {
1220
+ int i;
1221
+ FrtTVTerm *terms = tv->terms;
1222
+ const int t_cnt = tv->term_cnt;
1223
+ const int o_cnt = tv->offset_cnt;
1224
+ VALUE rfield, rterms;
1225
+ VALUE roffsets = Qnil;
1226
+ rfield = ID2SYM(tv->field);
1227
+
1228
+ rterms = rb_ary_new2(t_cnt);
1229
+ for (i = 0; i < t_cnt; i++) {
1230
+ rb_ary_store(rterms, i, frb_get_tv_term(&terms[i]));
1231
+ }
1232
+
1233
+ if (tv->offsets) {
1234
+ FrtOffset *offsets = tv->offsets;
1235
+ roffsets = rb_ary_new2(o_cnt);
1236
+ for (i = 0; i < o_cnt; i++) {
1237
+ rb_ary_store(roffsets, i, frb_get_tv_offsets(&offsets[i]));
1238
+ }
1239
+ }
1240
+
1241
+ return rb_struct_new(cTermVector, rfield, rterms, roffsets, NULL);
1242
+ }
1243
+
1244
+ /****************************************************************************
1245
+ *
1246
+ * FrtIndexWriter Methods
1247
+ *
1248
+ ****************************************************************************/
1249
+
1250
+ void
1251
+ frb_iw_free(void *p)
1252
+ {
1253
+ frt_iw_close((FrtIndexWriter *)p);
1254
+ }
1255
+
1256
+ void
1257
+ frb_iw_mark(void *p)
1258
+ {
1259
+ FrtIndexWriter *iw = (FrtIndexWriter *)p;
1260
+ frb_gc_mark(iw->analyzer);
1261
+ frb_gc_mark(iw->store);
1262
+ frb_gc_mark(iw->fis);
1263
+ }
1264
+
1265
+ /*
1266
+ * call-seq:
1267
+ * index_writer.close -> nil
1268
+ *
1269
+ * Close the IndexWriter. This will close and free all resources used
1270
+ * exclusively by the index writer. The garbage collector will do this
1271
+ * automatically if not called explicitly.
1272
+ */
1273
+ static VALUE
1274
+ frb_iw_close(VALUE self)
1275
+ {
1276
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1277
+ Frt_Unwrap_Struct(self);
1278
+ frt_iw_close(iw);
1279
+ return Qnil;
1280
+ }
1281
+
1282
/* Copies the integer option sym_<attr> from +roptions+ into config.<attr>
 * when the option is present and truthy. Expects +roptions+, +rval+ and
 * +config+ to be in scope at the point of use. */
#define SET_INT_ATTR(attr) \
    do {\
        rval = rb_hash_aref(roptions, sym_##attr);\
        if (RTEST(rval)) {\
            config.attr = FIX2INT(rval);\
        }\
    } while (0)
1287
+
1288
+ /*
1289
+ * call-seq:
1290
+ * IndexWriter.new(options = {}) -> index_writer
1291
+ *
1292
+ * Create a new IndexWriter. You should either pass a path or a directory to
1293
+ * this constructor. For example, here are three ways you can create an
1294
+ * IndexWriter;
1295
+ *
1296
+ * dir = RAMDirectory.new()
1297
+ * iw = IndexWriter.new(:dir => dir)
1298
+ *
1299
+ * dir = FSDirectory.new("/path/to/index")
1300
+ * iw = IndexWriter.new(:dir => dir)
1301
+ *
1302
+ * iw = IndexWriter.new(:path => "/path/to/index")
1303
+ *
1304
+ * See FrtIndexWriter for more options.
1305
+ */
1306
+ static VALUE
1307
+ frb_iw_init(int argc, VALUE *argv, VALUE self)
1308
+ {
1309
+ VALUE roptions, rval;
1310
+ bool create = false;
1311
+ bool create_if_missing = true;
1312
+ FrtStore *store = NULL;
1313
+ FrtAnalyzer *analyzer = NULL;
1314
+ FrtIndexWriter *volatile iw = NULL;
1315
+ FrtConfig config = frt_default_config;
1316
+
1317
+ int ex_code = 0;
1318
+ const char *msg = NULL;
1319
+
1320
+ rb_scan_args(argc, argv, "01", &roptions);
1321
+ FRT_TRY
1322
+ if (argc > 0) {
1323
+ Check_Type(roptions, T_HASH);
1324
+
1325
+ if ((rval = rb_hash_aref(roptions, sym_dir)) != Qnil) {
1326
+ Check_Type(rval, T_DATA);
1327
+ store = DATA_PTR(rval);
1328
+ } else if ((rval = rb_hash_aref(roptions, sym_path)) != Qnil) {
1329
+ StringValue(rval);
1330
+ frb_create_dir(rval);
1331
+ store = frt_open_fs_store(rs2s(rval));
1332
+ FRT_DEREF(store);
1333
+ }
1334
+
1335
+ /* Let ruby's garbage collector handle the closing of the store
1336
+ if (!close_dir) {
1337
+ close_dir = RTEST(rb_hash_aref(roptions, sym_close_dir));
1338
+ }
1339
+ */
1340
+ /* use_compound_file defaults to true */
1341
+ config.use_compound_file =
1342
+ (rb_hash_aref(roptions, sym_use_compound_file) == Qfalse)
1343
+ ? false
1344
+ : true;
1345
+
1346
+ if ((rval = rb_hash_aref(roptions, sym_analyzer)) != Qnil) {
1347
+ analyzer = frb_get_cwrapped_analyzer(rval);
1348
+ }
1349
+
1350
+ create = RTEST(rb_hash_aref(roptions, sym_create));
1351
+ if ((rval = rb_hash_aref(roptions, sym_create_if_missing)) != Qnil) {
1352
+ create_if_missing = RTEST(rval);
1353
+ }
1354
+ SET_INT_ATTR(chunk_size);
1355
+ SET_INT_ATTR(max_buffer_memory);
1356
+ SET_INT_ATTR(index_interval);
1357
+ SET_INT_ATTR(skip_interval);
1358
+ SET_INT_ATTR(merge_factor);
1359
+ SET_INT_ATTR(max_buffered_docs);
1360
+ SET_INT_ATTR(max_merge_docs);
1361
+ SET_INT_ATTR(max_field_length);
1362
+ }
1363
+ if (NULL == store) {
1364
+ store = frt_open_ram_store();
1365
+ FRT_DEREF(store);
1366
+ }
1367
+ if (!create && create_if_missing && !store->exists(store, "segments")) {
1368
+ create = true;
1369
+ }
1370
+ if (create) {
1371
+ FrtFieldInfos *fis;
1372
+ if ((rval = rb_hash_aref(roptions, sym_field_infos)) != Qnil) {
1373
+ Data_Get_Struct(rval, FrtFieldInfos, fis);
1374
+ frt_index_create(store, fis);
1375
+ } else {
1376
+ fis = frt_fis_new(FRT_STORE_YES, FRT_INDEX_YES,
1377
+ FRT_TERM_VECTOR_WITH_POSITIONS_OFFSETS);
1378
+ frt_index_create(store, fis);
1379
+ frt_fis_deref(fis);
1380
+ }
1381
+ }
1382
+
1383
+ iw = frt_iw_open(store, analyzer, &config);
1384
+
1385
+ Frt_Wrap_Struct(self, &frb_iw_mark, &frb_iw_free, iw);
1386
+ default:
1387
+ ex_code = xcontext.excode;
1388
+ msg = xcontext.msg;
1389
+ FRT_HANDLED();
1390
+ FRT_XENDTRY
1391
+
1392
+ if (ex_code && msg) { frb_raise(ex_code, msg); }
1393
+
1394
+ if (rb_block_given_p()) {
1395
+ rb_yield(self);
1396
+ frb_iw_close(self);
1397
+ return Qnil;
1398
+ } else {
1399
+ return self;
1400
+ }
1401
+ }
1402
+
1403
+ /*
1404
+ * call-seq:
1405
+ * iw.doc_count -> number
1406
+ *
1407
+ * Returns the number of documents in the Index. Note that deletions won't be
1408
+ * taken into account until the FrtIndexWriter has been committed.
1409
+ */
1410
+ static VALUE
1411
+ frb_iw_get_doc_count(VALUE self)
1412
+ {
1413
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1414
+ return INT2FIX(frt_iw_doc_count(iw));
1415
+ }
1416
+
1417
+ static int
1418
+ frb_hash_to_doc_i(VALUE key, VALUE value, VALUE arg)
1419
+ {
1420
+ if (key == Qundef) {
1421
+ return ST_CONTINUE;
1422
+ } else {
1423
+ FrtDocument *doc = (FrtDocument *)arg;
1424
+ FrtSymbol field = frb_field(key);
1425
+ VALUE val;
1426
+ FrtDocField *df;
1427
+ if (NULL == (df = frt_doc_get_field(doc, field))) {
1428
+ df = frt_df_new(field);
1429
+ }
1430
+ if (rb_respond_to(value, id_boost)) {
1431
+ df->boost = (float)NUM2DBL(rb_funcall(value, id_boost, 0));
1432
+ }
1433
+ switch (TYPE(value)) {
1434
+ case T_ARRAY:
1435
+ {
1436
+ int i;
1437
+ df->destroy_data = true;
1438
+ for (i = 0; i < RARRAY_LEN(value); i++) {
1439
+ val = rb_obj_as_string(RARRAY_PTR(value)[i]);
1440
+ frt_df_add_data_len(df, rstrdup(val), RSTRING_LEN(val));
1441
+ }
1442
+ }
1443
+ break;
1444
+ case T_STRING:
1445
+ frt_df_add_data_len(df, rs2s(value), RSTRING_LEN(value));
1446
+ break;
1447
+ default:
1448
+ val = rb_obj_as_string(value);
1449
+ df->destroy_data = true;
1450
+ frt_df_add_data_len(df, rstrdup(val), RSTRING_LEN(val));
1451
+ break;
1452
+ }
1453
+ frt_doc_add_field(doc, df);
1454
+ }
1455
+ return ST_CONTINUE;
1456
+ }
1457
+
1458
+ static FrtDocument *
1459
+ frb_get_doc(VALUE rdoc)
1460
+ {
1461
+ VALUE val;
1462
+ FrtDocument *doc = frt_doc_new();
1463
+ FrtDocField *df;
1464
+
1465
+ if (rb_respond_to(rdoc, id_boost)) {
1466
+ doc->boost = (float)NUM2DBL(rb_funcall(rdoc, id_boost, 0));
1467
+ }
1468
+
1469
+ switch (TYPE(rdoc)) {
1470
+ case T_HASH:
1471
+ rb_hash_foreach(rdoc, frb_hash_to_doc_i, (VALUE)doc);
1472
+ break;
1473
+ case T_ARRAY:
1474
+ {
1475
+ int i;
1476
+ df = frt_df_new(fsym_content);
1477
+ df->destroy_data = true;
1478
+ for (i = 0; i < RARRAY_LEN(rdoc); i++) {
1479
+ val = rb_obj_as_string(RARRAY_PTR(rdoc)[i]);
1480
+ frt_df_add_data_len(df, rstrdup(val), RSTRING_LEN(val));
1481
+ }
1482
+ frt_doc_add_field(doc, df);
1483
+ }
1484
+ break;
1485
+ case T_SYMBOL:
1486
+ /* TODO: clean up this ugly cast */
1487
+ df = frt_df_add_data(frt_df_new(fsym_content), (char *)rb_id2name(SYM2ID(rdoc)));
1488
+ frt_doc_add_field(doc, df);
1489
+ break;
1490
+ case T_STRING:
1491
+ df = frt_df_add_data_len(frt_df_new(fsym_content), rs2s(rdoc),
1492
+ RSTRING_LEN(rdoc));
1493
+ frt_doc_add_field(doc, df);
1494
+ break;
1495
+ default:
1496
+ val = rb_obj_as_string(rdoc);
1497
+ df = frt_df_add_data_len(frt_df_new(fsym_content), rstrdup(val),
1498
+ RSTRING_LEN(val));
1499
+ df->destroy_data = true;
1500
+ frt_doc_add_field(doc, df);
1501
+ break;
1502
+ }
1503
+ return doc;
1504
+ }
1505
+
1506
+ /*
1507
+ * call-seq:
1508
+ * iw << document -> iw
1509
+ * iw.add_document(document) -> iw
1510
+ *
1511
+ * Add a document to the index. See Document. A document can also be a simple
1512
+ * hash object.
1513
+ */
1514
+ static VALUE
1515
+ frb_iw_add_doc(VALUE self, VALUE rdoc)
1516
+ {
1517
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1518
+ FrtDocument *doc = frb_get_doc(rdoc);
1519
+ frt_iw_add_doc(iw, doc);
1520
+ frt_doc_destroy(doc);
1521
+ return self;
1522
+ }
1523
+
1524
+ /*
1525
+ * call-seq:
1526
+ * iw.optimize -> iw
1527
+ *
1528
+ * Optimize the index for searching. This commits any unwritten data to the
1529
+ * index and optimizes the index into a single segment to improve search
1530
+ * performance. This is an expensive operation and should not be called too
1531
+ * often. The best time to call this is at the end of a long batch indexing
1532
+ * process. Note that calling the optimize method does not in any way affect
1533
+ * indexing speed (except for the time taken to complete the optimization
1534
+ * process).
1535
+ */
1536
+ static VALUE
1537
+ frb_iw_optimize(VALUE self)
1538
+ {
1539
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1540
+ frt_iw_optimize(iw);
1541
+ return self;
1542
+ }
1543
+
1544
+ /*
1545
+ * call-seq:
1546
+ * iw.commit -> iw
1547
+ *
1548
+ * Explicitly commit any changes to the index that may be hanging around in
1549
+ * memory. You should call this method if you want to read the latest index
1550
+ * with an IndexWriter.
1551
+ */
1552
+ static VALUE
1553
+ frb_iw_commit(VALUE self)
1554
+ {
1555
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1556
+ frt_iw_commit(iw);
1557
+ return self;
1558
+ }
1559
+
1560
+ /*
1561
+ * call-seq:
1562
+ * iw.add_readers(reader_array) -> iw
1563
+ *
1564
+ * Use this method to merge other indexes into the one being written by
1565
+ * IndexWriter. This is useful for parallel indexing. You can have several
1566
+ * indexing processes running in parallel, possibly even on different
1567
+ * machines. Then you can finish by merging all of the indexes into a single
1568
+ * index.
1569
+ */
1570
+ static VALUE
1571
+ frb_iw_add_readers(VALUE self, VALUE rreaders)
1572
+ {
1573
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1574
+ int i;
1575
+ FrtIndexReader **irs;
1576
+ Check_Type(rreaders, T_ARRAY);
1577
+
1578
+ irs = FRT_ALLOC_N(FrtIndexReader *, RARRAY_LEN(rreaders));
1579
+ i = RARRAY_LEN(rreaders);
1580
+ while (i-- > 0) {
1581
+ FrtIndexReader *ir;
1582
+ Data_Get_Struct(RARRAY_PTR(rreaders)[i], FrtIndexReader, ir);
1583
+ irs[i] = ir;
1584
+ }
1585
+ frt_iw_add_readers(iw, irs, RARRAY_LEN(rreaders));
1586
+ free(irs);
1587
+ return self;
1588
+ }
1589
+
1590
+ /*
1591
+ * call-seq:
1592
+ * iw.delete(field, term) -> iw
1593
+ * iw.delete(field, terms) -> iw
1594
+ *
1595
+ * Delete all documents in the index with the given +term+ or +terms+ in the
1596
+ * field +field+. You should usually have a unique document id which you use
1597
+ * with this method, rather than deleting all documents with the word "the"
1598
+ * in them. There are of course exceptions to this rule. For example, you may
1599
+ * want to delete all documents with the term "viagra" when deleting spam.
1600
+ */
1601
+ static VALUE
1602
+ frb_iw_delete(VALUE self, VALUE rfield, VALUE rterm)
1603
+ {
1604
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1605
+ if (TYPE(rterm) == T_ARRAY) {
1606
+ const int term_cnt = RARRAY_LEN(rterm);
1607
+ int i;
1608
+ char **terms = FRT_ALLOC_N(char *, term_cnt);
1609
+ for (i = 0; i < term_cnt; i++) {
1610
+ terms[i] = StringValuePtr(RARRAY_PTR(rterm)[i]);
1611
+ }
1612
+ frt_iw_delete_terms(iw, frb_field(rfield), terms, term_cnt);
1613
+ free(terms);
1614
+ } else {
1615
+ frt_iw_delete_term(iw, frb_field(rfield), StringValuePtr(rterm));
1616
+ }
1617
+ return self;
1618
+ }
1619
+
1620
+ /*
1621
+ * call-seq:
1622
+ * index_writer.field_infos -> FieldInfos
1623
+ *
1624
+ * Get the FieldInfos object for this FrtIndexWriter. This is useful if you need
1625
+ * to dynamically add new fields to the index with specific properties.
1626
+ */
1627
+ static VALUE
1628
+ frb_iw_field_infos(VALUE self)
1629
+ {
1630
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1631
+ return frb_get_field_infos(iw->fis);
1632
+ }
1633
+
1634
+ /*
1635
+ * call-seq:
1636
+ * index_writer.analyzer -> FrtAnalyzer
1637
+ *
1638
+ * Get the FrtAnalyzer for this IndexWriter. This is useful if you need
1639
+ * to use the same analyzer in a QueryParser.
1640
+ */
1641
+ static VALUE
1642
+ frb_iw_get_analyzer(VALUE self)
1643
+ {
1644
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1645
+ return frb_get_analyzer(iw->analyzer);
1646
+ }
1647
+
1648
+ /*
1649
+ * call-seq:
1650
+ * index_writer.analyzer -> FrtAnalyzer
1651
+ *
1652
+ * Set the FrtAnalyzer for this IndexWriter. This is useful if you need to
1653
+ * change the analyzer for a special document. It is risky though as the
1654
+ * same analyzer will be used for all documents during search.
1655
+ */
1656
+ static VALUE
1657
+ frb_iw_set_analyzer(VALUE self, VALUE ranalyzer)
1658
+ {
1659
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1660
+
1661
+ frt_a_deref(iw->analyzer);
1662
+ iw->analyzer = frb_get_cwrapped_analyzer(ranalyzer);
1663
+ return ranalyzer;
1664
+ }
1665
+
1666
+ /*
1667
+ * call-seq:
1668
+ * index_writer.version -> int
1669
+ *
1670
+ * Returns the current version of the index writer.
1671
+ */
1672
+ static VALUE
1673
+ frb_iw_version(VALUE self)
1674
+ {
1675
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1676
+ return ULL2NUM(iw->sis->version);
1677
+ }
1678
+
1679
+ /*
1680
+ * call-seq:
1681
+ * iw.chunk_size -> number
1682
+ *
1683
+ * Return the current value of chunk_size
1684
+ */
1685
+ static VALUE
1686
+ frb_iw_get_chunk_size(VALUE self)
1687
+ {
1688
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1689
+ return INT2FIX(iw->config.chunk_size);
1690
+ }
1691
+
1692
+ /*
1693
+ * call-seq:
1694
+ * iw.chunk_size = chunk_size -> chunk_size
1695
+ *
1696
+ * Set the chunk_size parameter
1697
+ */
1698
+ static VALUE
1699
+ frb_iw_set_chunk_size(VALUE self, VALUE rval)
1700
+ {
1701
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1702
+ iw->config.chunk_size = FIX2INT(rval);
1703
+ return rval;
1704
+ }
1705
+
1706
+ /*
1707
+ * call-seq:
1708
+ * iw.max_buffer_memory -> number
1709
+ *
1710
+ * Return the current value of max_buffer_memory
1711
+ */
1712
+ static VALUE
1713
+ frb_iw_get_max_buffer_memory(VALUE self)
1714
+ {
1715
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1716
+ return INT2FIX(iw->config.max_buffer_memory);
1717
+ }
1718
+
1719
+ /*
1720
+ * call-seq:
1721
+ * iw.max_buffer_memory = max_buffer_memory -> max_buffer_memory
1722
+ *
1723
+ * Set the max_buffer_memory parameter
1724
+ */
1725
+ static VALUE
1726
+ frb_iw_set_max_buffer_memory(VALUE self, VALUE rval)
1727
+ {
1728
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1729
+ iw->config.max_buffer_memory = FIX2INT(rval);
1730
+ return rval;
1731
+ }
1732
+
1733
+ /*
1734
+ * call-seq:
1735
+ * iw.term_index_interval -> number
1736
+ *
1737
+ * Return the current value of term_index_interval
1738
+ */
1739
+ static VALUE
1740
+ frb_iw_get_index_interval(VALUE self)
1741
+ {
1742
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1743
+ return INT2FIX(iw->config.index_interval);
1744
+ }
1745
+
1746
+ /*
1747
+ * call-seq:
1748
+ * iw.term_index_interval = term_index_interval -> term_index_interval
1749
+ *
1750
+ * Set the term_index_interval parameter
1751
+ */
1752
+ static VALUE
1753
+ frb_iw_set_index_interval(VALUE self, VALUE rval)
1754
+ {
1755
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1756
+ iw->config.index_interval = FIX2INT(rval);
1757
+ return rval;
1758
+ }
1759
+
1760
+ /*
1761
+ * call-seq:
1762
+ * iw.doc_skip_interval -> number
1763
+ *
1764
+ * Return the current value of doc_skip_interval
1765
+ */
1766
+ static VALUE
1767
+ frb_iw_get_skip_interval(VALUE self)
1768
+ {
1769
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1770
+ return INT2FIX(iw->config.skip_interval);
1771
+ }
1772
+
1773
+ /*
1774
+ * call-seq:
1775
+ * iw.doc_skip_interval = doc_skip_interval -> doc_skip_interval
1776
+ *
1777
+ * Set the doc_skip_interval parameter
1778
+ */
1779
+ static VALUE
1780
+ frb_iw_set_skip_interval(VALUE self, VALUE rval)
1781
+ {
1782
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1783
+ iw->config.skip_interval = FIX2INT(rval);
1784
+ return rval;
1785
+ }
1786
+
1787
+ /*
1788
+ * call-seq:
1789
+ * iw.merge_factor -> number
1790
+ *
1791
+ * Return the current value of merge_factor
1792
+ */
1793
+ static VALUE
1794
+ frb_iw_get_merge_factor(VALUE self)
1795
+ {
1796
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1797
+ return INT2FIX(iw->config.merge_factor);
1798
+ }
1799
+
1800
+ /*
1801
+ * call-seq:
1802
+ * iw.merge_factor = merge_factor -> merge_factor
1803
+ *
1804
+ * Set the merge_factor parameter
1805
+ */
1806
+ static VALUE
1807
+ frb_iw_set_merge_factor(VALUE self, VALUE rval)
1808
+ {
1809
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1810
+ iw->config.merge_factor = FIX2INT(rval);
1811
+ return rval;
1812
+ }
1813
+
1814
+ /*
1815
+ * call-seq:
1816
+ * iw.max_buffered_docs -> number
1817
+ *
1818
+ * Return the current value of max_buffered_docs
1819
+ */
1820
+ static VALUE
1821
+ frb_iw_get_max_buffered_docs(VALUE self)
1822
+ {
1823
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1824
+ return INT2FIX(iw->config.max_buffered_docs);
1825
+ }
1826
+
1827
+ /*
1828
+ * call-seq:
1829
+ * iw.max_buffered_docs = max_buffered_docs -> max_buffered_docs
1830
+ *
1831
+ * Set the max_buffered_docs parameter
1832
+ */
1833
+ static VALUE
1834
+ frb_iw_set_max_buffered_docs(VALUE self, VALUE rval)
1835
+ {
1836
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1837
+ iw->config.max_buffered_docs = FIX2INT(rval);
1838
+ return rval;
1839
+ }
1840
+
1841
+ /*
1842
+ * call-seq:
1843
+ * iw.max_merge_docs -> number
1844
+ *
1845
+ * Return the current value of max_merge_docs
1846
+ */
1847
+ static VALUE
1848
+ frb_iw_get_max_merge_docs(VALUE self)
1849
+ {
1850
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1851
+ return INT2FIX(iw->config.max_merge_docs);
1852
+ }
1853
+
1854
+ /*
1855
+ * call-seq:
1856
+ * iw.max_merge_docs = max_merge_docs -> max_merge_docs
1857
+ *
1858
+ * Set the max_merge_docs parameter
1859
+ */
1860
+ static VALUE
1861
+ frb_iw_set_max_merge_docs(VALUE self, VALUE rval)
1862
+ {
1863
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1864
+ iw->config.max_merge_docs = FIX2INT(rval);
1865
+ return rval;
1866
+ }
1867
+
1868
+ /*
1869
+ * call-seq:
1870
+ * iw.max_field_length -> number
1871
+ *
1872
+ * Return the current value of max_field_length
1873
+ */
1874
+ static VALUE
1875
+ frb_iw_get_max_field_length(VALUE self)
1876
+ {
1877
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1878
+ return INT2FIX(iw->config.max_field_length);
1879
+ }
1880
+
1881
+ /*
1882
+ * call-seq:
1883
+ * iw.max_field_length = max_field_length -> max_field_length
1884
+ *
1885
+ * Set the max_field_length parameter
1886
+ */
1887
+ static VALUE
1888
+ frb_iw_set_max_field_length(VALUE self, VALUE rval)
1889
+ {
1890
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1891
+ iw->config.max_field_length = FIX2INT(rval);
1892
+ return rval;
1893
+ }
1894
+
1895
+ /*
1896
+ * call-seq:
1897
+ * iw.use_compound_file -> bool
1898
+ *
1899
+ * Return the current value of use_compound_file
1900
+ */
1901
+ static VALUE
1902
+ frb_iw_get_use_compound_file(VALUE self)
1903
+ {
1904
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1905
+ return iw->config.use_compound_file ? Qtrue : Qfalse;
1906
+ }
1907
+
1908
+ /*
1909
+ * call-seq:
1910
+ * iw.use_compound_file = use_compound_file -> use_compound_file
1911
+ *
1912
+ * Set the use_compound_file parameter
1913
+ */
1914
+ static VALUE
1915
+ frb_iw_set_use_compound_file(VALUE self, VALUE rval)
1916
+ {
1917
+ FrtIndexWriter *iw = (FrtIndexWriter *)DATA_PTR(self);
1918
+ iw->config.use_compound_file = RTEST(rval);
1919
+ return rval;
1920
+ }
1921
+
1922
+ /****************************************************************************
1923
+ *
1924
+ * LazyDoc Methods
1925
+ *
1926
+ ****************************************************************************/
1927
+
1928
+ static void
1929
+ frb_lzd_data_free(void *p)
1930
+ {
1931
+ frt_lazy_doc_close((FrtLazyDoc *)p);
1932
+ }
1933
+
1934
+ static VALUE
1935
+ frb_lazy_df_load(VALUE self, VALUE rkey, FrtLazyDocField *lazy_df)
1936
+ {
1937
+ VALUE rdata = Qnil;
1938
+ if (lazy_df) {
1939
+ if (lazy_df->size == 1) {
1940
+ char *data = frt_lazy_df_get_data(lazy_df, 0);
1941
+ rdata = rb_str_new(data, lazy_df->len);
1942
+ } else {
1943
+ int i;
1944
+ rdata = rb_ary_new2(lazy_df->size);
1945
+ for (i = 0; i < lazy_df->size; i++) {
1946
+ char *data = frt_lazy_df_get_data(lazy_df, i);
1947
+ rb_ary_store(rdata, i, rb_str_new(data, lazy_df->data[i].length));
1948
+ }
1949
+ }
1950
+ rb_hash_aset(self, rkey, rdata);
1951
+ }
1952
+ return rdata;
1953
+ }
1954
+
1955
+ /*
1956
+ * call-seq:
1957
+ * lazy_doc.default(key) -> string
1958
+ *
1959
+ * This method is used internally to lazily load fields. You should never
1960
+ * really need to call it yourself.
1961
+ */
1962
+ static VALUE
1963
+ frb_lzd_default(VALUE self, VALUE rkey)
1964
+ {
1965
+ FrtLazyDoc *lazy_doc = (FrtLazyDoc *)DATA_PTR(rb_ivar_get(self, id_data));
1966
+ FrtSymbol field = frb_field(rkey);
1967
+ VALUE rfield = ID2SYM(field);
1968
+
1969
+ return frb_lazy_df_load(self, rfield, frt_lazy_doc_get(lazy_doc, field));
1970
+ }
1971
+
1972
+ /*
1973
+ * call-seq:
1974
+ * lazy_doc.fields -> array of available fields
1975
+ *
1976
+ * Returns the list of fields stored for this particular document. If you try
1977
+ * to access any of these fields in the document the field will be loaded.
1978
+ * Try to access any other field and nil will be returned.
1979
+ */
1980
+ static VALUE
1981
+ frb_lzd_fields(VALUE self)
1982
+ {
1983
+ return rb_ivar_get(self, id_fields);
1984
+ }
1985
+
1986
+ /*
1987
+ * call-seq:
1988
+ * lazy_doc.load -> lazy_doc
1989
+ *
1990
+ * Load all unloaded fields in the document from the index.
1991
+ */
1992
+ static VALUE
1993
+ frb_lzd_load(VALUE self)
1994
+ {
1995
+ FrtLazyDoc *lazy_doc = (FrtLazyDoc *)DATA_PTR(rb_ivar_get(self, id_data));
1996
+ int i;
1997
+ for (i = 0; i < lazy_doc->size; i++) {
1998
+ FrtLazyDocField *lazy_df = lazy_doc->fields[i];
1999
+ frb_lazy_df_load(self, ID2SYM(lazy_df->name), lazy_df);
2000
+ }
2001
+ return self;
2002
+ }
2003
+
2004
+ VALUE
2005
+ frb_get_lazy_doc(FrtLazyDoc *lazy_doc)
2006
+ {
2007
+ int i;
2008
+ VALUE rfields = rb_ary_new2(lazy_doc->size);
2009
+
2010
+ VALUE self, rdata;
2011
+ self = rb_hash_new();
2012
+ OBJSETUP(self, cLazyDoc, T_HASH);
2013
+
2014
+ rdata = Data_Wrap_Struct(cLazyDocData, NULL, &frb_lzd_data_free, lazy_doc);
2015
+ rb_ivar_set(self, id_data, rdata);
2016
+
2017
+ for (i = 0; i < lazy_doc->size; i++) {
2018
+ rb_ary_store(rfields, i, ID2SYM(lazy_doc->fields[i]->name));
2019
+ }
2020
+ rb_ivar_set(self, id_fields, rfields);
2021
+
2022
+ return self;
2023
+ }
2024
+
2025
+ /****************************************************************************
2026
+ *
2027
+ * IndexReader Methods
2028
+ *
2029
+ ****************************************************************************/
2030
+
2031
+ void
2032
+ frb_ir_free(void *p)
2033
+ {
2034
+ object_del(p);
2035
+ frt_ir_close((FrtIndexReader *)p);
2036
+ }
2037
+
2038
+ void
2039
+ frb_ir_mark(void *p)
2040
+ {
2041
+ FrtIndexReader *ir = (FrtIndexReader *)p;
2042
+ frb_gc_mark(ir->store);
2043
+ }
2044
+
2045
+ static VALUE frb_ir_close(VALUE self);
2046
+
2047
+ void
2048
+ frb_mr_mark(void *p)
2049
+ {
2050
+ FrtMultiReader *mr = (FrtMultiReader *)p;
2051
+ int i;
2052
+ for (i = 0; i < mr->r_cnt; i++) {
2053
+ frb_gc_mark(mr->sub_readers[i]);
2054
+ }
2055
+ }
2056
+
2057
+ /*
2058
+ * call-seq:
2059
+ * IndexReader.new(dir) -> index_reader
2060
+ *
2061
+ * Create a new IndexReader. You can either pass a string path to a
2062
+ * file-system directory or an actual Ferret::Store::Directory object. For
2063
+ * example;
2064
+ *
2065
+ * dir = RAMDirectory.new()
2066
+ * iw = IndexReader.new(dir)
2067
+ *
2068
+ * dir = FSDirectory.new("/path/to/index")
2069
+ * iw = IndexReader.new(dir)
2070
+ *
2071
+ * iw = IndexReader.new("/path/to/index")
2072
+ *
2073
+ * You can also create a what used to be known as a MultiReader by passing an
2074
+ * array of IndexReader objects, Ferret::Store::Directory objects or
2075
+ * file-system paths;
2076
+ *
2077
+ * iw = IndexReader.new([dir, dir2, dir3])
2078
+ *
2079
+ * iw = IndexReader.new([reader1, reader2, reader3])
2080
+ *
2081
+ * iw = IndexReader.new(["/path/to/index1", "/path/to/index2"])
2082
+ */
2083
+ static VALUE
2084
+ frb_ir_init(VALUE self, VALUE rdir)
2085
+ {
2086
+ FrtStore *store = NULL;
2087
+ FrtIndexReader *ir;
2088
+ int i;
2089
+ FrtFieldInfos *fis;
2090
+ VALUE rfield_num_map = rb_hash_new();
2091
+ int ex_code = 0;
2092
+ const char *msg = NULL;
2093
+
2094
+ FRT_TRY
2095
+ if (TYPE(rdir) == T_ARRAY) {
2096
+ VALUE rdirs = rdir;
2097
+ const int reader_cnt = RARRAY_LEN(rdir);
2098
+ FrtIndexReader **sub_readers = FRT_ALLOC_N(FrtIndexReader *, reader_cnt);
2099
+ int i;
2100
+ for (i = 0; i < reader_cnt; i++) {
2101
+ rdir = RARRAY_PTR(rdirs)[i];
2102
+ switch (TYPE(rdir)) {
2103
+ case T_DATA:
2104
+ if (CLASS_OF(rdir) == cIndexReader) {
2105
+ Data_Get_Struct(rdir, FrtIndexReader, sub_readers[i]);
2106
+ FRT_REF(sub_readers[i]);
2107
+ continue;
2108
+ } else if (RTEST(rb_obj_is_kind_of(rdir, cDirectory))) {
2109
+ store = DATA_PTR(rdir);
2110
+ } else {
2111
+ FRT_RAISE(FRT_ARG_ERROR, "A Multi-IndexReader can only "
2112
+ "be created from other IndexReaders, "
2113
+ "Directory objects or file-system paths. "
2114
+ "Not %s",
2115
+ rs2s(rb_obj_as_string(rdir)));
2116
+ }
2117
+ break;
2118
+ case T_STRING:
2119
+ frb_create_dir(rdir);
2120
+ store = frt_open_fs_store(rs2s(rdir));
2121
+ FRT_DEREF(store);
2122
+ break;
2123
+ default:
2124
+ FRT_RAISE(FRT_ARG_ERROR, "%s isn't a valid directory "
2125
+ "argument. You should use either a String or "
2126
+ "a Directory",
2127
+ rs2s(rb_obj_as_string(rdir)));
2128
+ break;
2129
+ }
2130
+ sub_readers[i] = frt_ir_open(store);
2131
+ }
2132
+ ir = frt_mr_open(sub_readers, reader_cnt);
2133
+ Frt_Wrap_Struct(self, &frb_mr_mark, &frb_ir_free, ir);
2134
+ } else {
2135
+ switch (TYPE(rdir)) {
2136
+ case T_DATA:
2137
+ store = DATA_PTR(rdir);
2138
+ break;
2139
+ case T_STRING:
2140
+ frb_create_dir(rdir);
2141
+ store = frt_open_fs_store(rs2s(rdir));
2142
+ FRT_DEREF(store);
2143
+ break;
2144
+ default:
2145
+ FRT_RAISE(FRT_ARG_ERROR, "%s isn't a valid directory argument. "
2146
+ "You should use either a String or a Directory",
2147
+ rs2s(rb_obj_as_string(rdir)));
2148
+ break;
2149
+ }
2150
+ ir = frt_ir_open(store);
2151
+ Frt_Wrap_Struct(self, &frb_ir_mark, &frb_ir_free, ir);
2152
+ }
2153
+ default:
2154
+ ex_code = xcontext.excode;
2155
+ msg = xcontext.msg;
2156
+ FRT_HANDLED();
2157
+ FRT_XENDTRY
2158
+
2159
+ if (ex_code && msg) { frb_raise(ex_code, msg); }
2160
+
2161
+ object_add(ir, self);
2162
+
2163
+ fis = ir->fis;
2164
+ for (i = 0; i < fis->size; i++) {
2165
+ FrtFieldInfo *fi = fis->fields[i];
2166
+ rb_hash_aset(rfield_num_map,
2167
+ ID2SYM(fi->name),
2168
+ INT2FIX(fi->number));
2169
+ }
2170
+ rb_ivar_set(self, id_fld_num_map, rfield_num_map);
2171
+
2172
+ return self;
2173
+ }
2174
+
2175
+ /*
2176
+ * call-seq:
2177
+ * index_reader.set_norm(doc_id, field, val)
2178
+ *
2179
+ * Expert: change the boost value for a +field+ in document at +doc_id+.
2180
+ * +val+ should be an integer in the range 0..255 which corresponds to an
2181
+ * encoded float value.
2182
+ */
2183
+ static VALUE
2184
+ frb_ir_set_norm(VALUE self, VALUE rdoc_id, VALUE rfield, VALUE rval)
2185
+ {
2186
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2187
+ frt_ir_set_norm(ir, FIX2INT(rdoc_id), frb_field(rfield), (frt_uchar)NUM2CHR(rval));
2188
+ return self;
2189
+ }
2190
+
2191
+ /*
2192
+ * call-seq:
2193
+ * index_reader.norms(field) -> string
2194
+ *
2195
+ * Expert: Returns a string containing the norm values for a field. The
2196
+ * string length will be equal to the number of documents in the index and it
2197
+ * could have null bytes.
2198
+ */
2199
+ static VALUE
2200
+ frb_ir_norms(VALUE self, VALUE rfield)
2201
+ {
2202
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2203
+ frt_uchar *norms;
2204
+ norms = frt_ir_get_norms(ir, frb_field(rfield));
2205
+ if (norms) {
2206
+ return rb_str_new((char *)norms, ir->max_doc(ir));
2207
+ } else {
2208
+ return Qnil;
2209
+ }
2210
+ }
2211
+
2212
+ /*
2213
+ * call-seq:
2214
+ * index_reader.get_norms_into(field, buffer, offset) -> buffer
2215
+ *
2216
+ * Expert: Get the norm values into a string +buffer+ starting at +offset+.
2217
+ */
2218
+ static VALUE
2219
+ frb_ir_get_norms_into(VALUE self, VALUE rfield, VALUE rnorms, VALUE roffset)
2220
+ {
2221
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2222
+ int offset;
2223
+ offset = FIX2INT(roffset);
2224
+ Check_Type(rnorms, T_STRING);
2225
+ if (RSTRING_LEN(rnorms) < offset + ir->max_doc(ir)) {
2226
+ rb_raise(rb_eArgError, "supplied a string of length:%ld to "
2227
+ "IndexReader#get_norms_into but needed a string of length "
2228
+ "offset:%d + maxdoc:%d",
2229
+ RSTRING_LEN(rnorms), offset, ir->max_doc(ir));
2230
+ }
2231
+
2232
+ frt_ir_get_norms_into(ir, frb_field(rfield),
2233
+ (frt_uchar *)rs2s(rnorms) + offset);
2234
+ return rnorms;
2235
+ }
2236
+
2237
+ /*
2238
+ * call-seq:
2239
+ * index_reader.commit -> index_reader
2240
+ *
2241
+ * Commit any deletes made by this particular IndexReader to the index. This
2242
+ * will open a commit lock.
2243
+ */
2244
+ static VALUE
2245
+ frb_ir_commit(VALUE self)
2246
+ {
2247
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2248
+ frt_ir_commit(ir);
2249
+ return self;
2250
+ }
2251
+
2252
+ /*
2253
+ * call-seq:
2254
+ * index_reader.close -> index_reader
2255
+ *
2256
+ * Close the IndexReader. This method also commits any deletions made by this
2257
+ * IndexReader. This method will be called implicitly by the garbage
2258
+ * collector but you should call it explicitly to commit any changes as soon
2259
+ * as possible and to close any locks held by the object to prevent locking
2260
+ * errors.
2261
+ */
2262
+ static VALUE
2263
+ frb_ir_close(VALUE self)
2264
+ {
2265
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2266
+ object_del(ir);
2267
+ Frt_Unwrap_Struct(self);
2268
+ frt_ir_close(ir);
2269
+ return self;
2270
+ }
2271
+
2272
+ /*
2273
+ * call-seq:
2274
+ * index_reader.has_deletions? -> bool
2275
+ *
2276
+ * Return true if the index has any deletions, either uncommitted by this
2277
+ * IndexReader or committed by any other IndexReader.
2278
+ */
2279
+ static VALUE
2280
+ frb_ir_has_deletions(VALUE self)
2281
+ {
2282
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2283
+ return ir->has_deletions(ir) ? Qtrue : Qfalse;
2284
+ }
2285
+
2286
+ /*
2287
+ * call-seq:
2288
+ * index_reader.delete(doc_id) -> index_reader
2289
+ *
2290
+ * Delete document referenced internally by document id +doc_id+. The
2291
+ * document_id is the number used to reference documents in the index and is
2292
+ * returned by search methods.
2293
+ */
2294
+ static VALUE
2295
+ frb_ir_delete(VALUE self, VALUE rdoc_id)
2296
+ {
2297
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2298
+ frt_ir_delete_doc(ir, FIX2INT(rdoc_id));
2299
+ return self;
2300
+ }
2301
+
2302
+ /*
2303
+ * call-seq:
2304
+ * index_reader.deleted?(doc_id) -> bool
2305
+ *
2306
+ * Returns true if the document at +doc_id+ has been deleted.
2307
+ */
2308
+ static VALUE
2309
+ frb_ir_is_deleted(VALUE self, VALUE rdoc_id)
2310
+ {
2311
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2312
+ return ir->is_deleted(ir, FIX2INT(rdoc_id)) ? Qtrue : Qfalse;
2313
+ }
2314
+
2315
+ /*
2316
+ * call-seq:
2317
+ * index_reader.max_doc -> number
2318
+ *
2319
+ * Returns 1 + the maximum document id in the index. It is the
2320
+ * document_id that will be used by the next document added to the index. If
2321
+ * there are no deletions, this number also refers to the number of documents
2322
+ * in the index.
2323
+ */
2324
+ static VALUE
2325
+ frb_ir_max_doc(VALUE self)
2326
+ {
2327
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2328
+ return INT2FIX(ir->max_doc(ir));
2329
+ }
2330
+
2331
+ /*
2332
+ * call-seq:
2333
+ * index_reader.num_docs -> number
2334
+ *
2335
+ * Returns the number of accessible (not deleted) documents in the index.
2336
+ * This will be equal to IndexReader#max_doc if there have been no documents
2337
+ * deleted from the index.
2338
+ */
2339
+ static VALUE
2340
+ frb_ir_num_docs(VALUE self)
2341
+ {
2342
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2343
+ return INT2FIX(ir->num_docs(ir));
2344
+ }
2345
+
2346
+ /*
2347
+ * call-seq:
2348
+ * index_reader.undelete_all -> index_reader
2349
+ *
2350
+ * Undelete all deleted documents in the index. This is kind of like a
2351
+ * rollback feature. Note that once an index is committed or a merge happens
2352
+ * during index, deletions will be committed and undelete_all will have no
2353
+ * effect on these documents.
2354
+ */
2355
+ static VALUE
2356
+ frb_ir_undelete_all(VALUE self)
2357
+ {
2358
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2359
+ frt_ir_undelete_all(ir);
2360
+ return self;
2361
+ }
2362
+
2363
+ static VALUE
2364
+ frb_get_doc_range(FrtIndexReader *ir, int pos, int len, int max)
2365
+ {
2366
+ VALUE ary;
2367
+ int i;
2368
+ max = FRT_MIN(max, pos+len);
2369
+ len = max - pos;
2370
+ ary = rb_ary_new2(len);
2371
+ for (i = 0; i < len; i++) {
2372
+ rb_ary_store(ary, i, frb_get_lazy_doc(ir->get_lazy_doc(ir, i + pos)));
2373
+ }
2374
+ return ary;
2375
+ }
2376
+
2377
+ /*
2378
+ * call-seq:
2379
+ * index_reader.get_document(doc_id) -> LazyDoc
2380
+ * index_reader[doc_id] -> LazyDoc
2381
+ *
2382
+ * Retrieve a document from the index. See LazyDoc for more details on the
2383
+ * document returned. Documents are referenced internally by document ids
2384
+ * which are returned by the Searchers search methods.
2385
+ */
2386
+ static VALUE
2387
+ frb_ir_get_doc(int argc, VALUE *argv, VALUE self)
2388
+ {
2389
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2390
+ VALUE arg1, arg2;
2391
+ long pos, len;
2392
+ long max = ir->max_doc(ir);
2393
+ rb_scan_args(argc, argv, "11", &arg1, &arg2);
2394
+ if (argc == 1) {
2395
+ if (FIXNUM_P(arg1)) {
2396
+ pos = FIX2INT(arg1);
2397
+ pos = (pos < 0) ? (max + pos) : pos;
2398
+ if (pos < 0 || pos >= max) {
2399
+ rb_raise(rb_eArgError, "index %ld is out of range [%d..%ld] for "
2400
+ "IndexReader#[]", pos, 0, max);
2401
+ }
2402
+ return frb_get_lazy_doc(ir->get_lazy_doc(ir, pos));
2403
+ }
2404
+
2405
+ /* check if idx is Range */
2406
+ /* FIXME: test this with dodgy values */
2407
+ switch (rb_range_beg_len(arg1, &pos, &len, max, 0)) {
2408
+ case Qfalse:
2409
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for "
2410
+ "IndexReader.get_document(index)",
2411
+ rb_id2name(SYM2ID(arg1)));
2412
+ case Qnil:
2413
+ return Qnil;
2414
+ default:
2415
+ return frb_get_doc_range(ir, pos, len, max);
2416
+ }
2417
+ }
2418
+ else {
2419
+ pos = FIX2LONG(arg1);
2420
+ len = FIX2LONG(arg2);
2421
+ return frb_get_doc_range(ir, pos, len, max);
2422
+ }
2423
+ }
2424
+
2425
+ /*
2426
+ * call-seq:
2427
+ * index_reader.is_latest? -> bool
2428
+ *
2429
+ * Return true if the index version referenced by this IndexReader is the
2430
+ * latest version of the index. If it isn't you should close and reopen the
2431
+ * index to search the latest documents added to the index.
2432
+ */
2433
+ static VALUE
2434
+ frb_ir_is_latest(VALUE self)
2435
+ {
2436
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2437
+ return frt_ir_is_latest(ir) ? Qtrue : Qfalse;
2438
+ }
2439
+
2440
+ /*
2441
+ * call-seq:
2442
+ * index_reader.term_vector(doc_id, field) -> TermVector
2443
+ *
2444
+ * Return the TermVector for the field +field+ in the document at +doc_id+ in
2445
+ * the index. Return nil if no such term_vector exists. See TermVector.
2446
+ */
2447
+ static VALUE
2448
+ frb_ir_term_vector(VALUE self, VALUE rdoc_id, VALUE rfield)
2449
+ {
2450
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2451
+ FrtTermVector *tv;
2452
+ VALUE rtv;
2453
+ tv = ir->term_vector(ir, FIX2INT(rdoc_id), frb_field(rfield));
2454
+ if (tv) {
2455
+ rtv = frb_get_tv(tv);
2456
+ frt_tv_destroy(tv);
2457
+ return rtv;
2458
+ }
2459
+ else {
2460
+ return Qnil;
2461
+ }
2462
+ }
2463
+
2464
+ static void
2465
+ frb_add_each_tv(void *key, void *value, void *rtvs)
2466
+ {
2467
+ rb_hash_aset((VALUE)rtvs, ID2SYM((FrtSymbol)key), frb_get_tv(value));
2468
+ }
2469
+
2470
+ /*
2471
+ * call-seq:
2472
+ * index_reader.term_vectors(doc_id) -> hash of TermVector
2473
+ *
2474
+ * Return the TermVectors for the document at +doc_id+ in the index. The
2475
+ * value returned is a hash of the TermVectors for each field in the document
2476
+ * and they are referenced by field names (as symbols).
2477
+ */
2478
+ static VALUE
2479
+ frb_ir_term_vectors(VALUE self, VALUE rdoc_id)
2480
+ {
2481
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2482
+ FrtHash *tvs = ir->term_vectors(ir, FIX2INT(rdoc_id));
2483
+ VALUE rtvs = rb_hash_new();
2484
+ frt_h_each(tvs, &frb_add_each_tv, (void *)rtvs);
2485
+ frt_h_destroy(tvs);
2486
+ return rtvs;
2487
+ }
2488
+
2489
+ /*
2490
+ * call-seq:
2491
+ * index_reader.term_docs -> TermDocEnum
2492
+ *
2493
+ * Builds a TermDocEnum (term-document enumerator) for the index. You can use
2494
+ * this object to iterate through the documents in which certain terms occur.
2495
+ * See TermDocEnum for more info.
2496
+ */
2497
+ static VALUE
2498
+ frb_ir_term_docs(VALUE self)
2499
+ {
2500
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2501
+ return frb_get_tde(self, ir->term_docs(ir));
2502
+ }
2503
+
2504
+ /*
2505
+ * call-seq:
2506
+ * index_reader.term_docs_for(field, term) -> TermDocEnum
2507
+ *
2508
+ * Builds a TermDocEnum to iterate through the documents that contain the
2509
+ * term +term+ in the field +field+. See TermDocEnum for more info.
2510
+ */
2511
+ static VALUE
2512
+ frb_ir_term_docs_for(VALUE self, VALUE rfield, VALUE rterm)
2513
+ {
2514
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2515
+ return frb_get_tde(self, ir_term_docs_for(ir,
2516
+ frb_field(rfield),
2517
+ StringValuePtr(rterm)));
2518
+ }
2519
+
2520
+ /*
2521
+ * call-seq:
2522
+ * index_reader.term_positions -> TermDocEnum
2523
+ *
2524
+ * Same as IndexReader#term_docs except the TermDocEnum will also allow you
2525
+ * to scan through the positions at which a term occurs. See TermDocEnum for
2526
+ * more info.
2527
+ */
2528
+ static VALUE
2529
+ frb_ir_term_positions(VALUE self)
2530
+ {
2531
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2532
+ return frb_get_tde(self, ir->term_positions(ir));
2533
+ }
2534
+
2535
+ /*
2536
+ * call-seq:
2537
+ * index_reader.term_positions_for(field, term) -> TermDocEnum
2538
+ *
2539
+ * Same as IndexReader#term_docs_for(field, term) except the TermDocEnum will
2540
+ * also allow you to scan through the positions at which a term occurs. See
2541
+ * TermDocEnum for more info.
2542
+ */
2543
+ static VALUE
2544
+ frb_ir_t_pos_for(VALUE self, VALUE rfield, VALUE rterm)
2545
+ {
2546
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2547
+ return frb_get_tde(self, frt_ir_term_positions_for(ir,
2548
+ frb_field(rfield),
2549
+ StringValuePtr(rterm)));
2550
+ }
2551
+
2552
+ /*
2553
+ * call-seq:
2554
+ * index_reader.doc_freq(field, term) -> integer
2555
+ *
2556
+ * Return the number of documents in which the term +term+ appears in the
2557
+ * field +field+.
2558
+ */
2559
+ static VALUE
2560
+ frb_ir_doc_freq(VALUE self, VALUE rfield, VALUE rterm)
2561
+ {
2562
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2563
+ return INT2FIX(frt_ir_doc_freq(ir,
2564
+ frb_field(rfield),
2565
+ StringValuePtr(rterm)));
2566
+ }
2567
+
2568
+ /*
2569
+ * call-seq:
2570
+ * index_reader.terms(field) -> TermEnum
2571
+ *
2572
+ * Returns a term enumerator which allows you to iterate through all the
2573
+ * terms in the field +field+ in the index.
2574
+ */
2575
+ static VALUE
2576
+ frb_ir_terms(VALUE self, VALUE rfield)
2577
+ {
2578
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2579
+ return frb_get_te(self, frt_ir_terms(ir, frb_field(rfield)));
2580
+ }
2581
+
2582
+ /*
2583
+ * call-seq:
2584
+ * index_reader.terms_from(field, term) -> TermEnum
2585
+ *
2586
+ * Same as IndexReader#terms(fields) except that it starts the enumerator off
2587
+ * at term +term+.
2588
+ */
2589
+ static VALUE
2590
+ frb_ir_terms_from(VALUE self, VALUE rfield, VALUE rterm)
2591
+ {
2592
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2593
+ return frb_get_te(self, frt_ir_terms_from(ir,
2594
+ frb_field(rfield),
2595
+ StringValuePtr(rterm)));
2596
+ }
2597
+
2598
+ /*
2599
+ * call-seq:
2600
+ * index_reader.term_count(field) -> int
2601
+ *
2602
+ * Same return a count of the number of terms in the field
2603
+ */
2604
+ static VALUE
2605
+ frb_ir_term_count(VALUE self, VALUE rfield)
2606
+ {
2607
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2608
+ FrtTermEnum *te = frt_ir_terms(ir, frb_field(rfield));
2609
+ int count = 0;
2610
+ while (te->next(te)) {
2611
+ count++;
2612
+ }
2613
+ te->close(te);
2614
+ return INT2FIX(count);
2615
+ }
2616
+
2617
+ /*
2618
+ * call-seq:
2619
+ * index_reader.fields -> array of field-names
2620
+ *
2621
+ * Returns an array of field names in the index. This can be used to pass to
2622
+ * the QueryParser so that the QueryParser knows how to expand the "*"
2623
+ * wild-card to all fields in the index. A list of field names can also be
2624
+ * gathered from the FieldInfos object.
2625
+ */
2626
+ static VALUE
2627
+ frb_ir_fields(VALUE self)
2628
+ {
2629
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2630
+ FrtFieldInfos *fis = ir->fis;
2631
+ VALUE rfield_names = rb_ary_new();
2632
+ int i;
2633
+ for (i = 0; i < fis->size; i++) {
2634
+ rb_ary_push(rfield_names, ID2SYM(fis->fields[i]->name));
2635
+ }
2636
+ return rfield_names;
2637
+ }
2638
+
2639
+ /*
2640
+ * call-seq:
2641
+ * index_reader.field_infos -> FieldInfos
2642
+ *
2643
+ * Get the FieldInfos object for this IndexReader.
2644
+ */
2645
+ static VALUE
2646
+ frb_ir_field_infos(VALUE self)
2647
+ {
2648
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2649
+ return frb_get_field_infos(ir->fis);
2650
+ }
2651
+
2652
+ /*
2653
+ * call-seq:
2654
+ * index_reader.tokenized_fields -> array of field-names
2655
+ *
2656
+ * Returns an array of field names of all of the tokenized fields in the
2657
+ * index. This can be used to pass to the QueryParser so that the QueryParser
2658
+ * knows how to expand the "*" wild-card to all fields in the index. A list
2659
+ * of field names can also be gathered from the FieldInfos object.
2660
+ */
2661
+ static VALUE
2662
+ frb_ir_tk_fields(VALUE self)
2663
+ {
2664
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2665
+ FrtFieldInfos *fis = ir->fis;
2666
+ VALUE rfield_names = rb_ary_new();
2667
+ int i;
2668
+ for (i = 0; i < fis->size; i++) {
2669
+ if (!fi_is_tokenized(fis->fields[i])) continue;
2670
+ rb_ary_push(rfield_names, rb_str_new_cstr(rb_id2name(fis->fields[i]->name)));
2671
+ }
2672
+ return rfield_names;
2673
+ }
2674
+
2675
+ /*
2676
+ * call-seq:
2677
+ * index_reader.version -> int
2678
+ *
2679
+ * Returns the current version of the index reader.
2680
+ */
2681
+ static VALUE
2682
+ frb_ir_version(VALUE self)
2683
+ {
2684
+ FrtIndexReader *ir = (FrtIndexReader *)DATA_PTR(self);
2685
+ return ULL2NUM(ir->sis->version);
2686
+ }
2687
+
2688
+ /****************************************************************************
2689
+ *
2690
+ * Init Functions
2691
+ *
2692
+ ****************************************************************************/
2693
+
2694
+
2695
+ /*
2696
+ * Document-class: Ferret::Index::FieldInfo
2697
+ *
2698
+ * == Summary
2699
+ *
2700
+ * The FieldInfo class is the field descriptor for the index. It specifies
2701
+ * whether a field should be indexed and
2702
+ * tokenized. Every field has a name which must be a symbol. There are three
2703
+ * properties that you can set, +:store+, +:index+ and +:term_vector+. You
2704
+ * can also set the default +:boost+ for a field as well.
2705
+ *
2706
+ * == Properties
2707
+ *
2708
+ * === :store
2709
+ *
2710
+ * The +:store+ property allows you to specify how a field is stored. You can
2711
+ * leave a field unstored (+:no+), store it in its original format (+:yes+).
2712
+ * By default the document
2713
+ * is stored in its original format. If the field is large and it is stored
2714
+ * elsewhere where it is easily accessible you might want to leave it
2715
+ * unstored. This will keep the index size a lot smaller and make the
2716
+ * indexing process a lot faster. For example, you should probably leave the
2717
+ * +:content+ field unstored when indexing all the documents in your
2718
+ * file-system.
2719
+ *
2720
+ * === :index
2721
+ *
2722
+ * The +:index+ property allows you to specify how a field is indexed. A
2723
+ * field must be indexed to be searchable. However, a field doesn't need to
2724
+ * be indexed to be stored in the Ferret index. You may want to use the index
2725
+ * as a simple database and store things like images or MP3s in the index. By
2726
+ * default each field is indexed and tokenized (split into tokens) (+:yes+).
2727
+ * If you don't want to index the field use +:no+. If you want the field
2728
+ * indexed but not tokenized, use +:untokenized+. Do this for the fields you
2729
+ * wish to sort by. There are two other values for +:index+; +:omit_norms+
2730
+ * and +:untokenized_omit_norms+. These values correspond to +:yes+ and
2731
+ * +:untokenized+ respectively and are useful if you are not boosting any
2732
+ * fields and you'd like to speed up the index. The norms file is the file
2733
+ * which contains the boost values for each document for a particular field.
2734
+ *
2735
+ * === :term_vector
2736
+ *
2737
+ * See TermVector for a description of term-vectors. You can specify whether
2738
+ * or not you would like to store term-vectors. The available options are
2739
+ * +:no+, +:yes+, +:with_positions+, +:with_offsets+ and
2740
+ * +:with_positions_offsets+. Note that you need to store the positions to
2741
+ * associate offsets with individual terms in the term_vector.
2742
+ *
2743
+ * == Property Table
2744
+ *
2745
+ * Property Value Description
2746
+ * ------------------------------------------------------------------------
2747
+ * :store | :no | Don't store field
2748
+ * | |
2749
+ * | :yes (default) | Store field in its original
2750
+ * | | format. Use this value if you
2751
+ * | | want to highlight matches
2752
+ * | | or print match excerpts a la
2753
+ * | | Google search.
2754
+ * -------------|-------------------------|------------------------------
2755
+ * :index | :no | Do not make this field
2756
+ * | | searchable.
2757
+ * | |
2758
+ * | :yes (default) | Make this field searchable and
2759
+ * | | tokenize its contents.
2760
+ * | |
2761
+ * | :untokenized | Make this field searchable but
2762
+ * | | do not tokenize its contents.
2763
+ * | | use this value for fields you
2764
+ * | | wish to sort by.
2765
+ * | |
2766
+ * | :omit_norms | Same as :yes except omit the
2767
+ * | | norms file. The norms file can
2768
+ * | | be omitted if you don't boost
2769
+ * | | any fields and you don't need
2770
+ * | | scoring based on field length.
2771
+ * | |
2772
+ * | :untokenized_omit_norms | Same as :untokenized except omit
2773
+ * | | the norms file. Norms files can
2774
+ * | | be omitted if you don't boost
2775
+ * | | any fields and you don't need
2776
+ * | | scoring based on field length.
2777
+ * | |
2778
+ * -------------|-------------------------|------------------------------
2779
+ * :term_vector | :no | Don't store term-vectors
2780
+ * | |
2781
+ * | :yes | Store term-vectors without
2782
+ * | | storing positions or offsets.
2783
+ * | |
2784
+ * | :with_positions | Store term-vectors with
2785
+ * | | positions.
2786
+ * | |
2787
+ * | :with_offsets | Store term-vectors with
2788
+ * | | offsets.
2789
+ * | |
2790
+ * | :with_positions_offsets | Store term-vectors with
2791
+ * | (default) | positions and offsets.
2792
+ * -------------|-------------------------|------------------------------
2793
+ * :boost | Float | The boost property is used to
2794
+ * | | set the default boost for a
2795
+ * | | field. This boost value will
2796
+ * | | be used for all instances of the
2797
+ * | | field in the index unless
2798
+ * | | otherwise specified when you
2799
+ * | | create the field. All values
2800
+ * | | should be positive.
2801
+ * | |
2802
+ *
2803
+ * == Examples
2804
+ *
2805
+ * fi = FieldInfo.new(:title, :index => :untokenized, :term_vector => :no,
2806
+ * :boost => 10.0)
2807
+ *
2808
+ * fi = FieldInfo.new(:content)
2809
+ *
2810
+ * fi = FieldInfo.new(:created_on, :index => :untokenized_omit_norms,
2811
+ * :term_vector => :no)
2812
+ */
2813
+ static void
2814
+ Init_FieldInfo(void)
2815
+ {
2816
+ sym_store = ID2SYM(rb_intern("store"));
2817
+ sym_index = ID2SYM(rb_intern("index"));
2818
+ sym_term_vector = ID2SYM(rb_intern("term_vector"));
2819
+
2820
+ sym_untokenized = ID2SYM(rb_intern("untokenized"));
2821
+ sym_omit_norms = ID2SYM(rb_intern("omit_norms"));
2822
+ sym_untokenized_omit_norms = ID2SYM(rb_intern("untokenized_omit_norms"));
2823
+
2824
+ sym_with_positions = ID2SYM(rb_intern("with_positions"));
2825
+ sym_with_offsets = ID2SYM(rb_intern("with_offsets"));
2826
+ sym_with_positions_offsets = ID2SYM(rb_intern("with_positions_offsets"));
2827
+
2828
+ cFieldInfo = rb_define_class_under(mIndex, "FieldInfo", rb_cObject);
2829
+ rb_define_alloc_func(cFieldInfo, frb_data_alloc);
2830
+
2831
+ rb_define_method(cFieldInfo, "initialize", frb_fi_init, -1);
2832
+ rb_define_method(cFieldInfo, "name", frb_fi_name, 0);
2833
+ rb_define_method(cFieldInfo, "stored?", frb_fi_is_stored, 0);
2834
+ rb_define_method(cFieldInfo, "indexed?", frb_fi_is_indexed, 0);
2835
+ rb_define_method(cFieldInfo, "tokenized?", frb_fi_is_tokenized, 0);
2836
+ rb_define_method(cFieldInfo, "omit_norms?", frb_fi_omit_norms, 0);
2837
+ rb_define_method(cFieldInfo, "store_term_vector?",
2838
+ frb_fi_store_term_vector, 0);
2839
+ rb_define_method(cFieldInfo, "store_positions?",
2840
+ frb_fi_store_positions, 0);
2841
+ rb_define_method(cFieldInfo, "store_offsets?",
2842
+ frb_fi_store_offsets, 0);
2843
+ rb_define_method(cFieldInfo, "has_norms?", frb_fi_has_norms, 0);
2844
+ rb_define_method(cFieldInfo, "boost", frb_fi_boost, 0);
2845
+ rb_define_method(cFieldInfo, "to_s", frb_fi_to_s, 0);
2846
+ }
2847
+
2848
+ /*
2849
+ * Document-class: Ferret::Index::FieldInfos
2850
+ *
2851
+ * == Summary
2852
+ *
2853
+ * The FieldInfos class holds all the field descriptors for an index. It is
2854
+ * this class that is used to create a new index using the
2855
+ * FieldInfos#create_index method. If you are happy with the default
2856
+ * properties for FieldInfo then you don't need to worry about this class.
2857
+ * IndexWriter can create the index for you. Otherwise you should set up the
2858
+ * index like in the example;
2859
+ *
2860
+ * == Example
2861
+ *
2862
+ * field_infos = FieldInfos.new(:term_vector => :no)
2863
+ *
2864
+ * field_infos.add_field(:title, :index => :untokenized, :term_vector => :no,
2865
+ * :boost => 10.0)
2866
+ *
2867
+ * field_infos.add_field(:content)
2868
+ *
2869
+ * field_infos.add_field(:created_on, :index => :untokenized_omit_norms,
2870
+ * :term_vector => :no)
2871
+ *
2872
+ * field_infos.create_index("/path/to/index")
2873
+ *
2874
+ * == Default Properties
2875
+ *
2876
+ * See FieldInfo for the available field property values.
2877
+ *
2878
+ * When you create the FieldInfos object you specify the default properties
2879
+ * for the fields. Often you'll specify all of the fields in the index before
2880
+ * you create the index so the default values won't come into play. However,
2881
+ * it is possible to continue to dynamically add fields as indexing goes
2882
+ * along. If you add a document to the index which has fields that the index
2883
+ * doesn't know about then the default properties are used for the new field.
2884
+ */
2885
+ static void
2886
+ Init_FieldInfos(void)
2887
+ {
2888
+ Init_FieldInfo();
2889
+
2890
+ cFieldInfos = rb_define_class_under(mIndex, "FieldInfos", rb_cObject);
2891
+ rb_define_alloc_func(cFieldInfos, frb_data_alloc);
2892
+
2893
+ rb_define_method(cFieldInfos, "initialize", frb_fis_init, -1);
2894
+ rb_define_method(cFieldInfos, "to_a", frb_fis_to_a, 0);
2895
+ rb_define_method(cFieldInfos, "[]", frb_fis_get, 1);
2896
+ rb_define_method(cFieldInfos, "add", frb_fis_add, 1);
2897
+ rb_define_method(cFieldInfos, "<<", frb_fis_add, 1);
2898
+ rb_define_method(cFieldInfos, "add_field", frb_fis_add_field, -1);
2899
+ rb_define_method(cFieldInfos, "each", frb_fis_each, 0);
2900
+ rb_define_method(cFieldInfos, "to_s", frb_fis_to_s, 0);
2901
+ rb_define_method(cFieldInfos, "size", frb_fis_size, 0);
2902
+ rb_define_method(cFieldInfos, "create_index",
2903
+ frb_fis_create_index, 1);
2904
+ rb_define_method(cFieldInfos, "fields", frb_fis_get_fields, 0);
2905
+ rb_define_method(cFieldInfos, "tokenized_fields", frb_fis_get_tk_fields, 0);
2906
+ }
2907
+
2908
+ /*
2909
+ * Document-class: Ferret::Index::TermEnum
2910
+ *
2911
+ * == Summary
2912
+ *
2913
+ * The TermEnum object is used to iterate through the terms in a field. To
2914
+ * get a TermEnum you need to use the IndexReader#terms(field) method.
2915
+ *
2916
+ * == Example
2917
+ *
2918
+ * te = index_reader.terms(:content)
2919
+ *
2920
+ * te.each {|term, doc_freq| puts "#{term} occurred #{doc_freq} times" }
2921
+ *
2922
+ * # or you could do it like this;
2923
+ * te = index_reader.terms(:content)
2924
+ *
2925
+ * while te.next?
2926
+ * puts "#{te.term} occurred in #{te.doc_freq} documents in the index"
2927
+ * end
2928
+ */
2929
+ static void
2930
+ Init_TermEnum(void)
2931
+ {
2932
+ id_term = rb_intern("@term");
2933
+
2934
+ cTermEnum = rb_define_class_under(mIndex, "TermEnum", rb_cObject);
2935
+ rb_define_alloc_func(cTermEnum, frb_data_alloc);
2936
+
2937
+ rb_define_method(cTermEnum, "next?", frb_te_next, 0);
2938
+ rb_define_method(cTermEnum, "term", frb_te_term, 0);
2939
+ rb_define_method(cTermEnum, "doc_freq", frb_te_doc_freq, 0);
2940
+ rb_define_method(cTermEnum, "skip_to", frb_te_skip_to, 1);
2941
+ rb_define_method(cTermEnum, "each", frb_te_each, 0);
2942
+ rb_define_method(cTermEnum, "field=", frb_te_set_field, 1);
2943
+ rb_define_method(cTermEnum, "set_field",frb_te_set_field, 1);
2944
+ rb_define_method(cTermEnum, "to_json", frb_te_to_json, -1);
2945
+ }
2946
+
2947
+ /*
2948
+ * Document-class: Ferret::Index::TermDocEnum
2949
+ *
2950
+ * == Summary
2951
+ *
2952
+ * Use a TermDocEnum to iterate through the documents that contain a
2953
+ * particular term. You can also iterate through the positions at which the
2954
+ * term occurs in a document.
2955
+ *
2956
+ *
2957
+ * == Example
2958
+ *
2959
+ * tde = index_reader.term_docs_for(:content, "fox")
2960
+ *
2961
+ * tde.each do |doc_id, freq|
2962
+ * puts "fox appeared #{freq} times in document #{doc_id}:"
2963
+ * positions = []
2964
+ * tde.each_position {|pos| positions << pos}
2965
+ * puts " #{positions.join(', ')}"
2966
+ * end
2967
+ *
2968
+ * # or you can do it like this;
2969
+ * tde.seek(:title, "red")
2970
+ * while tde.next?
2971
+ * puts "red appeared #{tde.freq} times in document #{tde.doc}:"
2972
+ * positions = []
2973
+ * while pos = tde.next_position
2974
+ * positions << pos
2975
+ * end
2976
+ * puts " #{positions.join(', ')}"
2977
+ * end
2978
+ */
2979
+ static void
2980
+ Init_TermDocEnum(void)
2981
+ {
2982
+ id_fld_num_map = rb_intern("@field_num_map");
2983
+ id_field_num = rb_intern("@field_num");
2984
+
2985
+ cTermDocEnum = rb_define_class_under(mIndex, "TermDocEnum", rb_cObject);
2986
+ rb_define_alloc_func(cTermDocEnum, frb_data_alloc);
2987
+ rb_define_method(cTermDocEnum, "seek", frb_tde_seek, 2);
2988
+ rb_define_method(cTermDocEnum, "seek_term_enum", frb_tde_seek_te, 1);
2989
+ rb_define_method(cTermDocEnum, "doc", frb_tde_doc, 0);
2990
+ rb_define_method(cTermDocEnum, "freq", frb_tde_freq, 0);
2991
+ rb_define_method(cTermDocEnum, "next?", frb_tde_next, 0);
2992
+ rb_define_method(cTermDocEnum, "next_position", frb_tde_next_position, 0);
2993
+ rb_define_method(cTermDocEnum, "each", frb_tde_each, 0);
2994
+ rb_define_method(cTermDocEnum, "each_position", frb_tde_each_position, 0);
2995
+ rb_define_method(cTermDocEnum, "skip_to", frb_tde_skip_to, 1);
2996
+ rb_define_method(cTermDocEnum, "to_json", frb_tde_to_json, -1);
2997
+ }
2998
+
2999
+ /* rdochack
3000
+ cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
3001
+ */
3002
+
3003
+ /*
3004
+ * Document-class: Ferret::Index::TermVector::TVOffsets
3005
+ *
3006
+ * == Summary
3007
+ *
3008
+ * Holds the start and end byte-offsets of a term in a field. For example, if
3009
+ * the field was "the quick brown fox" then the start and end offsets of:
3010
+ *
3011
+ * ["the", "quick", "brown", "fox"]
3012
+ *
3013
+ * Would be:
3014
+ *
3015
+ * [(0,3), (4,9), (10,15), (16,19)]
3016
+ *
3017
+ * See the Analysis module for more information on setting the offsets.
3018
+ */
3019
+ static void
3020
+ Init_TVOffsets(void)
3021
+ {
3022
+ const char *tv_offsets_class = "TVOffsets";
3023
+ /* rdochack
3024
+ cTVOffsets = rb_define_class_under(cTermVector, "TVOffsets", rb_cObject);
3025
+ */
3026
+ cTVOffsets = rb_struct_define(tv_offsets_class, "start", "end", NULL);
3027
+ rb_set_class_path(cTVOffsets, cTermVector, tv_offsets_class);
3028
+ rb_const_set(mIndex, rb_intern(tv_offsets_class), cTVOffsets);
3029
+ }
3030
+
3031
+ /*
3032
+ * Document-class: Ferret::Index::TermVector::TVTerm
3033
+ *
3034
+ * == Summary
3035
+ *
3036
+ * The TVTerm class holds the term information for each term in a TermVector.
3037
+ * That is it holds the term's text and its positions in the document. You
3038
+ * can use those positions to reference the offsets for the term.
3039
+ *
3040
+ * == Example
3041
+ *
3042
+ * tv = index_reader.term_vector(doc_id, :content)
3043
+ * tv_term = tv.terms.find {|tvt| tvt.text == "fox"}
3044
+ * offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
3045
+ */
3046
+ static void
3047
+ Init_TVTerm(void)
3048
+ {
3049
+ const char *tv_term_class = "TVTerm";
3050
+ /* rdochack
3051
+ cTVTerm = rb_define_class_under(cTermVector, "TVTerm", rb_cObject);
3052
+ */
3053
+ cTVTerm = rb_struct_define(tv_term_class, "text", "freq", "positions", NULL);
3054
+ rb_set_class_path(cTVTerm, cTermVector, tv_term_class);
3055
+ rb_const_set(mIndex, rb_intern(tv_term_class), cTVTerm);
3056
+ }
3057
+
3058
+ /*
3059
+ * Document-class: Ferret::Index::TermVector
3060
+ *
3061
+ * == Summary
3062
+ *
3063
+ * TermVectors are most commonly used for creating search result excerpts and
3064
+ * highlighting search matches in results. This is all done internally so you
3065
+ * won't need to worry about the TermVector object. There are some other
3066
+ * reasons you may want to use the TermVectors object however. For example,
3067
+ * you may wish to see which terms are the most commonly occurring terms in a
3068
+ * document to implement a MoreLikeThis search.
3069
+ *
3070
+ * == Example
3071
+ *
3072
+ * tv = index_reader.term_vector(doc_id, :content)
3073
+ * tv_term = tv.terms.find {|tvt| tvt.text == "fox"}
3074
+ *
3075
+ * # get the term frequency
3076
+ * term_freq = tv_term.positions.size
3077
+ *
3078
+ * # get the offsets for a term
3079
+ * offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
3080
+ *
3081
+ * == Note
3082
+ *
3083
+ * +positions+ and +offsets+ can be +nil+ depending on what you set the
3084
+ * +:term_vector+ to when you set the FieldInfo object for the field. Note in
3085
+ * particular that you need to store both positions and offsets if you want
3086
+ * to associate offsets with particular terms.
3087
+ */
3088
+ static void
3089
+ Init_TermVector(void)
3090
+ {
3091
+ const char *tv_class = "TermVector";
3092
+ /* rdochack
3093
+ cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
3094
+ */
3095
+ cTermVector = rb_struct_define(tv_class,
3096
+ "field", "terms", "offsets", NULL);
3097
+ rb_set_class_path(cTermVector, mIndex, tv_class);
3098
+ rb_const_set(mIndex, rb_intern(tv_class), cTermVector);
3099
+
3100
+ Init_TVOffsets();
3101
+ Init_TVTerm();
3102
+ }
3103
+
3104
+ /*
3105
+ * Document-class: Ferret::Index::IndexWriter
3106
+ *
3107
+ * == Summary
3108
+ *
3109
+ * The IndexWriter is the class used to add documents to an index. You can
3110
+ * also delete documents from the index using this class. The indexing
3111
+ * process is highly customizable and the IndexWriter has the following
3112
+ * parameters;
3113
+ *
3114
+ * dir:: This is a Ferret::Store::Directory object. You
3115
+ * should either pass a +:dir+ or a +:path+ when
3116
+ * creating an index.
3117
+ * path:: A string representing the path to the index
3118
+ * directory. If you are creating the index for the
3119
+ * first time the directory will be created if it's
3120
+ * missing. You should not choose a directory which
3121
+ * contains other files as they could be over-written.
3122
+ * To protect against this set +:create_if_missing+ to
3123
+ * false.
3124
+ * create_if_missing:: Default: true. Create the index if no index is
3125
+ * found in the specified directory. Otherwise, use
3126
+ * the existing index.
3127
+ * create:: Default: false. Creates the index, even if one
3128
+ * already exists. That means any existing index will
3129
+ * be deleted. It is probably better to use the
3130
+ * create_if_missing option so that the index is only
3131
+ * created the first time when it doesn't exist.
3132
+ * field_infos:: Default FieldInfos.new. The FieldInfos object to use
3133
+ * when creating a new index if +:create_if_missing+ or
3134
+ * +:create+ is set to true. If an existing index is
3135
+ * opened then this parameter is ignored.
3136
+ * analyzer:: Default: Ferret::Analysis::StandardAnalyzer.
3137
+ * Sets the default analyzer for the index. This is
3138
+ * used by both the IndexWriter and the QueryParser
3139
+ * to tokenize the input. The default is the
3140
+ * StandardAnalyzer.
3141
+ * chunk_size:: Default: 0x100000 or 1Mb. Memory performance tuning
3142
+ * parameter. Sets the default size of chunks of memory
3143
+ * malloced for use during indexing. You can usually
3144
+ * leave this parameter as is.
3145
+ * max_buffer_memory:: Default: 0x1000000 or 16Mb. Memory performance
3146
+ * tuning parameter. Sets the amount of memory to be
3147
+ * used by the indexing process. Set to a larger value
3148
+ * to increase indexing speed. Note that this only
3149
+ * includes memory used by the indexing process, not
3150
+ * the rest of your ruby application.
3151
+ * term_index_interval:: Default: 128. The skip interval between terms in the
3152
+ * term dictionary. A smaller value will possibly
3153
+ * increase search performance while also increasing
3154
+ * memory usage and negatively impacting
3155
+ * indexing performance.
3156
+ * doc_skip_interval:: Default: 16. The skip interval for document numbers
3157
+ * in the index. As with +:term_index_interval+ you
3158
+ * have a trade-off. A smaller number may increase
3159
+ * search performance while also increasing memory
3160
+ * usage and negatively impacting indexing
3161
+ * performance.
3162
+ * merge_factor:: Default: 10. This must never be less than 2.
3163
+ * Specifies the number of segments of a certain size
3164
+ * that must exist before they are merged. A larger
3165
+ * value will improve indexing performance while
3166
+ * slowing search performance.
3167
+ * max_buffered_docs:: Default: 10000. The maximum number of documents that
3168
+ * may be stored in memory before being written to the
3169
+ * index. If you have a lot of memory and are indexing
3170
+ * a large number of small documents (like products in
3171
+ * a product database for example) you may want to set
3172
+ * this to a much higher number (like
3173
+ * Ferret::FIX_INT_MAX). If you are worried about your
3174
+ * application crashing in the middle of indexing you
3175
+ * might set this to a smaller number so that the index
3176
+ * is committed more often. This is like having an
3177
+ * auto-save in a word processor application.
3178
+ * max_merge_docs:: Set this value to limit the number of documents that
3179
+ * go into a single segment. Use this to avoid
3180
+ * extremely long merge times during indexing which can
3181
+ * make your application seem unresponsive. This is
3182
+ * only necessary for very large indexes (millions of
3183
+ * documents).
3184
+ * max_field_length:: Default: 10000. The maximum number of terms added to
3185
+ * a single field. This can be useful to protect the
3186
+ * indexer when indexing documents from the web for
3187
+ * example. Usually the most important terms will occur
3188
+ * early on in a document so you can often safely
3189
+ * ignore the terms in a field after a certain number
3190
+ * of them. If you wanted to speed up indexing and save
3191
+ * space in your index you may only want to index the
3192
+ * first 1000 terms in a field. On the other hand, if
3193
+ * you want to be more thorough and you are indexing
3194
+ * documents from your file-system you may set this
3195
+ * parameter to Ferret::FIX_INT_MAX.
3196
+ * use_compound_file:: Default: true. Uses a compound file to store the
3197
+ * index. This prevents an error being raised for
3198
+ * having too many files open at the same time. The
3199
+ * default is true but performance is better if this is
3200
+ * set to false.
3201
+ *
3202
+ *
3203
+ * === Deleting Documents
3204
+ *
3205
+ * Both IndexReader and IndexWriter allow you to delete documents. You should
3205
+ * use the IndexReader to delete documents by document id and IndexWriter to
3206
+ * delete documents by term which we'll explain now. It is preferable to
3207
+ * delete documents from an index using IndexWriter for performance reasons.
3208
+ * To delete documents using the IndexWriter you should give each document in
3210
+ * the index a unique ID. If you are indexing documents from the file-system
3211
+ * this unique ID will be the full file path. If indexing documents from the
3212
+ * database you should use the primary key as the ID field. You can then
3213
+ * use the delete method to delete a file referenced by the ID. For example;
3214
+ *
3215
+ * index_writer.delete(:id, "/path/to/indexed/file")
3216
+ */
3217
+ void
3218
+ Init_IndexWriter(void)
3219
+ {
3220
+ id_boost = rb_intern("boost");
3221
+
3222
+ sym_create = ID2SYM(rb_intern("create"));
3223
+ sym_create_if_missing = ID2SYM(rb_intern("create_if_missing"));
3224
+ sym_field_infos = ID2SYM(rb_intern("field_infos"));
3225
+
3226
+ sym_chunk_size = ID2SYM(rb_intern("chunk_size"));
3227
+ sym_max_buffer_memory = ID2SYM(rb_intern("max_buffer_memory"));
3228
+ sym_index_interval = ID2SYM(rb_intern("term_index_interval"));
3229
+ sym_skip_interval = ID2SYM(rb_intern("doc_skip_interval"));
3230
+ sym_merge_factor = ID2SYM(rb_intern("merge_factor"));
3231
+ sym_max_buffered_docs = ID2SYM(rb_intern("max_buffered_docs"));
3232
+ sym_max_merge_docs = ID2SYM(rb_intern("max_merge_docs"));
3233
+ sym_max_field_length = ID2SYM(rb_intern("max_field_length"));
3234
+ sym_use_compound_file = ID2SYM(rb_intern("use_compound_file"));
3235
+
3236
+ cIndexWriter = rb_define_class_under(mIndex, "IndexWriter", rb_cObject);
3237
+ rb_define_alloc_func(cIndexWriter, frb_data_alloc);
3238
+
3239
+ rb_define_const(cIndexWriter, "WRITE_LOCK_TIMEOUT", INT2FIX(1));
3240
+ rb_define_const(cIndexWriter, "COMMIT_LOCK_TIMEOUT", INT2FIX(10));
3241
+ rb_define_const(cIndexWriter, "WRITE_LOCK_NAME",
3242
+ rb_str_new2(FRT_WRITE_LOCK_NAME));
3243
+ rb_define_const(cIndexWriter, "COMMIT_LOCK_NAME",
3244
+ rb_str_new2(FRT_COMMIT_LOCK_NAME));
3245
+ rb_define_const(cIndexWriter, "DEFAULT_CHUNK_SIZE",
3246
+ INT2FIX(frt_default_config.chunk_size));
3247
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_BUFFER_MEMORY",
3248
+ INT2FIX(frt_default_config.max_buffer_memory));
3249
+ rb_define_const(cIndexWriter, "DEFAULT_TERM_INDEX_INTERVAL",
3250
+ INT2FIX(frt_default_config.index_interval));
3251
+ rb_define_const(cIndexWriter, "DEFAULT_DOC_SKIP_INTERVAL",
3252
+ INT2FIX(frt_default_config.skip_interval));
3253
+ rb_define_const(cIndexWriter, "DEFAULT_MERGE_FACTOR",
3254
+ INT2FIX(frt_default_config.merge_factor));
3255
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_BUFFERED_DOCS",
3256
+ INT2FIX(frt_default_config.max_buffered_docs));
3257
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_MERGE_DOCS",
3258
+ INT2FIX(frt_default_config.max_merge_docs));
3259
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_FIELD_LENGTH",
3260
+ INT2FIX(frt_default_config.max_field_length));
3261
+ rb_define_const(cIndexWriter, "DEFAULT_USE_COMPOUND_FILE",
3262
+ frt_default_config.use_compound_file ? Qtrue : Qfalse);
3263
+
3264
+ rb_define_method(cIndexWriter, "initialize", frb_iw_init, -1);
3265
+ rb_define_method(cIndexWriter, "doc_count", frb_iw_get_doc_count, 0);
3266
+ rb_define_method(cIndexWriter, "close", frb_iw_close, 0);
3267
+ rb_define_method(cIndexWriter, "add_document", frb_iw_add_doc, 1);
3268
+ rb_define_method(cIndexWriter, "<<", frb_iw_add_doc, 1);
3269
+ rb_define_method(cIndexWriter, "optimize", frb_iw_optimize, 0);
3270
+ rb_define_method(cIndexWriter, "commit", frb_iw_commit, 0);
3271
+ rb_define_method(cIndexWriter, "add_readers", frb_iw_add_readers, 1);
3272
+ rb_define_method(cIndexWriter, "delete", frb_iw_delete, 2);
3273
+ rb_define_method(cIndexWriter, "field_infos", frb_iw_field_infos, 0);
3274
+ rb_define_method(cIndexWriter, "analyzer", frb_iw_get_analyzer, 0);
3275
+ rb_define_method(cIndexWriter, "analyzer=", frb_iw_set_analyzer, 1);
3276
+ rb_define_method(cIndexWriter, "version", frb_iw_version, 0);
3277
+
3278
+ rb_define_method(cIndexWriter, "chunk_size",
3279
+ frb_iw_get_chunk_size, 0);
3280
+ rb_define_method(cIndexWriter, "chunk_size=",
3281
+ frb_iw_set_chunk_size, 1);
3282
+
3283
+ rb_define_method(cIndexWriter, "max_buffer_memory",
3284
+ frb_iw_get_max_buffer_memory, 0);
3285
+ rb_define_method(cIndexWriter, "max_buffer_memory=",
3286
+ frb_iw_set_max_buffer_memory, 1);
3287
+
3288
+ rb_define_method(cIndexWriter, "term_index_interval",
3289
+ frb_iw_get_index_interval, 0);
3290
+ rb_define_method(cIndexWriter, "term_index_interval=",
3291
+ frb_iw_set_index_interval, 1);
3292
+
3293
+ rb_define_method(cIndexWriter, "doc_skip_interval",
3294
+ frb_iw_get_skip_interval, 0);
3295
+ rb_define_method(cIndexWriter, "doc_skip_interval=",
3296
+ frb_iw_set_skip_interval, 1);
3297
+
3298
+ rb_define_method(cIndexWriter, "merge_factor",
3299
+ frb_iw_get_merge_factor, 0);
3300
+ rb_define_method(cIndexWriter, "merge_factor=",
3301
+ frb_iw_set_merge_factor, 1);
3302
+
3303
+ rb_define_method(cIndexWriter, "max_buffered_docs",
3304
+ frb_iw_get_max_buffered_docs, 0);
3305
+ rb_define_method(cIndexWriter, "max_buffered_docs=",
3306
+ frb_iw_set_max_buffered_docs, 1);
3307
+
3308
+ rb_define_method(cIndexWriter, "max_merge_docs",
3309
+ frb_iw_get_max_merge_docs, 0);
3310
+ rb_define_method(cIndexWriter, "max_merge_docs=",
3311
+ frb_iw_set_max_merge_docs, 1);
3312
+
3313
+ rb_define_method(cIndexWriter, "max_field_length",
3314
+ frb_iw_get_max_field_length, 0);
3315
+ rb_define_method(cIndexWriter, "max_field_length=",
3316
+ frb_iw_set_max_field_length, 1);
3317
+
3318
+ rb_define_method(cIndexWriter, "use_compound_file",
3319
+ frb_iw_get_use_compound_file, 0);
3320
+ rb_define_method(cIndexWriter, "use_compound_file=",
3321
+ frb_iw_set_use_compound_file, 1);
3322
+
3323
+ }
3324
+
3325
+ /*
3326
+ * Document-class: Ferret::Index::LazyDoc
3327
+ *
3328
+ * == Summary
3329
+ *
3330
+ * When a document is retrieved from the index a LazyDoc is returned.
3331
+ * Actually, LazyDoc is just a modified Hash object which lazily adds fields
3332
+ * to itself when they are accessed. You should note that the keys method
3333
+ * will return nothing until you actually access one of the fields. To see
3334
+ * what fields are available use LazyDoc#fields rather than LazyDoc#keys. To
3335
+ * load all fields use the LazyDoc#load method.
3336
+ *
3337
+ * == Example
3338
+ *
3339
+ * doc = index_reader[0]
3340
+ *
3341
+ * doc.keys #=> []
3342
+ * doc.values #=> []
3343
+ * doc.fields #=> [:title, :content]
3344
+ *
3345
+ * title = doc[:title] #=> "the title"
3346
+ * doc.keys #=> [:title]
3347
+ * doc.values #=> ["the title"]
3348
+ * doc.fields #=> [:title, :content]
3349
+ *
3350
+ * doc.load
3351
+ * doc.keys #=> [:title, :content]
3352
+ * doc.values #=> ["the title", "the content"]
3353
+ * doc.fields #=> [:title, :content]
3354
+ */
3355
+ void
3356
+ Init_LazyDoc(void)
3357
+ {
3358
+ id_fields = rb_intern("@fields");
3359
+
3360
+ cLazyDoc = rb_define_class_under(mIndex, "LazyDoc", rb_cHash);
3361
+ rb_define_method(cLazyDoc, "default", frb_lzd_default, 1);
3362
+ rb_define_method(cLazyDoc, "load", frb_lzd_load, 0);
3363
+ rb_define_method(cLazyDoc, "fields", frb_lzd_fields, 0);
3364
+
3365
+ cLazyDocData = rb_define_class_under(cLazyDoc, "LazyDocData", rb_cObject);
3366
+ rb_define_alloc_func(cLazyDocData, frb_data_alloc);
3367
+ }
3368
+
3369
+ /*
3370
+ * Document-class: Ferret::Index::IndexReader
3371
+ *
3372
+ * == Summary
3373
+ *
3374
+ * IndexReader is used for reading data from the index. This class is usually
3375
+ * used directly for more advanced tasks like iterating through terms in an
3376
+ * index, accessing term-vectors or deleting documents by document id. It is
3377
+ * also used internally by IndexSearcher.
3378
+ */
3379
+ void
3380
+ Init_IndexReader(void)
3381
+ {
3382
+ cIndexReader = rb_define_class_under(mIndex, "IndexReader", rb_cObject);
3383
+ rb_define_alloc_func(cIndexReader, frb_data_alloc);
3384
+ rb_define_method(cIndexReader, "initialize", frb_ir_init, 1);
3385
+ rb_define_method(cIndexReader, "set_norm", frb_ir_set_norm, 3);
3386
+ rb_define_method(cIndexReader, "norms", frb_ir_norms, 1);
3387
+ rb_define_method(cIndexReader, "get_norms_into",frb_ir_get_norms_into, 3);
3388
+ rb_define_method(cIndexReader, "commit", frb_ir_commit, 0);
3389
+ rb_define_method(cIndexReader, "close", frb_ir_close, 0);
3390
+ rb_define_method(cIndexReader, "has_deletions?",frb_ir_has_deletions, 0);
3391
+ rb_define_method(cIndexReader, "delete", frb_ir_delete, 1);
3392
+ rb_define_method(cIndexReader, "deleted?", frb_ir_is_deleted, 1);
3393
+ rb_define_method(cIndexReader, "max_doc", frb_ir_max_doc, 0);
3394
+ rb_define_method(cIndexReader, "num_docs", frb_ir_num_docs, 0);
3395
+ rb_define_method(cIndexReader, "undelete_all", frb_ir_undelete_all, 0);
3396
+ rb_define_method(cIndexReader, "latest?", frb_ir_is_latest, 0);
3397
+ rb_define_method(cIndexReader, "get_document", frb_ir_get_doc, -1);
3398
+ rb_define_method(cIndexReader, "[]", frb_ir_get_doc, -1);
3399
+ rb_define_method(cIndexReader, "term_vector", frb_ir_term_vector, 2);
3400
+ rb_define_method(cIndexReader, "term_vectors", frb_ir_term_vectors, 1);
3401
+ rb_define_method(cIndexReader, "term_docs", frb_ir_term_docs, 0);
3402
+ rb_define_method(cIndexReader, "term_positions",frb_ir_term_positions, 0);
3403
+ rb_define_method(cIndexReader, "term_docs_for", frb_ir_term_docs_for, 2);
3404
+ rb_define_method(cIndexReader, "term_positions_for", frb_ir_t_pos_for, 2);
3405
+ rb_define_method(cIndexReader, "doc_freq", frb_ir_doc_freq, 2);
3406
+ rb_define_method(cIndexReader, "terms", frb_ir_terms, 1);
3407
+ rb_define_method(cIndexReader, "terms_from", frb_ir_terms_from, 2);
3408
+ rb_define_method(cIndexReader, "term_count", frb_ir_term_count, 1);
3409
+ rb_define_method(cIndexReader, "fields", frb_ir_fields, 0);
3410
+ rb_define_method(cIndexReader, "field_names", frb_ir_fields, 0);
3411
+ rb_define_method(cIndexReader, "field_infos", frb_ir_field_infos, 0);
3412
+ rb_define_method(cIndexReader, "tokenized_fields", frb_ir_tk_fields, 0);
3413
+ rb_define_method(cIndexReader, "version", frb_ir_version, 0);
3414
+ }
3415
+
3416
+ /* rdoc hack
3417
+ extern VALUE mFerret = rb_define_module("Ferret");
3418
+ */
3419
+
3420
+ /*
3421
+ * Document-module: Ferret::Index
3422
+ *
3423
+ * == Summary
3424
+ *
3425
+ * The Index module contains all the classes used for adding to and
3426
+ * retrieving from the index. The important classes to know about are;
3427
+ *
3428
+ * * FieldInfo
3429
+ * * FieldInfos
3430
+ * * IndexWriter
3431
+ * * IndexReader
3432
+ * * LazyDoc
3433
+ *
3434
+ * The other classes in this module are useful for more advanced uses like
3435
+ * building tag clouds, creating more-like-this queries, custom highlighting
3436
+ * etc. They are also useful for index browsers.
3437
+ */
3438
+ void
3439
+ Init_Index(void)
3440
+ {
3441
+ mIndex = rb_define_module_under(mFerret, "Index");
3442
+
3443
+ sym_boost = ID2SYM(rb_intern("boost"));
3444
+ sym_analyzer = ID2SYM(rb_intern("analyzer"));
3445
+ sym_close_dir = ID2SYM(rb_intern("close_dir"));
3446
+ fsym_content = rb_intern("content");
3447
+
3448
+ Init_TermVector();
3449
+ Init_TermEnum();
3450
+ Init_TermDocEnum();
3451
+
3452
+ Init_FieldInfos();
3453
+
3454
+ Init_LazyDoc();
3455
+ Init_IndexWriter();
3456
+ Init_IndexReader();
3457
+ }