ferret 0.9.6 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/r_index.c ADDED
@@ -0,0 +1,3049 @@
1
+ #include "ferret.h"
2
+ #include "index.h"
3
+ #include <st.h>
4
+
5
+ VALUE mIndex;
6
+
7
+ VALUE cFieldInfo;
8
+ VALUE cFieldInfos;
9
+
10
+ VALUE cTVOffsets;
11
+ VALUE cTVTerm;
12
+ VALUE cTermVector;
13
+
14
+ VALUE cTermEnum;
15
+ VALUE cTermDocEnum;
16
+
17
+ VALUE cLazyDoc;
18
+ VALUE cLazyDocData;
19
+ VALUE cIndexWriter;
20
+ VALUE cIndexReader;
21
+
22
+ VALUE sym_analyzer;
23
+ static VALUE sym_close_dir;
24
+ static VALUE sym_create;
25
+ static VALUE sym_create_if_missing;
26
+
27
+ static VALUE sym_chunk_size;
28
+ static VALUE sym_max_buffer_memory;
29
+ static VALUE sym_index_interval;
30
+ static VALUE sym_skip_interval;
31
+ static VALUE sym_merge_factor;
32
+ static VALUE sym_max_buffered_docs;
33
+ static VALUE sym_max_merge_docs;
34
+ static VALUE sym_max_field_length;
35
+ static VALUE sym_use_compound_file;
36
+
37
+ static VALUE sym_boost;
38
+ static VALUE sym_field_infos;
39
+
40
+ static VALUE sym_store;
41
+ static VALUE sym_index;
42
+ static VALUE sym_term_vector;
43
+
44
+ static VALUE sym_compress;
45
+ static VALUE sym_compressed;
46
+
47
+ static VALUE sym_untokenized;
48
+ static VALUE sym_omit_norms;
49
+ static VALUE sym_untokenized_omit_norms;
50
+
51
+ static VALUE sym_with_positions;
52
+ static VALUE sym_with_offsets;
53
+ static VALUE sym_with_positions_offsets;
54
+
55
+ static ID id_term;
56
+ static ID id_fields;
57
+ static ID id_fld_num_map;
58
+ static ID id_field_num;
59
+ static ID id_boost;
60
+
61
+ extern void frt_set_term(VALUE rterm, Term *t);
62
+ extern Analyzer *frt_get_cwrapped_analyzer(VALUE ranalyzer);
63
+ extern VALUE frt_get_analyzer(Analyzer *a);
64
+
65
+ /****************************************************************************
66
+ *
67
+ * FieldInfo Methods
68
+ *
69
+ ****************************************************************************/
70
+
71
+ static void
72
+ frt_fi_free(void *p)
73
+ {
74
+ object_del(p);
75
+ fi_deref((FieldInfo *)p);
76
+ }
77
+
78
+ static void
79
+ frt_fi_get_params(VALUE roptions,
80
+ enum StoreValues *store,
81
+ enum IndexValues *index,
82
+ enum TermVectorValues *term_vector,
83
+ float *boost)
84
+ {
85
+ VALUE v;
86
+ Check_Type(roptions, T_HASH);
87
+ v = rb_hash_aref(roptions, sym_boost);
88
+ if (Qnil != v) {
89
+ *boost = (float)NUM2DBL(v);
90
+ } else {
91
+ *boost = 1.0f;
92
+ }
93
+ v = rb_hash_aref(roptions, sym_store);
94
+ if (Qnil != v) Check_Type(v, T_SYMBOL);
95
+ if (v == sym_no || v == sym_false || v == Qfalse) {
96
+ *store = STORE_NO;
97
+ } else if (v == sym_yes || v == sym_true || v == Qtrue || v == Qnil) {
98
+ *store = STORE_YES;
99
+ } else if (v == sym_compress || v == sym_compressed) {
100
+ *store = STORE_COMPRESS;
101
+ } else {
102
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for :store."
103
+ " Please choose from [:yes, :no, :compressed]",
104
+ rb_id2name(SYM2ID(v)));
105
+ }
106
+
107
+ v = rb_hash_aref(roptions, sym_index);
108
+ if (Qnil != v) Check_Type(v, T_SYMBOL);
109
+ if (v == sym_no || v == sym_false || v == Qfalse) {
110
+ *index = INDEX_NO;
111
+ } else if (v == sym_yes || v == sym_true || v == Qtrue || v == Qnil) {
112
+ *index = INDEX_YES;
113
+ } else if (v == sym_untokenized) {
114
+ *index = INDEX_UNTOKENIZED;
115
+ } else if (v == sym_omit_norms) {
116
+ *index = INDEX_YES_OMIT_NORMS;
117
+ } else if (v == sym_untokenized_omit_norms) {
118
+ *index = INDEX_UNTOKENIZED_OMIT_NORMS;
119
+ } else {
120
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for :index."
121
+ " Please choose from [:no, :yes, :untokenized, "
122
+ ":omit_norms, :untokenized_omit_norms]",
123
+ rb_id2name(SYM2ID(v)));
124
+ }
125
+
126
+ v = rb_hash_aref(roptions, sym_term_vector);
127
+ if (Qnil != v) Check_Type(v, T_SYMBOL);
128
+ if (v == sym_no || v == sym_false || v == Qfalse) {
129
+ *term_vector = TERM_VECTOR_NO;
130
+ } else if (v == sym_yes || v == sym_true || v == Qtrue) {
131
+ *term_vector = TERM_VECTOR_YES;
132
+ } else if (v == sym_with_positions) {
133
+ *term_vector = TERM_VECTOR_WITH_POSITIONS;
134
+ } else if (v == sym_with_offsets) {
135
+ *term_vector = TERM_VECTOR_WITH_OFFSETS;
136
+ } else if (v == sym_with_positions_offsets || v == Qnil) {
137
+ *term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
138
+ } else {
139
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for "
140
+ ":term_vector. Please choose from [:no, :yes, "
141
+ ":with_positions, :with_offsets, "
142
+ ":with_positions_offsets]",
143
+ rb_id2name(SYM2ID(v)));
144
+ }
145
+ }
146
+
147
+ static VALUE
148
+ frt_get_field_info(FieldInfo *fi)
149
+ {
150
+
151
+ VALUE rfi = Qnil;
152
+ if (fi) {
153
+ rfi = object_get(fi);
154
+ if (rfi == Qnil) {
155
+ rfi = Data_Wrap_Struct(cFieldInfo, NULL, &frt_fi_free, fi);
156
+ REF(fi);
157
+ object_add(fi, rfi);
158
+ }
159
+ }
160
+ return rfi;
161
+ }
162
+
163
+ /*
164
+ * call-seq:
165
+ * FieldInfo.new(name, options = {}) -> field_info
166
+ *
167
+ * Create a new FieldInfo object with the name +name+ and the properties
168
+ * specified in +options+. The available options are [:store, :index,
169
+ * :term_vector, :boost]. See the description of FieldInfo for more
170
+ * information on these properties.
171
+ */
172
+ static VALUE
173
+ frt_fi_init(int argc, VALUE *argv, VALUE self)
174
+ {
175
+ VALUE roptions, rname;
176
+ FieldInfo *fi;
177
+ enum StoreValues store = STORE_YES;
178
+ enum IndexValues index = INDEX_YES;
179
+ enum TermVectorValues term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
180
+ float boost = 1.0f;
181
+
182
+ rb_scan_args(argc, argv, "11", &rname, &roptions);
183
+ if (argc > 1) {
184
+ frt_fi_get_params(roptions, &store, &index, &term_vector, &boost);
185
+ }
186
+ fi = fi_new(frt_field(rname), store, index, term_vector);
187
+ fi->boost = boost;
188
+ Frt_Wrap_Struct(self, NULL, &frt_fi_free, fi);
189
+ object_add(fi, self);
190
+ return self;
191
+ }
192
+
193
+ /*
194
+ * call-seq:
195
+ * fi.stored? -> bool
196
+ *
197
+ * Return true if the field is stored in the index.
198
+ */
199
+ static VALUE
200
+ frt_fi_is_stored(VALUE self)
201
+ {
202
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
203
+ return fi_is_stored(fi) ? Qtrue : Qfalse;
204
+ }
205
+
206
+ /*
207
+ * call-seq:
208
+ * fi.compressed? -> bool
209
+ *
210
+ * Return true if the field is stored in the index in compressed format.
211
+ */
212
+ static VALUE
213
+ frt_fi_is_compressed(VALUE self)
214
+ {
215
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
216
+ return fi_is_compressed(fi) ? Qtrue : Qfalse;
217
+ }
218
+
219
+ /*
220
+ * call-seq:
221
+ * fi.indexed? -> bool
222
+ *
223
+ * Return true if the field is indexed, ie searchable in the index.
224
+ */
225
+ static VALUE
226
+ frt_fi_is_indexed(VALUE self)
227
+ {
228
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
229
+ return fi_is_indexed(fi) ? Qtrue : Qfalse;
230
+ }
231
+
232
+ /*
233
+ * call-seq:
234
+ * fi.tokenized? -> bool
235
+ *
236
+ * Return true if the field is tokenized. Tokenizing is the process of
237
+ * breaking the field up into tokens. That is "the quick brown fox" becomes
238
+ * ["the", "quick", "brown", "fox"]. This is only possible if the field is
239
+ * indexed.
240
+ */
241
+ static VALUE
242
+ frt_fi_is_tokenized(VALUE self)
243
+ {
244
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
245
+ return fi_is_tokenized(fi) ? Qtrue : Qfalse;
246
+ }
247
+
248
+ /*
249
+ * call-seq:
250
+ * fi.omit_norms? -> bool
251
+ *
252
+ * Return true if the field omits the norm file. The norm file is the file
253
+ * used to store the field boosts for an indexed field. If you do not boost
254
+ * any fields, and you can live without scoring based on field length then
255
+ * you can omit the norms file. This will give the index a slight performance
256
+ * boost and it will use less memory, especially for indexes which have a
257
+ * large number of documents.
258
+ */
259
+ static VALUE
260
+ frt_fi_omit_norms(VALUE self)
261
+ {
262
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
263
+ return fi_omit_norms(fi) ? Qtrue : Qfalse;
264
+ }
265
+
266
+ /*
267
+ * call-seq:
268
+ * fi.store_term_vector? -> bool
269
+ *
270
+ * Return true if the term-vectors are stored for this field.
271
+ */
272
+ static VALUE
273
+ frt_fi_store_term_vector(VALUE self)
274
+ {
275
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
276
+ return fi_store_term_vector(fi) ? Qtrue : Qfalse;
277
+ }
278
+
279
+ /*
280
+ * call-seq:
281
+ * fi.store_positions? -> bool
282
+ *
283
+ * Return true if positions are stored with the term-vectors for this field.
284
+ */
285
+ static VALUE
286
+ frt_fi_store_positions(VALUE self)
287
+ {
288
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
289
+ return fi_store_positions(fi) ? Qtrue : Qfalse;
290
+ }
291
+
292
+ /*
293
+ * call-seq:
294
+ * fi.store_offsets? -> bool
295
+ *
296
+ * Return true if offsets are stored with the term-vectors for this field.
297
+ */
298
+ static VALUE
299
+ frt_fi_store_offsets(VALUE self)
300
+ {
301
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
302
+ return fi_store_offsets(fi) ? Qtrue : Qfalse;
303
+ }
304
+
305
+ /*
306
+ * call-seq:
307
+ * fi.has_norms? -> bool
308
+ *
309
+ * Return true if this field has a norms file. This is the same as calling;
310
+ *
311
+ * fi.indexed? and not fi.omit_norms?
312
+ */
313
+ static VALUE
314
+ frt_fi_has_norms(VALUE self)
315
+ {
316
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
317
+ return fi_has_norms(fi) ? Qtrue : Qfalse;
318
+ }
319
+
320
+ /*
321
+ * call-seq:
322
+ * fi.boost -> boost
323
+ *
324
+ * Return the default boost for this field
325
+ */
326
+ static VALUE
327
+ frt_fi_boost(VALUE self)
328
+ {
329
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
330
+ return rb_float_new((double)fi->boost);
331
+ }
332
+
333
+ /*
334
+ * call-seq:
335
+ * fi.to_s -> string
336
+ *
337
+ * Return a string representation of the FieldInfo object.
338
+ */
339
+ static VALUE
340
+ frt_fi_to_s(VALUE self)
341
+ {
342
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
343
+ char *fi_s = fi_to_s(fi);
344
+ VALUE rfi_s = rb_str_new2(fi_s);
345
+ free(fi_s);
346
+ return rfi_s;
347
+ }
348
+
349
+ /****************************************************************************
350
+ *
351
+ * FieldInfos Methods
352
+ *
353
+ ****************************************************************************/
354
+
355
+ static void
356
+ frt_fis_free(void *p)
357
+ {
358
+ object_del(p);
359
+ fis_deref((FieldInfos *)p);
360
+ }
361
+
362
+ static void
363
+ frt_fis_mark(void *p)
364
+ {
365
+ int i;
366
+ FieldInfos *fis = (FieldInfos *)p;
367
+
368
+ for (i = 0; i < fis->size; i++) {
369
+ frt_gc_mark(fis->fields[i]);
370
+ }
371
+ }
372
+
373
+ static VALUE
374
+ frt_get_field_infos(FieldInfos *fis)
375
+ {
376
+
377
+ VALUE rfis = Qnil;
378
+ if (fis) {
379
+ rfis = object_get(fis);
380
+ if (rfis == Qnil) {
381
+ rfis = Data_Wrap_Struct(cFieldInfos, &frt_fis_mark, &frt_fis_free,
382
+ fis);
383
+ REF(fis);
384
+ object_add(fis, rfis);
385
+ }
386
+ }
387
+ return rfis;
388
+ }
389
+
390
+ /*
391
+ * call-seq:
392
+ * FieldInfos.new(defaults = {}) -> field_infos
393
+ *
394
+ * Create a new FieldInfos object which uses the default values for fields
395
+ * specified in the +default+ hash parameter. See FieldInfo for available
396
+ * property values.
397
+ */
398
+ static VALUE
399
+ frt_fis_init(int argc, VALUE *argv, VALUE self)
400
+ {
401
+ VALUE roptions;
402
+ FieldInfos *fis;
403
+ enum StoreValues store = STORE_YES;
404
+ enum IndexValues index = INDEX_YES;
405
+ enum TermVectorValues term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
406
+ float boost;
407
+
408
+ rb_scan_args(argc, argv, "01", &roptions);
409
+ if (argc > 0) {
410
+ frt_fi_get_params(roptions, &store, &index, &term_vector, &boost);
411
+ }
412
+ fis = fis_new(store, index, term_vector);
413
+ Frt_Wrap_Struct(self, &frt_fis_mark, &frt_fis_free, fis);
414
+ object_add(fis, self);
415
+ return self;
416
+ }
417
+
418
+ /*
419
+ * call-seq:
420
+ * fis.to_a -> array
421
+ *
422
+ * Return an array of the FieldInfo objects contained by this FieldInfos
423
+ * object.
424
+ */
425
+ static VALUE
426
+ frt_fis_to_a(VALUE self)
427
+ {
428
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
429
+ VALUE rary = rb_ary_new();
430
+ int i;
431
+
432
+ for (i = 0; i < fis->size; i++) {
433
+ rb_ary_push(rary, frt_get_field_info(fis->fields[i]));
434
+ }
435
+ return rary;
436
+ }
437
+
438
+ /*
439
+ * call-seq:
440
+ * fis[name] -> field_info
441
+ * fis[number] -> field_info
442
+ *
443
+ * Get the FieldInfo object. FieldInfo objects can be referenced by either
444
+ * their field-number or the field-name (which must be a symbol). For
445
+ * example;
446
+ *
447
+ * fi = fis[:name]
448
+ * fi = fis[2]
449
+ */
450
+ static VALUE
451
+ frt_fis_get(VALUE self, VALUE ridx)
452
+ {
453
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
454
+ VALUE rfi = Qnil;
455
+ switch (TYPE(ridx)) {
456
+ case T_FIXNUM: {
457
+ int index = FIX2INT(ridx);
458
+ if (index < 0) index += fis->size;
459
+ if (index < 0 || index >= fis->size) {
460
+ rb_raise(rb_eArgError, "index of %d is out of range (0..%d)\n",
461
+ index, fis->size);
462
+ }
463
+ rfi = frt_get_field_info(fis->fields[index]);
464
+ break;
465
+ }
466
+ case T_SYMBOL:
467
+ rfi = frt_get_field_info(fis_get_field(fis, frt_field(ridx)));
468
+ break;
469
+ case T_STRING:
470
+ rfi = frt_get_field_info(fis_get_field(fis, StringValuePtr(ridx)));
471
+ break;
472
+ default:
473
+ rb_raise(rb_eArgError, "Can't index FieldInfos with %s",
474
+ RSTRING(rb_obj_as_string(ridx))->ptr);
475
+ break;
476
+ }
477
+ return rfi;
478
+ }
479
+
480
+ /*
481
+ * call-seq:
482
+ * fis << fi -> fis
483
+ * fis.add(fi) -> fis
484
+ *
485
+ * Add a FieldInfo object. Use the FieldInfos#add_field method where
486
+ * possible.
487
+ */
488
+ static VALUE
489
+ frt_fis_add(VALUE self, VALUE rfi)
490
+ {
491
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
492
+ FieldInfo *fi = (FieldInfo *)frt_rb_data_ptr(rfi);
493
+ fis_add_field(fis, fi);
494
+ REF(fi);
495
+ return self;
496
+ }
497
+
498
+ /*
499
+ * call-seq:
500
+ * fis.add_field(name, properties = {}) -> fis
501
+ *
502
+ * Add a new field to the FieldInfos object. See FieldInfo for a description
503
+ * of the available properties.
504
+ */
505
+ static VALUE
506
+ frt_fis_add_field(int argc, VALUE *argv, VALUE self)
507
+ {
508
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
509
+ FieldInfo *fi;
510
+ enum StoreValues store = STORE_YES;
511
+ enum IndexValues index = INDEX_YES;
512
+ enum TermVectorValues term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
513
+ float boost = 1.0f;
514
+ VALUE rname, roptions;
515
+
516
+ rb_scan_args(argc, argv, "11", &rname, &roptions);
517
+ if (argc > 1) {
518
+ frt_fi_get_params(roptions, &store, &index, &term_vector, &boost);
519
+ }
520
+ fi = fi_new(frt_field(rname), store, index, term_vector);
521
+ fi->boost = boost;
522
+ fis_add_field(fis, fi);
523
+ return self;
524
+ }
525
+
526
+ /*
527
+ * call-seq:
528
+ * fis.each {|fi| do_something } -> fis
529
+ *
530
+ * Iterate through the FieldInfo objects.
531
+ */
532
+ static VALUE
533
+ frt_fis_each(VALUE self)
534
+ {
535
+ int i;
536
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
537
+
538
+ for (i = 0; i < fis->size; i++) {
539
+ rb_yield(frt_get_field_info(fis->fields[i]));
540
+ }
541
+ return self;
542
+ }
543
+
544
+ /*
545
+ * call-seq:
546
+ * fis.to_s -> string
547
+ *
548
+ * Return a string representation of the FieldInfos object.
549
+ */
550
+ static VALUE
551
+ frt_fis_to_s(VALUE self)
552
+ {
553
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
554
+ char *fis_s = fis_to_s(fis);
555
+ VALUE rfis_s = rb_str_new2(fis_s);
556
+ free(fis_s);
557
+ return rfis_s;
558
+ }
559
+
560
+ /*
561
+ * call-seq:
562
+ * fis.create_index(dir) -> self
563
+ *
564
+ * Create a new index in the directory specified. The directory +dir+ can
565
+ * either be a string path representing a directory on the file-system or an
566
+ * actual directory object. Care should be taken when using this method. Any
567
+ * existing index (or other files for that matter) will be deleted from the
568
+ * directory and overwritten by the new index.
569
+ */
570
+ static VALUE
571
+ frt_fis_create_index(VALUE self, VALUE rdir)
572
+ {
573
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
574
+ Store *store = NULL;
575
+ if (TYPE(rdir) == T_DATA) {
576
+ store = DATA_PTR(rdir);
577
+ REF(store);
578
+ } else {
579
+ StringValue(rdir);
580
+ frt_create_dir(rdir);
581
+ store = open_fs_store(RSTRING(rdir)->ptr);
582
+ }
583
+ index_create(store, fis);
584
+ store_deref(store);
585
+ return self;
586
+ }
587
+
588
+ /*
589
+ * call-seq:
590
+ * fis.fields -> symbol array
591
+ *
592
+ * Return a list of the field names (as symbols) in the index.
593
+ */
594
+ static VALUE
595
+ frt_fis_get_fields(VALUE self)
596
+ {
597
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
598
+ VALUE rfield_names = rb_ary_new();
599
+ int i;
600
+ for (i = 0; i < fis->size; i++) {
601
+ rb_ary_push(rfield_names, ID2SYM(rb_intern(fis->fields[i]->name)));
602
+ }
603
+ return rfield_names;
604
+ }
605
+
606
+ /****************************************************************************
607
+ *
608
+ * TermEnum Methods
609
+ *
610
+ ****************************************************************************/
611
+
612
+ static void
613
+ frt_te_free(void *p)
614
+ {
615
+ TermEnum *te = (TermEnum *)p;
616
+ te->close(te);
617
+ }
618
+
619
+ static VALUE
620
+ frt_te_get_set_term(VALUE self, const char *term)
621
+ {
622
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
623
+ VALUE str = term ? rb_str_new(term, te->curr_term_len) : Qnil;
624
+ rb_ivar_set(self, id_term, str);
625
+ return str;
626
+ }
627
+
628
+ static VALUE
629
+ frt_get_te(VALUE rir, TermEnum *te)
630
+ {
631
+ VALUE self = Qnil;
632
+ if (te != NULL) {
633
+ self = Data_Wrap_Struct(cTermEnum, NULL, &frt_te_free, te);
634
+ frt_te_get_set_term(self, te->curr_term);
635
+ rb_ivar_set(self, id_fld_num_map, rb_ivar_get(rir, id_fld_num_map));
636
+ }
637
+ return self;
638
+ }
639
+
640
+ /*
641
+ * call-seq:
642
+ * term_enum.next -> term_string
643
+ *
644
+ * Returns the next term in the enumeration or nil otherwise.
645
+ */
646
+ static VALUE
647
+ frt_te_next(VALUE self)
648
+ {
649
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
650
+ return frt_te_get_set_term(self, te->next(te));
651
+ }
652
+
653
+ /*
654
+ * call-seq:
655
+ * term_enum.term -> term_string
656
+ *
657
+ * Returns the current term pointed to by the enum. This method should only
658
+ * be called after a successful call to TermEnum#next.
659
+ */
660
+ static VALUE
661
+ frt_te_term(VALUE self)
662
+ {
663
+ return rb_ivar_get(self, id_term);
664
+ }
665
+
666
+ /*
667
+ * call-seq:
668
+ * term_enum.doc_freq -> integer
669
+ *
670
+ * Returns the document frequency of the current term pointed to by the enum.
671
+ * That is the number of documents that this term appears in. The method
672
+ * should only be called after a successful call to TermEnum#next.
673
+ */
674
+ static VALUE
675
+ frt_te_doc_freq(VALUE self)
676
+ {
677
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
678
+ return INT2FIX(te->curr_ti.doc_freq);
679
+ }
680
+
681
+ /*
682
+ * call-seq:
683
+ * term_enum.skip_to(target) -> term
684
+ *
685
+ * Skip to term +target+. This method can skip forwards or backwards. If you
686
+ * want to skip back to the start, pass the empty string "". That is;
687
+ *
688
+ * term_enum.skip_to("")
689
+ *
690
+ * Returns the first term greater than or equal to +target+
691
+ */
692
+ static VALUE
693
+ frt_te_skip_to(VALUE self, VALUE rterm)
694
+ {
695
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
696
+ return frt_te_get_set_term(self, te->skip_to(te, frt_field(rterm)));
697
+ }
698
+
699
+ /*
700
+ * call-seq:
701
+ * term_enum.each {|term, doc_freq| do_something() } -> term_count
702
+ *
703
+ * Iterates through all the terms in the field, yielding the term and the
704
+ * document frequency.
705
+ */
706
+ static VALUE
707
+ frt_te_each(VALUE self)
708
+ {
709
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
710
+ char *term;
711
+ int term_cnt = 0;
712
+ VALUE vals = rb_ary_new2(2);
713
+ RARRAY(vals)->len = 2;
714
+
715
+
716
+ /* each is being called so there will be no current term */
717
+ rb_ivar_set(self, id_term, Qnil);
718
+
719
+
720
+ while (NULL != (term = te->next(te))) {
721
+ term_cnt++;
722
+ RARRAY(vals)->ptr[0] = rb_str_new(term, te->curr_term_len);
723
+ RARRAY(vals)->ptr[1] = INT2FIX(te->curr_ti.doc_freq);
724
+ rb_yield(vals);
725
+ }
726
+ return INT2FIX(term_cnt);
727
+ }
728
+
729
+ /*
730
+ * call-seq:
731
+ * term_enum.set_field(field) -> self
732
+ *
733
+ * Set the field for the term_enum. The field value should be a symbol as
734
+ * usual. For example, to scan all title terms you'd do this;
735
+ *
736
+ * term_enum.set_field(:title).each do |term, doc_freq|
737
+ * do_something()
738
+ * end
739
+ */
740
+ static VALUE
741
+ frt_te_set_field(VALUE self, VALUE rfield)
742
+ {
743
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
744
+ int field_num = 0;
745
+ VALUE rfnum_map = rb_ivar_get(self, id_fld_num_map);
746
+ VALUE rfnum = rb_hash_aref(rfnum_map, rfield);
747
+ if (rfnum != Qnil) {
748
+ field_num = FIX2INT(rfnum);
749
+ rb_ivar_set(self, id_field_num, rfnum);
750
+ } else {
751
+ Check_Type(rfield, T_SYMBOL);
752
+ rb_raise(rb_eArgError, "field %s doesn't exist in the index",
753
+ frt_field(rfield));
754
+ }
755
+ te->set_field(te, field_num);
756
+
757
+ return self;
758
+ }
759
+
760
+ /****************************************************************************
761
+ *
762
+ * TermDocEnum Methods
763
+ *
764
+ ****************************************************************************/
765
+
766
+ static void
767
+ frt_tde_free(void *p)
768
+ {
769
+ TermDocEnum *tde = (TermDocEnum *)p;
770
+ tde->close(tde);
771
+ }
772
+
773
+ static VALUE
774
+ frt_get_tde(VALUE rir, TermDocEnum *tde)
775
+ {
776
+ VALUE self = Data_Wrap_Struct(cTermDocEnum, NULL, &frt_tde_free, tde);
777
+ rb_ivar_set(self, id_fld_num_map, rb_ivar_get(rir, id_fld_num_map));
778
+ return self;
779
+ }
780
+
781
+ /*
782
+ * call-seq:
783
+ * term_doc_enum.seek(field, term) -> self
784
+ *
785
+ * Seek the term +term+ in the index for +field+. After you call this method
786
+ * you can call next or each to skip through the documents and positions of
787
+ * this particular term.
788
+ */
789
+ static VALUE
790
+ frt_tde_seek(VALUE self, VALUE rfield, VALUE rterm)
791
+ {
792
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
793
+ char *term;
794
+ VALUE rfnum_map = rb_ivar_get(self, id_fld_num_map);
795
+ VALUE rfnum = rb_hash_aref(rfnum_map, rfield);
796
+ int field_num = -1;
797
+ term = StringValuePtr(rterm);
798
+ if (rfnum != Qnil) {
799
+ field_num = FIX2INT(rfnum);
800
+ } else {
801
+ rb_raise(rb_eArgError, "field %s doesn't exist in the index",
802
+ frt_field(rfield));
803
+ }
804
+ tde->seek(tde, field_num, term);
805
+ return self;
806
+ }
807
+
808
+ /*
809
+ * call-seq:
810
+ * term_doc_enum.seek_term_enum(term_enum) -> self
811
+ *
812
+ * Seek the current term in +term_enum+. You could just use the standard seek
813
+ * method like this;
814
+ *
815
+ * term_doc_enum.seek(term_enum.term)
816
+ *
817
+ * However the +seek_term_enum+ method saves an index lookup so should offer
818
+ * a large performance improvement.
819
+ */
820
+ static VALUE
821
+ frt_tde_seek_te(VALUE self, VALUE rterm_enum)
822
+ {
823
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
824
+ TermEnum *te = (TermEnum *)frt_rb_data_ptr(rterm_enum);
825
+ tde->seek_te(tde, te);
826
+ return self;
827
+ }
828
+
829
+ /*
830
+ * call-seq:
831
+ * term_doc_enum.doc -> doc_id
832
+ *
833
+ * Returns the current document number pointed to by the +term_doc_enum+.
834
+ */
835
+ static VALUE
836
+ frt_tde_doc(VALUE self)
837
+ {
838
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
839
+ return INT2FIX(tde->doc_num(tde));
840
+ }
841
+
842
+ /*
843
+ * call-seq:
844
+ * term_doc_enum.doc -> doc_id
845
+ *
846
+ * Returns the frequency of the current document pointed to by the
847
+ * +term_doc_enum+.
848
+ */
849
+ static VALUE
850
+ frt_tde_freq(VALUE self)
851
+ {
852
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
853
+ return INT2FIX(tde->freq(tde));
854
+ }
855
+
856
+ /*
857
+ * call-seq:
858
+ * term_doc_enum.doc -> doc_id
859
+ *
860
+ * Move forward to the next document in the enumeration. Returns +true+ if
861
+ * there is another document or +false+ otherwise.
862
+ */
863
+ static VALUE
864
+ frt_tde_next(VALUE self)
865
+ {
866
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
867
+ return tde->next(tde) ? Qtrue : Qfalse;
868
+ }
869
+
870
+ /*
871
+ * call-seq:
872
+ * term_doc_enum.doc -> doc_id
873
+ *
874
+ * Move forward to the next document in the enumeration. Returns +true+ if
875
+ * there is another document or +false+ otherwise.
876
+ */
877
+ static VALUE
878
+ frt_tde_next_position(VALUE self)
879
+ {
880
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
881
+ int pos;
882
+ if (tde->next_position == NULL) {
883
+ rb_raise(rb_eNotImpError, "to scan through positions you must create "
884
+ "the TermDocEnum with Index#term_positions method rather "
885
+ "than the Index#term_docs method");
886
+ }
887
+ pos = tde->next_position(tde);
888
+ return pos >= 0 ? INT2FIX(pos) : Qnil;
889
+ }
890
+
891
+ /*
892
+ * call-seq:
893
+ * term_doc_enum.each {|doc_id, freq| do_something() } -> doc_count
894
+ *
895
+ * Iterate through the documents and document frequencies in the
896
+ * +term_doc_enum+.
897
+ *
898
+ * NOTE: this method can only be called once after each seek. If you need to
899
+ * call +#each+ again then you should call +#seek+ again too.
900
+ */
901
+ static VALUE
902
+ frt_tde_each(VALUE self)
903
+ {
904
+ int doc_cnt = 0;
905
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
906
+ VALUE vals = rb_ary_new2(2);
907
+ RARRAY(vals)->len = 2;
908
+
909
+ while (tde->next(tde)) {
910
+ doc_cnt++;
911
+ RARRAY(vals)->ptr[0] = INT2FIX(tde->doc_num(tde));
912
+ RARRAY(vals)->ptr[1] = INT2FIX(tde->freq(tde));
913
+ rb_yield(vals);
914
+
915
+ }
916
+ return INT2FIX(doc_cnt);
917
+ }
918
+
919
+ /*
920
+ * call-seq:
921
+ * term_doc_enum.each_position {|pos| do_something } -> term_doc_enum
922
+ *
923
+ * Iterate through each of the positions occupied by the current term in the
924
+ * current document. This can only be called once per document. It can be
925
+ * used within the each method. For example, to print the terms documents and
926
+ * positions;
927
+ *
928
+ * tde.each do |doc_id, freq|
929
+ * puts "term appeared #{freq} times in document #{doc_id}:"
930
+ * positions = []
931
+ * tde.each_position {|pos| positions << pos}
932
+ * puts " #{positions.join(', ')}"
933
+ * end
934
+ */
935
+ static VALUE
936
+ frt_tde_each_position(VALUE self)
937
+ {
938
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
939
+ int pos;
940
+ if (tde->next_position == NULL) {
941
+ rb_raise(rb_eNotImpError, "to scan through positions you must create "
942
+ "the TermDocEnum with Index#term_positions method rather "
943
+ "than the Index#term_docs method");
944
+ }
945
+ while (0 <= (pos = tde->next_position(tde))) {
946
+ rb_yield(INT2FIX(pos));
947
+ }
948
+ return self;
949
+ }
950
+
951
+ /*
952
+ * call-seq:
953
+ * term_doc_enum.skip_to(target) -> bool
954
+ *
955
+ * Skip to the required document number +target+ and return true if there is
956
+ * a document >= +target+.
957
+ */
958
+ static VALUE
959
+ frt_tde_skip_to(VALUE self, VALUE rtarget)
960
+ {
961
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
962
+ return tde->skip_to(tde, FIX2INT(rtarget)) ? Qtrue : Qfalse;
963
+ }
964
+
965
+ /****************************************************************************
966
+ *
967
+ * TVOffsets Methods
968
+ *
969
+ ****************************************************************************/
970
+
971
+ static VALUE
972
+ frt_get_tv_offsets(Offset *offset)
973
+ {
974
+ return rb_struct_new(cTVOffsets,
975
+ INT2FIX(offset->start),
976
+ INT2FIX(offset->end),
977
+ NULL);
978
+ }
979
+
980
+ /****************************************************************************
981
+ *
982
+ * TVTerm Methods
983
+ *
984
+ ****************************************************************************/
985
+
986
+ static VALUE
987
+ frt_get_tv_term(TVTerm *tv_term)
988
+ {
989
+ int i;
990
+ const int freq = tv_term->freq;
991
+ VALUE rtext;
992
+ VALUE rpositions = Qnil;
993
+ rtext = rb_str_new2(tv_term->text);
994
+ if (tv_term->positions) {
995
+ VALUE *rpos;
996
+ int *positions = tv_term->positions;
997
+ rpositions = rb_ary_new2(freq);
998
+ rpos = RARRAY(rpositions)->ptr;
999
+ RARRAY(rpositions)->len = freq;
1000
+ for (i = 0; i < freq; i++) {
1001
+ rpos[i] = INT2FIX(positions[i]);
1002
+ }
1003
+ }
1004
+ return rb_struct_new(cTVTerm, rtext, rpositions, NULL);
1005
+ }
1006
+
1007
+ /****************************************************************************
1008
+ *
1009
+ * TermVector Methods
1010
+ *
1011
+ ****************************************************************************/
1012
+
1013
+ static VALUE
1014
+ frt_get_tv(TermVector *tv)
1015
+ {
1016
+ int i;
1017
+ TVTerm *terms = tv->terms;
1018
+ const int t_cnt = tv->term_cnt;
1019
+ const int o_cnt = tv->offset_cnt;
1020
+ VALUE rfield, rterms, *rts;
1021
+ VALUE roffsets = Qnil;
1022
+ rfield = ID2SYM(rb_intern(tv->field));
1023
+
1024
+ rterms = rb_ary_new2(t_cnt);
1025
+ RARRAY(rterms)->len = t_cnt;
1026
+ rts = RARRAY(rterms)->ptr;
1027
+ for (i = 0; i < t_cnt; i++) {
1028
+ rts[i] = frt_get_tv_term(&terms[i]);
1029
+ }
1030
+
1031
+ if (tv->offsets) {
1032
+ VALUE *ros;
1033
+ Offset *offsets = tv->offsets;
1034
+ roffsets = rb_ary_new2(o_cnt);
1035
+ ros = RARRAY(roffsets)->ptr;
1036
+ RARRAY(roffsets)->len = o_cnt;
1037
+ for (i = 0; i < o_cnt; i++) {
1038
+ ros[i] = frt_get_tv_offsets(&offsets[i]);
1039
+ }
1040
+ }
1041
+
1042
+ return rb_struct_new(cTermVector, rfield, rterms, roffsets, NULL);
1043
+ }
1044
+
1045
+ /****************************************************************************
1046
+ *
1047
+ * IndexWriter Methods
1048
+ *
1049
+ ****************************************************************************/
1050
+
1051
+ void
1052
+ frt_iw_free(void *p)
1053
+ {
1054
+ iw_close((IndexWriter *)p);
1055
+ }
1056
+
1057
+ void
1058
+ frt_iw_mark(void *p)
1059
+ {
1060
+ IndexWriter *iw = (IndexWriter *)p;
1061
+ frt_gc_mark(iw->analyzer);
1062
+ frt_gc_mark(iw->store);
1063
+ frt_gc_mark(iw->fis);
1064
+ }
1065
+
1066
+ /*
1067
+ * call-seq:
1068
+ * index_writer.close -> nil
1069
+ *
1070
+ * Close the IndexWriter. This will close and free all resources used
1071
+ * exclusively by the index writer. The garbage collector will do this
1072
+ * automatically if not called explicitly.
1073
+ */
1074
+ static VALUE
1075
+ frt_iw_close(VALUE self)
1076
+ {
1077
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1078
+ Frt_Unwrap_Struct(self);
1079
+ iw_close(iw);
1080
+ return Qnil;
1081
+ }
1082
+
1083
/* Copy the integer option +attr+ from the options hash into config.attr if
 * the option is present and truthy. Expects +roptions+, +rval+ and +config+
 * to be in scope at the point of use, plus a sym_<attr> symbol constant.
 * NUM2INT is used so a non-Fixnum value is type-checked and converted
 * instead of being reinterpreted. */
#define SET_INT_ATTR(attr) \
    do {\
        if (RTEST(rval = rb_hash_aref(roptions, sym_##attr))) {\
            config.attr = NUM2INT(rval);\
        }\
    } while (0)
1088
+
1089
+ /*
1090
+ * call-seq:
1091
+ * IndexWriter.new(options = {}) -> index_writer
1092
+ *
1093
+ * Create a new IndexWriter. You should either pass a path or a directory to
1094
+ * this constructor. For example, here are three ways you can create an
1095
+ * IndexWriter;
1096
+ *
1097
+ * dir = RAMDirectory.new()
1098
+ * iw = IndexWriter.new(:dir => dir)
1099
+ *
1100
+ * dir = FSDirectory.new("/path/to/index")
1101
+ * iw = IndexWriter.new(:dir => dir)
1102
+ *
1103
+ * iw = IndexWriter.new(:path => "/path/to/index")
1104
+ *
1105
+ * See IndexWriter for more options.
1106
+ */
1107
+ static VALUE
1108
+ frt_iw_init(int argc, VALUE *argv, VALUE self)
1109
+ {
1110
+ VALUE roptions, rval;
1111
+ bool create = false;
1112
+ bool create_if_missing = true;
1113
+ Store *store = NULL;
1114
+ Analyzer *analyzer = NULL;
1115
+ IndexWriter *volatile iw = NULL;
1116
+ Config config = default_config;
1117
+
1118
+ rb_scan_args(argc, argv, "01", &roptions);
1119
+ if (argc > 0) {
1120
+ Check_Type(roptions, T_HASH);
1121
+
1122
+ if ((rval = rb_hash_aref(roptions, sym_dir)) != Qnil) {
1123
+ Check_Type(rval, T_DATA);
1124
+ store = DATA_PTR(rval);
1125
+ } else if ((rval = rb_hash_aref(roptions, sym_path)) != Qnil) {
1126
+ StringValue(rval);
1127
+ frt_create_dir(rval);
1128
+ store = open_fs_store(RSTRING(rval)->ptr);
1129
+ DEREF(store);
1130
+ }
1131
+
1132
+ /* Let ruby's garbage collector handle the closing of the store
1133
+ if (!close_dir) {
1134
+ close_dir = RTEST(rb_hash_aref(roptions, sym_close_dir));
1135
+ }
1136
+ */
1137
+ /* use_compound_file defaults to true */
1138
+ config.use_compound_file =
1139
+ (rb_hash_aref(roptions, sym_use_compound_file) == Qfalse)
1140
+ ? false
1141
+ : true;
1142
+
1143
+ if ((rval = rb_hash_aref(roptions, sym_analyzer)) != Qnil) {
1144
+ analyzer = frt_get_cwrapped_analyzer(rval);
1145
+ }
1146
+
1147
+ create = RTEST(rb_hash_aref(roptions, sym_create));
1148
+ if ((rval = rb_hash_aref(roptions, sym_create_if_missing)) != Qnil) {
1149
+ create_if_missing = RTEST(rval);
1150
+ }
1151
+ SET_INT_ATTR(chunk_size);
1152
+ SET_INT_ATTR(max_buffer_memory);
1153
+ SET_INT_ATTR(index_interval);
1154
+ SET_INT_ATTR(skip_interval);
1155
+ SET_INT_ATTR(merge_factor);
1156
+ SET_INT_ATTR(max_buffered_docs);
1157
+ SET_INT_ATTR(max_merge_docs);
1158
+ SET_INT_ATTR(max_field_length);
1159
+ }
1160
+ if (NULL == store) {
1161
+ store = open_ram_store();
1162
+ DEREF(store);
1163
+ }
1164
+ if (!create && create_if_missing && !store->exists(store, "segments")) {
1165
+ create = true;
1166
+ }
1167
+ if (create) {
1168
+ FieldInfos *fis;
1169
+ if ((rval = rb_hash_aref(roptions, sym_field_infos)) != Qnil) {
1170
+ Data_Get_Struct(rval, FieldInfos, fis);
1171
+ index_create(store, fis);
1172
+ } else {
1173
+ fis = fis_new(STORE_YES, INDEX_YES,
1174
+ TERM_VECTOR_WITH_POSITIONS_OFFSETS);
1175
+ index_create(store, fis);
1176
+ fis_deref(fis);
1177
+ }
1178
+ }
1179
+
1180
+ iw = iw_open(store, analyzer, &config);
1181
+
1182
+ Frt_Wrap_Struct(self, &frt_iw_mark, &frt_iw_free, iw);
1183
+
1184
+ if (rb_block_given_p()) {
1185
+ rb_yield(self);
1186
+ frt_iw_close(self);
1187
+ return Qnil;
1188
+ } else {
1189
+ return self;
1190
+ }
1191
+ }
1192
+
1193
+ /*
1194
+ * call-seq:
1195
+ * iw.doc_count -> number
1196
+ *
1197
+ * Returns the number of documents in the Index. Note that deletions won't be
1198
+ * taken into account until the IndexWriter has been commited.
1199
+ */
1200
+ static VALUE
1201
+ frt_iw_get_doc_count(VALUE self)
1202
+ {
1203
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1204
+ return INT2FIX(iw_doc_count(iw));
1205
+ }
1206
+
1207
+ static int
1208
+ frt_hash_to_doc_i(VALUE key, VALUE value, VALUE arg)
1209
+ {
1210
+ if (key == Qundef) {
1211
+ return ST_CONTINUE;
1212
+ } else {
1213
+ Document *doc = (Document *)arg;
1214
+ char *field;
1215
+ VALUE val;
1216
+ DocField *df;
1217
+ switch (TYPE(key)) {
1218
+ case T_STRING:
1219
+ field = RSTRING(key)->ptr;
1220
+ break;
1221
+ case T_SYMBOL:
1222
+ field = rb_id2name(SYM2ID(key));
1223
+ break;
1224
+ default:
1225
+ rb_raise(rb_eArgError,
1226
+ "%s cannot be a key to a field. Field keys must "
1227
+ " be symbols.", RSTRING(rb_obj_as_string(key))->ptr);
1228
+ break;
1229
+ }
1230
+ if (NULL == (df = doc_get_field(doc, field))) {
1231
+ df = df_new(field);
1232
+ }
1233
+ if (rb_respond_to(value, id_boost)) {
1234
+ df->boost = (float)NUM2DBL(rb_funcall(value, id_boost, 0));
1235
+ }
1236
+ switch (TYPE(value)) {
1237
+ case T_ARRAY:
1238
+ {
1239
+ int i;
1240
+ for (i = 0; i < RARRAY(value)->len; i++) {
1241
+ val = rb_obj_as_string(RARRAY(value)->ptr[i]);
1242
+ df_add_data_len(df,
1243
+ RSTRING(val)->ptr,
1244
+ RSTRING(val)->len);
1245
+ }
1246
+ }
1247
+ break;
1248
+ default:
1249
+ val = rb_obj_as_string(value);
1250
+ df_add_data_len(df, RSTRING(val)->ptr, RSTRING(val)->len);
1251
+ break;
1252
+ }
1253
+ doc_add_field(doc, df);
1254
+ }
1255
+ return ST_CONTINUE;
1256
+ }
1257
+
1258
+ static Document *
1259
+ frt_get_doc(VALUE rdoc)
1260
+ {
1261
+ VALUE val;
1262
+ Document *doc = doc_new();
1263
+ DocField *df;
1264
+
1265
+ if (rb_respond_to(rdoc, id_boost)) {
1266
+ doc->boost = (float)NUM2DBL(rb_funcall(rdoc, id_boost, 0));
1267
+ }
1268
+
1269
+ switch (TYPE(rdoc)) {
1270
+ case T_HASH:
1271
+ rb_hash_foreach(rdoc, frt_hash_to_doc_i, (VALUE)doc);
1272
+ break;
1273
+ case T_ARRAY:
1274
+ {
1275
+ int i;
1276
+ df = df_new("content");
1277
+ for (i = 0; i < RARRAY(rdoc)->len; i++) {
1278
+ val = rb_obj_as_string(RARRAY(rdoc)->ptr[i]);
1279
+ df_add_data_len(df,
1280
+ RSTRING(val)->ptr,
1281
+ RSTRING(val)->len);
1282
+ }
1283
+ doc_add_field(doc, df);
1284
+ }
1285
+ break;
1286
+ case T_SYMBOL:
1287
+ df = df_add_data(df_new("content"), rb_id2name(SYM2ID(rdoc)));
1288
+ doc_add_field(doc, df);
1289
+ break;
1290
+ case T_STRING:
1291
+ default:
1292
+ val = rb_obj_as_string(rdoc);
1293
+ df = df_add_data_len(df_new("content"),
1294
+ RSTRING(val)->ptr,
1295
+ RSTRING(val)->len);
1296
+ doc_add_field(doc, df);
1297
+ break;
1298
+ }
1299
+ return doc;
1300
+ }
1301
+
1302
+ /*
1303
+ * call-seq:
1304
+ * iw << document -> iw
1305
+ * iw.add_document(document) -> iw
1306
+ *
1307
+ * Add a document to the index. See Document. A document can also be a simple
1308
+ * hash object.
1309
+ */
1310
+ static VALUE
1311
+ frt_iw_add_doc(VALUE self, VALUE rdoc)
1312
+ {
1313
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1314
+ Document *doc = frt_get_doc(rdoc);
1315
+ iw_add_doc(iw, doc);
1316
+ doc_destroy(doc);
1317
+ return self;
1318
+ }
1319
+
1320
+ /*
1321
+ * call-seq:
1322
+ * iw.optimize -> iw
1323
+ *
1324
+ * Optimize the index for searching. This commits any unwritten data to the
1325
+ * index and optimizes the index into a single segment to improve search
1326
+ * performance. This is an expensive operation and should not be called too
1327
+ * often. The best time to call this is at the end of a long batch indexing
1328
+ * process. Note that calling the optimize method do not in any way effect
1329
+ * indexing speed (except for the time taken to complete the optimization
1330
+ * process).
1331
+ */
1332
+ static VALUE
1333
+ frt_iw_optimize(VALUE self)
1334
+ {
1335
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1336
+ iw_optimize(iw);
1337
+ return self;
1338
+ }
1339
+
1340
+ /*
1341
+ * call-seq:
1342
+ * iw.commit -> iw
1343
+ *
1344
+ * Explicitly commit any changes to the index that may be hanging around in
1345
+ * memory. You should call this method if you want to read the latest index
1346
+ * with an IndexWriter.
1347
+ */
1348
+ static VALUE
1349
+ frt_iw_commit(VALUE self)
1350
+ {
1351
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1352
+ iw_commit(iw);
1353
+ return self;
1354
+ }
1355
+
1356
+ /*
1357
+ * call-seq:
1358
+ * iw.add_readers(reader_array) -> iw
1359
+ *
1360
+ * Use this method to merge other indexes into the one being written by
1361
+ * IndexWriter. This is useful for parallel indexing. You can have several
1362
+ * indexing processes running in parallel, possibly even on different
1363
+ * machines. Then you can finish by merging all of the indexes into a single
1364
+ * index.
1365
+ */
1366
+ static VALUE
1367
+ frt_iw_add_readers(VALUE self, VALUE rreaders)
1368
+ {
1369
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1370
+ int i;
1371
+ IndexReader **irs;
1372
+ Check_Type(rreaders, T_ARRAY);
1373
+
1374
+ irs = ALLOC_N(IndexReader *, RARRAY(rreaders)->len);
1375
+ i = RARRAY(rreaders)->len;
1376
+ while (i-- > 0) {
1377
+ IndexReader *ir;
1378
+ Data_Get_Struct(RARRAY(rreaders)->ptr[i], IndexReader, ir);
1379
+ irs[i] = ir;
1380
+ }
1381
+ iw_add_readers(iw, irs, RARRAY(rreaders)->len);
1382
+ free(irs);
1383
+ return self;
1384
+ }
1385
+
1386
+ /*
1387
+ * call-seq:
1388
+ * iw.delete(field, term) -> iw
1389
+ *
1390
+ * Delete all documents in the index with the term +term+ in the field
1391
+ * +field+. You should usually have a unique document id which you use with
1392
+ * this method, rather then deleting all documents with the word "the" in
1393
+ * them. You may however use this method to delete spam.
1394
+ */
1395
+ static VALUE
1396
+ frt_iw_delete(VALUE self, VALUE rfield, VALUE rterm)
1397
+ {
1398
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1399
+ iw_delete_term(iw, frt_field(rfield), StringValuePtr(rterm));
1400
+ return self;
1401
+ }
1402
+
1403
+ /*
1404
+ * call-seq:
1405
+ * index_writer.field_infos -> FieldInfos
1406
+ *
1407
+ * Get the FieldInfos object for this IndexWriter. This is useful if you need
1408
+ * to dynamically add new fields to the index with specific properties.
1409
+ */
1410
+ static VALUE
1411
+ frt_iw_field_infos(VALUE self)
1412
+ {
1413
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1414
+ return frt_get_field_infos(iw->fis);
1415
+ }
1416
+
1417
+ /*
1418
+ * call-seq:
1419
+ * index_writer.analyzer -> Analyzer
1420
+ *
1421
+ * Get the Analyzer for this IndexWriter. This is useful if you need
1422
+ * to use the same analyzer in a QueryParser.
1423
+ */
1424
+ static VALUE
1425
+ frt_iw_get_analyzer(VALUE self)
1426
+ {
1427
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1428
+ return frt_get_analyzer(iw->analyzer);
1429
+ }
1430
+
1431
+ /*
1432
+ * call-seq:
1433
+ * index_writer.analyzer -> Analyzer
1434
+ *
1435
+ * Set the Analyzer for this IndexWriter. This is useful if you need to
1436
+ * change the analyzer for a special document. It is risky though as the
1437
+ * same anlyzer will be used for all documents during search.
1438
+ */
1439
+ static VALUE
1440
+ frt_iw_set_analyzer(VALUE self, VALUE ranalyzer)
1441
+ {
1442
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1443
+
1444
+ a_deref(iw->analyzer);
1445
+ iw->analyzer = frt_get_cwrapped_analyzer(ranalyzer);
1446
+ return ranalyzer;
1447
+ }
1448
+
1449
+ /*
1450
+ * call-seq:
1451
+ * iw.chunk_size -> number
1452
+ *
1453
+ * Return the current value of chunk_size
1454
+ */
1455
+ static VALUE
1456
+ frt_iw_get_chunk_size(VALUE self)
1457
+ {
1458
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1459
+ return INT2FIX(iw->config.chunk_size);
1460
+ }
1461
+
1462
+ /*
1463
+ * call-seq:
1464
+ * iw.chunk_size = chunk_size -> chunk_size
1465
+ *
1466
+ * Set the chunk_size parameter
1467
+ */
1468
+ static VALUE
1469
+ frt_iw_set_chunk_size(VALUE self, VALUE rval)
1470
+ {
1471
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1472
+ iw->config.chunk_size = FIX2INT(rval);
1473
+ return rval;
1474
+ }
1475
+
1476
+ /*
1477
+ * call-seq:
1478
+ * iw.max_buffer_memory -> number
1479
+ *
1480
+ * Return the current value of max_buffer_memory
1481
+ */
1482
+ static VALUE
1483
+ frt_iw_get_max_buffer_memory(VALUE self)
1484
+ {
1485
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1486
+ return INT2FIX(iw->config.max_buffer_memory);
1487
+ }
1488
+
1489
+ /*
1490
+ * call-seq:
1491
+ * iw.max_buffer_memory = max_buffer_memory -> max_buffer_memory
1492
+ *
1493
+ * Set the max_buffer_memory parameter
1494
+ */
1495
+ static VALUE
1496
+ frt_iw_set_max_buffer_memory(VALUE self, VALUE rval)
1497
+ {
1498
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1499
+ iw->config.max_buffer_memory = FIX2INT(rval);
1500
+ return rval;
1501
+ }
1502
+
1503
+ /*
1504
+ * call-seq:
1505
+ * iw.term_index_interval -> number
1506
+ *
1507
+ * Return the current value of term_index_interval
1508
+ */
1509
+ static VALUE
1510
+ frt_iw_get_index_interval(VALUE self)
1511
+ {
1512
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1513
+ return INT2FIX(iw->config.index_interval);
1514
+ }
1515
+
1516
+ /*
1517
+ * call-seq:
1518
+ * iw.term_index_interval = term_index_interval -> term_index_interval
1519
+ *
1520
+ * Set the term_index_interval parameter
1521
+ */
1522
+ static VALUE
1523
+ frt_iw_set_index_interval(VALUE self, VALUE rval)
1524
+ {
1525
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1526
+ iw->config.index_interval = FIX2INT(rval);
1527
+ return rval;
1528
+ }
1529
+
1530
+ /*
1531
+ * call-seq:
1532
+ * iw.doc_skip_interval -> number
1533
+ *
1534
+ * Return the current value of doc_skip_interval
1535
+ */
1536
+ static VALUE
1537
+ frt_iw_get_skip_interval(VALUE self)
1538
+ {
1539
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1540
+ return INT2FIX(iw->config.skip_interval);
1541
+ }
1542
+
1543
+ /*
1544
+ * call-seq:
1545
+ * iw.doc_skip_interval = doc_skip_interval -> doc_skip_interval
1546
+ *
1547
+ * Set the doc_skip_interval parameter
1548
+ */
1549
+ static VALUE
1550
+ frt_iw_set_skip_interval(VALUE self, VALUE rval)
1551
+ {
1552
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1553
+ iw->config.skip_interval = FIX2INT(rval);
1554
+ return rval;
1555
+ }
1556
+
1557
+ /*
1558
+ * call-seq:
1559
+ * iw.merge_factor -> number
1560
+ *
1561
+ * Return the current value of merge_factor
1562
+ */
1563
+ static VALUE
1564
+ frt_iw_get_merge_factor(VALUE self)
1565
+ {
1566
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1567
+ return INT2FIX(iw->config.merge_factor);
1568
+ }
1569
+
1570
+ /*
1571
+ * call-seq:
1572
+ * iw.merge_factor = merge_factor -> merge_factor
1573
+ *
1574
+ * Set the merge_factor parameter
1575
+ */
1576
+ static VALUE
1577
+ frt_iw_set_merge_factor(VALUE self, VALUE rval)
1578
+ {
1579
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1580
+ iw->config.merge_factor = FIX2INT(rval);
1581
+ return rval;
1582
+ }
1583
+
1584
+ /*
1585
+ * call-seq:
1586
+ * iw.max_buffered_docs -> number
1587
+ *
1588
+ * Return the current value of max_buffered_docs
1589
+ */
1590
+ static VALUE
1591
+ frt_iw_get_max_buffered_docs(VALUE self)
1592
+ {
1593
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1594
+ return INT2FIX(iw->config.max_buffered_docs);
1595
+ }
1596
+
1597
+ /*
1598
+ * call-seq:
1599
+ * iw.max_buffered_docs = max_buffered_docs -> max_buffered_docs
1600
+ *
1601
+ * Set the max_buffered_docs parameter
1602
+ */
1603
+ static VALUE
1604
+ frt_iw_set_max_buffered_docs(VALUE self, VALUE rval)
1605
+ {
1606
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1607
+ iw->config.max_buffered_docs = FIX2INT(rval);
1608
+ return rval;
1609
+ }
1610
+
1611
+ /*
1612
+ * call-seq:
1613
+ * iw.max_merge_docs -> number
1614
+ *
1615
+ * Return the current value of max_merge_docs
1616
+ */
1617
+ static VALUE
1618
+ frt_iw_get_max_merge_docs(VALUE self)
1619
+ {
1620
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1621
+ return INT2FIX(iw->config.max_merge_docs);
1622
+ }
1623
+
1624
+ /*
1625
+ * call-seq:
1626
+ * iw.max_merge_docs = max_merge_docs -> max_merge_docs
1627
+ *
1628
+ * Set the max_merge_docs parameter
1629
+ */
1630
+ static VALUE
1631
+ frt_iw_set_max_merge_docs(VALUE self, VALUE rval)
1632
+ {
1633
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1634
+ iw->config.max_merge_docs = FIX2INT(rval);
1635
+ return rval;
1636
+ }
1637
+
1638
+ /*
1639
+ * call-seq:
1640
+ * iw.max_field_length -> number
1641
+ *
1642
+ * Return the current value of max_field_length
1643
+ */
1644
+ static VALUE
1645
+ frt_iw_get_max_field_length(VALUE self)
1646
+ {
1647
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1648
+ return INT2FIX(iw->config.max_field_length);
1649
+ }
1650
+
1651
+ /*
1652
+ * call-seq:
1653
+ * iw.max_field_length = max_field_length -> max_field_length
1654
+ *
1655
+ * Set the max_field_length parameter
1656
+ */
1657
+ static VALUE
1658
+ frt_iw_set_max_field_length(VALUE self, VALUE rval)
1659
+ {
1660
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1661
+ iw->config.max_field_length = FIX2INT(rval);
1662
+ return rval;
1663
+ }
1664
+
1665
+ /*
1666
+ * call-seq:
1667
+ * iw.use_compound_file -> number
1668
+ *
1669
+ * Return the current value of use_compound_file
1670
+ */
1671
+ static VALUE
1672
+ frt_iw_get_use_compound_file(VALUE self)
1673
+ {
1674
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1675
+ return iw->config.use_compound_file ? Qtrue : Qfalse;
1676
+ }
1677
+
1678
+ /*
1679
+ * call-seq:
1680
+ * iw.use_compound_file = use_compound_file -> use_compound_file
1681
+ *
1682
+ * Set the use_compound_file parameter
1683
+ */
1684
+ static VALUE
1685
+ frt_iw_set_use_compound_file(VALUE self, VALUE rval)
1686
+ {
1687
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1688
+ iw->config.use_compound_file = RTEST(rval);
1689
+ return rval;
1690
+ }
1691
+
1692
+ /****************************************************************************
1693
+ *
1694
+ * LazyDoc Methods
1695
+ *
1696
+ ****************************************************************************/
1697
+
1698
+ static void
1699
+ frt_lzd_date_free(void *p)
1700
+ {
1701
+ lazy_doc_close((LazyDoc *)p);
1702
+ }
1703
+
1704
+ static VALUE
1705
+ frt_lazy_df_load(VALUE self, VALUE rkey, LazyDocField *lazy_df)
1706
+ {
1707
+ VALUE rdata = Qnil;
1708
+ if (lazy_df) {
1709
+ if (lazy_df->size == 1) {
1710
+ char *data = lazy_df_get_data(lazy_df, 0);
1711
+ rdata = rb_str_new(data, lazy_df->len);
1712
+ } else {
1713
+ int i;
1714
+ rdata = rb_ary_new2(lazy_df->size);
1715
+ for (i = 0; i < lazy_df->size; i++) {
1716
+ char *data = lazy_df_get_data(lazy_df, i);
1717
+ RARRAY(rdata)->ptr[i] =
1718
+ rb_str_new(data, lazy_df->data[i].length);
1719
+ RARRAY(rdata)->len++;
1720
+ }
1721
+ }
1722
+ rb_hash_aset(self, rkey, rdata);
1723
+ }
1724
+ return rdata;
1725
+ }
1726
+
1727
+ /*
1728
+ * call-seq:
1729
+ * lazy_doc.default(key) -> string
1730
+ *
1731
+ * This method is used internally to lazily load fields. You should never
1732
+ * really need to call it yourself.
1733
+ */
1734
+ static VALUE
1735
+ frt_lzd_default(VALUE self, VALUE rkey)
1736
+ {
1737
+ LazyDoc *lazy_doc = (LazyDoc *)DATA_PTR(rb_ivar_get(self, id_data));
1738
+ char *field = NULL;
1739
+ switch (TYPE(rkey)) {
1740
+ case T_STRING:
1741
+ field = RSTRING(rkey)->ptr;
1742
+ rkey = ID2SYM(rb_intern(field));
1743
+ break;
1744
+ case T_SYMBOL:
1745
+ field = frt_field(rkey);
1746
+ break;
1747
+ default:
1748
+ rb_raise(rb_eArgError,
1749
+ "%s cannot be a key to a field. Field keys must "
1750
+ " be symbols.", RSTRING(rb_obj_as_string(rkey))->ptr);
1751
+ break;
1752
+ }
1753
+ return frt_lazy_df_load(self, rkey, h_get(lazy_doc->field_dict, field));
1754
+ }
1755
+
1756
+ /*
1757
+ * call-seq:
1758
+ * lazy_doc.fields -> array of available fields
1759
+ *
1760
+ * Returns the list of fields stored for this particular document. If you try
1761
+ * to access any of these fields in the document the field will be loaded.
1762
+ * Try to access any other field and nil will be returned.
1763
+ */
1764
+ static VALUE
1765
+ frt_lzd_fields(VALUE self)
1766
+ {
1767
+ return rb_ivar_get(self, id_fields);
1768
+ }
1769
+
1770
+ /*
1771
+ * call-seq:
1772
+ * lazy_doc.load -> lazy_doc
1773
+ *
1774
+ * Load all unloaded fields in the document from the index.
1775
+ */
1776
+ static VALUE
1777
+ frt_lzd_load(VALUE self)
1778
+ {
1779
+ LazyDoc *lazy_doc = (LazyDoc *)DATA_PTR(rb_ivar_get(self, id_data));
1780
+ int i;
1781
+ for (i = 0; i < lazy_doc->size; i++) {
1782
+ LazyDocField *lazy_df = lazy_doc->fields[i];
1783
+ frt_lazy_df_load(self, ID2SYM(rb_intern(lazy_df->name)), lazy_df);
1784
+ }
1785
+ return self;
1786
+ }
1787
+
1788
+ VALUE
1789
+ frt_get_lazy_doc(LazyDoc *lazy_doc)
1790
+ {
1791
+ int i;
1792
+ VALUE rfields = rb_ary_new2(lazy_doc->size);
1793
+
1794
+ VALUE self, rdata;
1795
+ self = rb_hash_new();
1796
+ OBJSETUP(self, cLazyDoc, T_HASH);
1797
+
1798
+ rdata = Data_Wrap_Struct(cLazyDocData, NULL, &frt_lzd_date_free, lazy_doc);
1799
+ rb_ivar_set(self, id_data, rdata);
1800
+
1801
+ for (i = 0; i < lazy_doc->size; i++) {
1802
+ RARRAY(rfields)->ptr[i] = rb_intern(lazy_doc->fields[i]->name);
1803
+ RARRAY(rfields)->len++;
1804
+ }
1805
+ rb_ivar_set(self, id_fields, rfields);
1806
+
1807
+ return self;
1808
+ }
1809
+
1810
+ /****************************************************************************
1811
+ *
1812
+ * IndexReader Methods
1813
+ *
1814
+ ****************************************************************************/
1815
+
1816
+ void
1817
+ frt_ir_free(void *p)
1818
+ {
1819
+ object_del(p);
1820
+ ir_close((IndexReader *)p);
1821
+ }
1822
+
1823
+ void
1824
+ frt_ir_mark(void *p)
1825
+ {
1826
+ IndexReader *ir = (IndexReader *)p;
1827
+ frt_gc_mark(ir->store);
1828
+ }
1829
+
1830
+ /*
1831
+ * call-seq:
1832
+ * IndexReader.new(dir) -> index_reader
1833
+ *
1834
+ * Create a new IndexReader. You can either pass a string path to a
1835
+ * file-system directory or an actual Ferret::Store::Directory object. For
1836
+ * example;
1837
+ *
1838
+ * dir = RAMDirectory.new()
1839
+ * iw = IndexReader.new(dir)
1840
+ *
1841
+ * dir = FSDirectory.new("/path/to/index")
1842
+ * iw = IndexReader.new(dir)
1843
+ *
1844
+ * iw = IndexReader.new("/path/to/index")
1845
+ */
1846
+ static VALUE
1847
+ frt_ir_init(VALUE self, VALUE rdir)
1848
+ {
1849
+ Store *store = NULL;
1850
+ IndexReader *ir;
1851
+ int i;
1852
+ FieldInfos *fis;
1853
+ VALUE rfield_num_map = rb_hash_new();
1854
+
1855
+ if (TYPE(rdir) == T_ARRAY) {
1856
+ const int reader_cnt = RARRAY(rdir)->len;
1857
+ IndexReader **sub_readers = ALLOC_N(IndexReader *, reader_cnt);
1858
+ int i;
1859
+ for (i = 0; i < reader_cnt; i++) {
1860
+ Data_Get_Struct(RARRAY(rdir)->ptr[i], IndexReader, sub_readers[i]);
1861
+ REF(sub_readers[i]);
1862
+ }
1863
+ ir = mr_open(sub_readers, reader_cnt);
1864
+ } else {
1865
+ switch (TYPE(rdir)) {
1866
+ case T_DATA:
1867
+ store = DATA_PTR(rdir);
1868
+ break;
1869
+ case T_STRING:
1870
+ frt_create_dir(rdir);
1871
+ store = open_fs_store(RSTRING(rdir)->ptr);
1872
+ DEREF(store);
1873
+ break;
1874
+ default:
1875
+ rb_raise(rb_eArgError, "%s isn't a valid directory argument. "
1876
+ "You should use either a String or a Directory",
1877
+ RSTRING(rb_obj_as_string(rdir))->ptr);
1878
+ break;
1879
+ }
1880
+ ir = ir_open(store);
1881
+ }
1882
+ Frt_Wrap_Struct(self, &frt_ir_mark, &frt_ir_free, ir);
1883
+ object_add(ir, self);
1884
+
1885
+ fis = ir->fis;
1886
+ for (i = 0; i < fis->size; i++) {
1887
+ FieldInfo *fi = fis->fields[i];
1888
+ rb_hash_aset(rfield_num_map,
1889
+ ID2SYM(rb_intern(fi->name)),
1890
+ INT2FIX(fi->number));
1891
+ }
1892
+ rb_ivar_set(self, id_fld_num_map, rfield_num_map);
1893
+
1894
+ return self;
1895
+ }
1896
+
1897
+ /*
1898
+ * call-seq:
1899
+ * index_reader.set_norm(doc_id, field, val)
1900
+ *
1901
+ * Expert: change the boost value for a +field+ in document at +doc_id+.
1902
+ * +val+ should be an integer in the range 0..255 which corresponds to an
1903
+ * encoded float value.
1904
+ */
1905
+ static VALUE
1906
+ frt_ir_set_norm(VALUE self, VALUE rdoc_id, VALUE rfield, VALUE rval)
1907
+ {
1908
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
1909
+ ir_set_norm(ir, FIX2INT(rdoc_id), frt_field(rfield), NUM2CHR(rval));
1910
+ return self;
1911
+ }
1912
+
1913
+ /*
1914
+ * call-seq:
1915
+ * index_reader.norms(field) -> string
1916
+ *
1917
+ * Expert: Returns a string containing the norm values for a field. The
1918
+ * string length will be equal to the number of documents in the index and it
1919
+ * could have null bytes.
1920
+ */
1921
+ static VALUE
1922
+ frt_ir_norms(VALUE self, VALUE rfield)
1923
+ {
1924
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
1925
+ uchar *norms;
1926
+ norms = ir_get_norms(ir, frt_field(rfield));
1927
+ if (norms) {
1928
+ return rb_str_new((char *)norms, ir->max_doc(ir));
1929
+ } else {
1930
+ return Qnil;
1931
+ }
1932
+ }
1933
+
1934
+ /*
1935
+ * call-seq:
1936
+ * index_reader.get_norms_into(field, buffer, offset) -> buffer
1937
+ *
1938
+ * Expert: Get the norm values into a string +buffer+ starting at +offset+.
1939
+ */
1940
+ static VALUE
1941
+ frt_ir_get_norms_into(VALUE self, VALUE rfield, VALUE rnorms, VALUE roffset)
1942
+ {
1943
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
1944
+ int offset;
1945
+ offset = FIX2INT(roffset);
1946
+ Check_Type(rnorms, T_STRING);
1947
+ if (RSTRING(rnorms)->len < offset + ir->max_doc(ir)) {
1948
+ rb_raise(rb_eArgError, "supplied a string of length:%d to "
1949
+ "IndexReader#get_norms_into but needed a string of length "
1950
+ "offset:%d + maxdoc:%d",
1951
+ RSTRING(rnorms)->len, offset, ir->max_doc(ir));
1952
+ }
1953
+
1954
+ ir_get_norms_into(ir, frt_field(rfield),
1955
+ (uchar *)RSTRING(rnorms)->ptr + offset);
1956
+ return rnorms;
1957
+ }
1958
+
1959
+ /*
1960
+ * call-seq:
1961
+ * index_reader.commit -> index_reader
1962
+ *
1963
+ * Commit any deletes made by this particular IndexReader to the index. This
1964
+ * will open a Commit lock.
1965
+ */
1966
+ static VALUE
1967
+ frt_ir_commit(VALUE self)
1968
+ {
1969
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
1970
+ ir_commit(ir);
1971
+ return self;
1972
+ }
1973
+
1974
+ /*
1975
+ * call-seq:
1976
+ * index_reader.close -> index_reader
1977
+ *
1978
+ * Close the IndexReader. This method also commits any deletions made by this
1979
+ * IndexReader. This method will be called implicitly by the garbage
1980
+ * collector but you should call it explicitly to commit any changes as soon
1981
+ * as possible and to close any locks held by the object to prevent locking
1982
+ * errors.
1983
+ */
1984
+ static VALUE
1985
+ frt_ir_close(VALUE self)
1986
+ {
1987
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
1988
+ object_del(ir);
1989
+ Frt_Unwrap_Struct(self);
1990
+ ir_close(ir);
1991
+ return self;
1992
+ }
1993
+
1994
+ /*
1995
+ * call-seq:
1996
+ * index_reader.has_deletions? -> bool
1997
+ *
1998
+ * Return true if the index has any deletions, either uncommitted by this
1999
+ * IndexReader or committed by any other IndexReader.
2000
+ */
2001
+ static VALUE
2002
+ frt_ir_has_deletions(VALUE self)
2003
+ {
2004
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2005
+ return ir->has_deletions(ir) ? Qtrue : Qfalse;
2006
+ }
2007
+
2008
+ /*
2009
+ * call-seq:
2010
+ * index_reader.delete(doc_id) -> index_reader
2011
+ *
2012
+ * Delete document referenced internally by document id +doc_id+. The
2013
+ * document_id is the number used to reference documents in the index and is
2014
+ * returned by search methods.
2015
+ */
2016
+ static VALUE
2017
+ frt_ir_delete(VALUE self, VALUE rdoc_id)
2018
+ {
2019
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2020
+ ir_delete_doc(ir, FIX2INT(rdoc_id));
2021
+ return self;
2022
+ }
2023
+
2024
+ /*
2025
+ * call-seq:
2026
+ * index_reader.deleted?(doc_id) -> bool
2027
+ *
2028
+ * Returns true if the document at +doc_id+ has been deleted.
2029
+ */
2030
+ static VALUE
2031
+ frt_ir_is_deleted(VALUE self, VALUE rdoc_id)
2032
+ {
2033
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2034
+ return ir->is_deleted(ir, FIX2INT(rdoc_id)) ? Qtrue : Qfalse;
2035
+ }
2036
+
2037
+ /*
2038
+ * call-seq:
2039
+ * index_reader.max_doc -> number
2040
+ *
2041
+ * Returns 1 + the maximum document id in the index. It is the
2042
+ * document_id that will be used by the next document added to the index. If
2043
+ * there are no deletions, this number also refers to the number of documents
2044
+ * in the index.
2045
+ */
2046
+ static VALUE
2047
+ frt_ir_max_doc(VALUE self)
2048
+ {
2049
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2050
+ return INT2FIX(ir->max_doc(ir));
2051
+ }
2052
+
2053
+ /*
2054
+ * call-seq:
2055
+ * index_reader.num_docs -> number
2056
+ *
2057
+ * Returns the number of accessible (not deleted) documents in the index.
2058
+ * This will be equal to IndexReader#max_doc if there have been no documents
2059
+ * deleted from the index.
2060
+ */
2061
+ static VALUE
2062
+ frt_ir_num_docs(VALUE self)
2063
+ {
2064
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2065
+ return INT2FIX(ir->num_docs(ir));
2066
+ }
2067
+
2068
+ /*
2069
+ * call-seq:
2070
+ * index_reader.undelete_all -> index_reader
2071
+ *
2072
+ * Undelete all deleted documents in the index. This is kind of like a
2073
+ * rollback feature. Note that once an index is committed or a merge happens
2074
+ * during index, deletions will be committed and undelete_all will have no
2075
+ * effect on these documents.
2076
+ */
2077
+ static VALUE
2078
+ frt_ir_undelete_all(VALUE self)
2079
+ {
2080
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2081
+ ir_undelete_all(ir);
2082
+ return self;
2083
+ }
2084
+
2085
+ /*
2086
+ * call-seq:
2087
+ * index_reader.get_document(doc_id) -> LazyDoc
2088
+ * index_reader[doc_id] -> LazyDoc
2089
+ *
2090
+ * Retrieve a document from the index. See LazyDoc for more details on the
2091
+ * document returned. Documents are referenced internally by document ids
2092
+ * which are returned by the Searchers search methods.
2093
+ */
2094
+ static VALUE
2095
+ frt_ir_get_doc(VALUE self, VALUE rdoc_id)
2096
+ {
2097
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2098
+ return frt_get_lazy_doc(ir->get_lazy_doc(ir, FIX2INT(rdoc_id)));
2099
+ }
2100
+
2101
+ /*
2102
+ * call-seq:
2103
+ * index_reader.is_latest? -> bool
2104
+ *
2105
+ * Return true if the index version referenced by this IndexReader is the
2106
+ * latest version of the index. If it isn't you should close and reopen the
2107
+ * index to search the latest documents added to the index.
2108
+ */
2109
+ static VALUE
2110
+ frt_ir_is_latest(VALUE self)
2111
+ {
2112
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2113
+ return ir_is_latest(ir) ? Qtrue : Qfalse;
2114
+ }
2115
+
2116
+ /*
2117
+ * call-seq:
2118
+ * index_reader.term_vector(doc_id, field) -> TermVector
2119
+ *
2120
+ * Return the TermVector for the field +field+ in the document at +doc_id+ in
2121
+ * the index. See TermVector.
2122
+ */
2123
+ static VALUE
2124
+ frt_ir_term_vector(VALUE self, VALUE rdoc_id, VALUE rfield)
2125
+ {
2126
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2127
+ TermVector *tv;
2128
+ VALUE rtv;
2129
+ tv = ir->term_vector(ir, FIX2INT(rdoc_id), frt_field(rfield));
2130
+ rtv = frt_get_tv(tv);
2131
+ tv_destroy(tv);
2132
+ return rtv;
2133
+ }
2134
+
2135
+ static void
2136
+ frt_add_each_tv(void *key, void *value, void *rtvs)
2137
+ {
2138
+ rb_hash_aset((VALUE)rtvs, ID2SYM(rb_intern(key)), frt_get_tv(value));
2139
+ }
2140
+
2141
+ /*
2142
+ * call-seq:
2143
+ * index_reader.term_vectors(doc_id) -> hash of TermVector
2144
+ *
2145
+ * Return the TermVectors for the document at +doc_id+ in the index. The
2146
+ * value returned is a hash of the TermVectors for each field in the document
2147
+ * and they are referenced by field names (as symbols).
2148
+ */
2149
+ static VALUE
2150
+ frt_ir_term_vectors(VALUE self, VALUE rdoc_id)
2151
+ {
2152
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2153
+ HashTable *tvs = ir->term_vectors(ir, FIX2INT(rdoc_id));
2154
+ VALUE rtvs = rb_hash_new();
2155
+ h_each(tvs, &frt_add_each_tv, (void *)rtvs);
2156
+ h_destroy(tvs);
2157
+
2158
+ return rtvs;
2159
+ }
2160
+
2161
+ /*
2162
+ * call-seq:
2163
+ * index_reader.term_docs -> TermDocEnum
2164
+ *
2165
+ * Builds a TermDocEnum (term-document enumerator) for the index. You can use
2166
+ * this object to iterate through the documents in which certain terms occur.
2167
+ * See TermDocEnum for more info.
2168
+ */
2169
+ static VALUE
2170
+ frt_ir_term_docs(VALUE self)
2171
+ {
2172
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2173
+ return frt_get_tde(self, ir->term_docs(ir));
2174
+ }
2175
+
2176
+ /*
2177
+ * call-seq:
2178
+ * index_reader.term_docs_for(field, term) -> TermDocEnum
2179
+ *
2180
+ * Builds a TermDocEnum to iterate through the documents that contain the
2181
+ * term +term+ in the field +field+. See TermDocEnum for more info.
2182
+ */
2183
+ static VALUE
2184
+ frt_ir_term_docs_for(VALUE self, VALUE rfield, VALUE rterm)
2185
+ {
2186
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2187
+ return frt_get_tde(self, ir_term_docs_for(ir,
2188
+ frt_field(rfield),
2189
+ StringValuePtr(rterm)));
2190
+ }
2191
+
2192
+ /*
2193
+ * call-seq:
2194
+ * index_reader.term_positions -> TermDocEnum
2195
+ *
2196
+ * Same as IndexReader#term_docs except the TermDocEnum will also allow you
2197
+ * to scan through the positions at which a term occurs. See TermDocEnum for
2198
+ * more info.
2199
+ */
2200
+ static VALUE
2201
+ frt_ir_term_positions(VALUE self)
2202
+ {
2203
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2204
+ return frt_get_tde(self, ir->term_positions(ir));
2205
+ }
2206
+
2207
+ /*
2208
+ * call-seq:
2209
+ * index_reader.term_positions_for(field, term) -> TermDocEnum
2210
+ *
2211
+ * Same as IndexReader#term_docs_for(field, term) except the TermDocEnum will
2212
+ * also allow you to scan through the positions at which a term occurs. See
2213
+ * TermDocEnum for more info.
2214
+ */
2215
+ static VALUE
2216
+ frt_ir_t_pos_for(VALUE self, VALUE rfield, VALUE rterm)
2217
+ {
2218
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2219
+ return frt_get_tde(self, ir_term_positions_for(ir,
2220
+ frt_field(rfield),
2221
+ StringValuePtr(rterm)));
2222
+ }
2223
+
2224
+ /*
2225
+ * call-seq:
2226
+ * index_reader.doc_freq(field, term) -> integer
2227
+ *
2228
+ * Return the number of documents in which the term +term+ appears in the
2229
+ * field +field+.
2230
+ */
2231
+ static VALUE
2232
+ frt_ir_doc_freq(VALUE self, VALUE rfield, VALUE rterm)
2233
+ {
2234
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2235
+ return INT2FIX(ir_doc_freq(ir,
2236
+ frt_field(rfield),
2237
+ StringValuePtr(rterm)));
2238
+ }
2239
+
2240
+ /*
2241
+ * call-seq:
2242
+ * index_reader.terms(field) -> TermEnum
2243
+ *
2244
+ * Returns a term enumerator which allows you to iterate through all the
2245
+ * terms in the field +field+ in the index.
2246
+ */
2247
+ static VALUE
2248
+ frt_ir_terms(VALUE self, VALUE rfield)
2249
+ {
2250
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2251
+ return frt_get_te(self, ir_terms(ir, frt_field(rfield)));
2252
+ }
2253
+
2254
+ /*
2255
+ * call-seq:
2256
+ * index_reader.terms_from(field, term) -> TermEnum
2257
+ *
2258
+ * Same as IndexReader#terms(fields) except that it starts the enumerator off
2259
+ * at term +term+.
2260
+ */
2261
+ static VALUE
2262
+ frt_ir_terms_from(VALUE self, VALUE rfield, VALUE rterm)
2263
+ {
2264
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2265
+ return frt_get_te(self, ir_terms_from(ir,
2266
+ frt_field(rfield),
2267
+ StringValuePtr(rterm)));
2268
+ }
2269
+
2270
+ /*
2271
+ * call-seq:
2272
+ * index_reader.field_names -> array of field-names
2273
+ *
2274
+ * Returns an array of field names in the index. This can be used to pass to
2275
+ * the QueryParser so that the QueryParser knows how to expand the "*"
2276
+ * wild-card to all fields in the index. A list of field names can also be
2277
+ * gathered from the FieldInfos object.
2278
+ */
2279
+ static VALUE
2280
+ frt_ir_field_names(VALUE self)
2281
+ {
2282
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2283
+ FieldInfos *fis = ir->fis;
2284
+ VALUE rfield_names = rb_ary_new();
2285
+ int i;
2286
+ for (i = 0; i < fis->size; i++) {
2287
+ rb_ary_push(rfield_names, ID2SYM(rb_intern(fis->fields[i]->name)));
2288
+ }
2289
+ return rfield_names;
2290
+ }
2291
+
2292
+ /*
2293
+ * call-seq:
2294
+ * index_reader.field_infos -> FieldInfos
2295
+ *
2296
+ * Get the FieldInfos object for this IndexReader.
2297
+ */
2298
+ static VALUE
2299
+ frt_ir_field_infos(VALUE self)
2300
+ {
2301
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2302
+ return frt_get_field_infos(ir->fis);
2303
+ }
2304
+
2305
+ /****************************************************************************
2306
+ *
2307
+ * Init Functions
2308
+ *
2309
+ ****************************************************************************/
2310
+
2311
+
2312
+ /*
2313
+ * Document-class: Ferret::Index::FieldInfo
2314
+ *
2315
+ * == Summary
2316
+ *
2317
+ * The FieldInfo class is the field descriptor for the index. It specifies
2318
+ * whether a field is compressed or not or whether it should be indexed and
2319
+ * tokenized. Every field has a name which must be a symbol. There are three
2320
+ * properties that you can set, +:store+, +:index+ and +:term_vector+. You
2321
+ * can also set the default +:boost+ for a field as well.
2322
+ *
2323
+ * == Properties
2324
+ *
2325
+ * === :store
2326
+ *
2327
+ * The +:store+ property allows you to specify how a field is stored. You can
2328
+ * leave a field unstored (+:no+), store it in its original format (+:yes+)
2329
+ * or store it in compressed format (+:compressed+). By default the document
2330
+ * is stored in its original format. If the field is large and it is stored
2331
+ * elsewhere where it is easily accessible you might want to leave it
2332
+ * unstored. This will keep the index size a lot smaller and make the
2333
+ * indexing process a lot faster. For example, you should probably leave the
2334
+ * +:content+ field unstored when indexing all the documents in your
2335
+ * file-system.
2336
+ *
2337
+ * === :index
2338
+ *
2339
+ * The +:index+ property allows you to specify how a field is indexed. A
2340
+ * field must be indexed to be searchable. However, a field doesn't need to
2341
+ * be indexed to be stored in the Ferret index. You may want to use the index
2342
+ * as a simple database and store things like images or MP3s in the index. By
2343
+ * default each field is indexed and tokenized (split into tokens) (+:yes+).
2344
+ * If you don't want to index the field use +:no+. If you want the field
2345
+ * indexed but not tokenized, use +:untokenized+. Do this for the fields you
2346
+ * wish to sort by. There are two other values for +:index+; +:omit_norms+
2347
+ * and +:untokenized_omit_norms+. These values correspond to +:yes+ and
2348
+ * +:untokenized+ respectively and are useful if you are not boosting any
2349
+ * fields and you'd like to speed up the index. The norms file is the file
2350
+ * which contains the boost values for each document for a particular field.
2351
+ *
2352
+ * === :term_vector
2353
+ *
2354
+ * See TermVector for a description of term-vectors. You can specify whether
2355
+ * or not you would like to store term-vectors. The available options are
2356
+ * +:no+, +:yes+, +:with_positions+, +:with_offsets+ and
2357
+ * +:with_positions_offsets+. Note that you need to store the positions to
2358
+ * associate offsets with individual terms in the term_vector.
2359
+ *
2360
+ * == Property Table
2361
+ *
2362
+ * Property Value Description
2363
+ * ------------------------------------------------------------------------
2364
+ * :store | :no | Don't store field
2365
+ * | |
2366
+ * | :yes (default) | Store field in its original
2367
+ * | | format. Use this value if you
2368
+ * | | want to highlight matches.
2369
+ * | | or print match excerpts a la
2370
+ * | | Google search.
2371
+ * | |
2372
+ * | :compressed | Store field in compressed
2373
+ * | | format.
2374
+ * -------------|-------------------------|------------------------------
2375
+ * :index | :no | Do not make this field
2376
+ * | | searchable.
2377
+ * | |
2378
+ * | :yes (default) | Make this field searchable and
2379
+ * | | tokenize its contents.
2380
+ * | |
2381
+ * | :untokenized | Make this field searchable but
2382
+ * | | do not tokenize its contents.
2383
+ * | | use this value for fields you
2384
+ * | | wish to sort by.
2385
+ * | |
2386
+ * | :omit_norms | Same as :yes except omit the
2387
+ * | | norms file. The norms file can
2388
+ * | | be omitted if you don't boost
2389
+ * | | any fields and you don't need
2390
+ * | | scoring based on field length.
2391
+ * | |
2392
+ * | :untokenized_omit_norms | Same as :untokenized except omit
2393
+ * | | the norms file. Norms files can
2394
+ * | | be omitted if you don't boost
2395
+ * | | any fields and you don't need
2396
+ * | | scoring based on field length.
2397
+ * | |
2398
+ * -------------|-------------------------|------------------------------
2399
+ * :term_vector | :no | Don't store term-vectors
2400
+ * | |
2401
+ * | :yes | Store term-vectors without
2402
+ * | | storing positions or offsets.
2403
+ * | |
2404
+ * | :with_positions | Store term-vectors with
2405
+ * | | positions.
2406
+ * | |
2407
+ * | :with_offsets | Store term-vectors with
2408
+ * | | offsets.
2409
+ * | |
2410
+ * | :with_positions_offsets | Store term-vectors with
2411
+ * | (default) | positions and offsets.
2412
+ *
2413
+ * == Examples
2414
+ *
2415
+ * fi = FieldInfo.new(:title, :index => :untokenized, :term_vector => :no,
2416
+ * :boost => 10.0)
2417
+ *
2418
+ * fi = FieldInfo.new(:content)
2419
+ *
2420
+ * fi = FieldInfo.new(:created_on, :index => :untokenized_omit_norms,
2421
+ * :term_vector => :no)
2422
+ *
2423
+ * fi = FieldInfo.new(:image, :store => :compressed, :index => :no,
2424
+ * :term_vector => :no)
2425
+ */
2426
+ static void
2427
+ Init_FieldInfo(void)
2428
+ {
2429
+ sym_store = ID2SYM(rb_intern("store"));
2430
+ sym_index = ID2SYM(rb_intern("index"));
2431
+ sym_term_vector = ID2SYM(rb_intern("term_vector"));
2432
+
2433
+ sym_compress = ID2SYM(rb_intern("compress"));
2434
+ sym_compressed = ID2SYM(rb_intern("compressed"));
2435
+
2436
+ sym_untokenized = ID2SYM(rb_intern("untokenized"));
2437
+ sym_omit_norms = ID2SYM(rb_intern("omit_norms"));
2438
+ sym_untokenized_omit_norms = ID2SYM(rb_intern("untokenized_omit_norms"));
2439
+
2440
+ sym_with_positions = ID2SYM(rb_intern("with_positions"));
2441
+ sym_with_offsets = ID2SYM(rb_intern("with_offsets"));
2442
+ sym_with_positions_offsets = ID2SYM(rb_intern("with_positions_offsets"));
2443
+
2444
+ cFieldInfo = rb_define_class_under(mIndex, "FieldInfo", rb_cObject);
2445
+ rb_define_alloc_func(cFieldInfo, frt_data_alloc);
2446
+
2447
+ rb_define_method(cFieldInfo, "initialize", frt_fi_init, -1);
2448
+ rb_define_method(cFieldInfo, "stored?", frt_fi_is_stored, 0);
2449
+ rb_define_method(cFieldInfo, "compressed?", frt_fi_is_compressed, 0);
2450
+ rb_define_method(cFieldInfo, "indexed?", frt_fi_is_indexed, 0);
2451
+ rb_define_method(cFieldInfo, "tokenized?", frt_fi_is_tokenized, 0);
2452
+ rb_define_method(cFieldInfo, "omit_norms?", frt_fi_omit_norms, 0);
2453
+ rb_define_method(cFieldInfo, "store_term_vector?",
2454
+ frt_fi_store_term_vector, 0);
2455
+ rb_define_method(cFieldInfo, "store_positions?",
2456
+ frt_fi_store_positions, 0);
2457
+ rb_define_method(cFieldInfo, "store_offsets?",
2458
+ frt_fi_store_offsets, 0);
2459
+ rb_define_method(cFieldInfo, "has_norms?", frt_fi_has_norms, 0);
2460
+ rb_define_method(cFieldInfo, "boost", frt_fi_boost, 0);
2461
+ rb_define_method(cFieldInfo, "to_s", frt_fi_to_s, 0);
2462
+ }
2463
+
2464
+ /*
2465
+ * Document-class: Ferret::Index::FieldInfos
2466
+ *
2467
+ * == Summary
2468
+ *
2469
+ * The FieldInfos class holds all the field descriptors for an index. It is
2470
+ * this class that is used to create a new index using the
2471
+ * FieldInfos#create_index method. If you are happy with the default
2472
+ * properties for FieldInfo then you don't need to worry about this class.
2473
+ * IndexWriter can create the index for you. Otherwise you should set up the
2474
+ * index like in the example;
2475
+ *
2476
+ * == Example
2477
+ *
2478
+ * field_infos = FieldInfos.new(:term_vector => :no)
2479
+ *
2480
+ * field_infos.add_field(:title, :index => :untokenized, :term_vector => :no,
2481
+ * :boost => 10.0)
2482
+ *
2483
+ * field_infos.add_field(:content)
2484
+ *
2485
+ * field_infos.add_field(:created_on, :index => :untokenized_omit_norms,
2486
+ * :term_vector => :no)
2487
+ *
2488
+ * field_infos.add_field(:image, :store => :compressed, :index => :no,
2489
+ * :term_vector => :no)
2490
+ *
2491
+ * field_infos.create_index("/path/to/index")
2492
+ *
2493
+ * == Default Properties
2494
+ *
2495
+ * See FieldInfo for the available field property values.
2496
+ *
2497
+ * When you create the FieldInfos object you specify the default properties
2498
+ * for the fields. Often you'll specify all of the fields in the index before
2499
+ * you create the index so the default values won't come into play. However,
2500
+ * it is possible to continue to dynamically add fields as indexing goes
2501
+ * along. If you add a document to the index which has fields that the index
2502
+ * doesn't know about then the default properties are used for the new field.
2503
+ */
2504
+ static void
2505
+ Init_FieldInfos(void)
2506
+ {
2507
+ Init_FieldInfo();
2508
+
2509
+ cFieldInfos = rb_define_class_under(mIndex, "FieldInfos", rb_cObject);
2510
+ rb_define_alloc_func(cFieldInfos, frt_data_alloc);
2511
+
2512
+ rb_define_method(cFieldInfos, "initialize", frt_fis_init, -1);
2513
+ rb_define_method(cFieldInfos, "to_a", frt_fis_to_a, 0);
2514
+ rb_define_method(cFieldInfos, "[]", frt_fis_get, 1);
2515
+ rb_define_method(cFieldInfos, "add", frt_fis_add, 1);
2516
+ rb_define_method(cFieldInfos, "<<", frt_fis_add, 1);
2517
+ rb_define_method(cFieldInfos, "add_field", frt_fis_add_field, -1);
2518
+ rb_define_method(cFieldInfos, "each", frt_fis_each, 0);
2519
+ rb_define_method(cFieldInfos, "to_s", frt_fis_to_s, 0);
2520
+ rb_define_method(cFieldInfos, "create_index",
2521
+ frt_fis_create_index, 1);
2522
+ rb_define_method(cFieldInfos, "fields", frt_fis_get_fields, -1);
2523
+ }
2524
+
2525
+ /*
2526
+ * Document-class: Ferret::Index::TermEnum
2527
+ *
2528
+ * == Summary
2529
+ *
2530
+ * The TermEnum object is used to iterate through the terms in a field. To
2531
+ * get a TermEnum you need to use the IndexReader#terms(field) method.
2532
+ *
2533
+ * == Example
2534
+ *
2535
+ * te = index_reader.terms(:content)
2536
+ *
2537
+ * te.each {|term, doc_freq| puts "#{term} occurred #{doc_freq} times" }
2538
+ *
2539
+ * # or you could do it like this;
2540
+ * te = index_reader.terms(:content)
2541
+ *
2542
+ * while te.next?
2543
+ * puts "#{te.term} occurred in #{te.doc_freq} documents in the index"
2544
+ * end
2545
+ */
2546
+ static void
2547
+ Init_TermEnum(void)
2548
+ {
2549
+ id_term = rb_intern("@term");
2550
+
2551
+ cTermEnum = rb_define_class_under(mIndex, "TermEnum", rb_cObject);
2552
+ rb_define_alloc_func(cTermEnum, frt_data_alloc);
2553
+
2554
+ rb_define_method(cTermEnum, "next?", frt_te_next, 0);
2555
+ rb_define_method(cTermEnum, "term", frt_te_term, 0);
2556
+ rb_define_method(cTermEnum, "doc_freq", frt_te_doc_freq, 0);
2557
+ rb_define_method(cTermEnum, "skip_to", frt_te_skip_to, 1);
2558
+ rb_define_method(cTermEnum, "each", frt_te_each, 0);
2559
+ rb_define_method(cTermEnum, "field=", frt_te_set_field, 1);
2560
+ rb_define_method(cTermEnum, "set_field",frt_te_set_field, 1);
2561
+ }
2562
+
2563
+ /*
2564
+ * Document-class: Ferret::Index::TermDocEnum
2565
+ *
2566
+ * == Summary
2567
+ *
2568
+ * Use a TermDocEnum to iterate through the documents that contain a
2569
+ * particular term. You can also iterate through the positions which the term
2570
+ * occurs in a document.
2571
+ *
2572
+ *
2573
+ * == Example
2574
+ *
2575
+ * tde = index_reader.term_docs_for(:content, "fox")
2576
+ *
2577
+ * tde.each do |doc_id, freq|
2578
+ * puts "fox appeared #{freq} times in document #{doc_id}:"
2579
+ * positions = []
2580
+ * tde.each_position {|pos| positions << pos}
2581
+ * puts " #{positions.join(', ')}"
2582
+ * end
2583
+ *
2584
+ * # or you can do it like this;
2585
+ * tde.seek(:title, "red")
2586
+ * while tde.next?
2587
+ * puts "red appeared #{tde.freq} times in document #{tde.doc}:"
2588
+ * positions = []
2589
+ * while pos = tde.next_position
2590
+ * positions << pos
2591
+ * end
2592
+ * puts " #{positions.join(', ')}"
2593
+ * end
2594
+ */
2595
+ static void
2596
+ Init_TermDocEnum(void)
2597
+ {
2598
+ id_fld_num_map = rb_intern("@field_num_map");
2599
+ id_field_num = rb_intern("@field_num");
2600
+
2601
+ cTermDocEnum = rb_define_class_under(mIndex, "TermDocEnum", rb_cObject);
2602
+ rb_define_alloc_func(cTermDocEnum, frt_data_alloc);
2603
+ rb_define_method(cTermDocEnum, "seek", frt_tde_seek, 2);
2604
+ rb_define_method(cTermDocEnum, "seek_term_enum", frt_tde_seek_te, 1);
2605
+ rb_define_method(cTermDocEnum, "doc", frt_tde_doc, 0);
2606
+ rb_define_method(cTermDocEnum, "freq", frt_tde_freq, 0);
2607
+ rb_define_method(cTermDocEnum, "next?", frt_tde_next, 0);
2608
+ rb_define_method(cTermDocEnum, "next_position", frt_tde_next_position, 0);
2609
+ rb_define_method(cTermDocEnum, "each", frt_tde_each, 0);
2610
+ rb_define_method(cTermDocEnum, "each_position", frt_tde_each_position, 0);
2611
+ rb_define_method(cTermDocEnum, "skip_to", frt_tde_skip_to, 1);
2612
+ }
2613
+
2614
+ /*
2615
+ * Document-class: Ferret::Index::TermVector::TVOffsets
2616
+ *
2617
+ * == Summary
2618
+ *
2619
+ * Holds the start and end byte-offsets of a term in a field. For example, if
2620
+ * the field was "the quick brown fox" then the start and end offsets of
2621
+ * ["the", "quick", "brown", "fox"] would be [(0,3), (4,9), (10,15), (16,19)]
2622
+ * respectively. See the Analysis module for more information on setting the
2623
+ * offsets.
2624
+ */
2625
+ static void
2626
+ Init_TVOffsets(void)
2627
+ {
2628
+ const char *tv_offsets_class = "TVOffsets";
2629
+ cTVOffsets = rb_struct_define(tv_offsets_class, "start", "end", NULL);
2630
+ rb_set_class_path(cTVOffsets, cTermVector, tv_offsets_class);
2631
+ rb_const_set(mIndex, rb_intern(tv_offsets_class), cTVOffsets);
2632
+ }
2633
+
2634
+ /*
2635
+ * Document-class: Ferret::Index::TermVector::TVTerm
2636
+ *
2637
+ * == Summary
2638
+ *
2639
+ * The TVTerm class holds the term information for each term in a TermVector.
2640
+ * That is it holds the term's text and its positions in the document. You
2641
+ * can use those positions to reference the offsets for the term.
2642
+ *
2643
+ * == Example
2644
+ *
2645
+ * tv = index_reader.term_vector(:content)
2646
+ * tv_term = tv.terms.find {|tvt| tvt.text == "fox"}
2647
+ * offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
2648
+ */
2649
+ static void
2650
+ Init_TVTerm(void)
2651
+ {
2652
+ const char *tv_term_class = "TVTerm";
2653
+ cTVTerm = rb_struct_define(tv_term_class, "text", "positions", NULL);
2654
+ rb_set_class_path(cTVTerm, cTermVector, tv_term_class);
2655
+ rb_const_set(mIndex, rb_intern(tv_term_class), cTVTerm);
2656
+ }
2657
+
2658
+ /*
2659
+ * Document-class: Ferret::Index::TermVector
2660
+ *
2661
+ * == Summary
2662
+ *
2663
+ * TermVectors are most commonly used for creating search result excerpts and
2664
+ * highlighting search matches in results. This is all done internally so you
2665
+ * won't need to worry about the TermVector object. There are some other
2666
+ * reasons you may want to use the TermVectors object however. For example,
2667
+ * you may wish to see which terms are the most commonly occurring terms in a
2668
+ * document to implement a MoreLikeThis search.
2669
+ *
2670
+ * == Example
2671
+ *
2672
+ * tv = index_reader.term_vector(:content)
2673
+ * tv_term = tv.terms.find {|tvt| tvt.text == "fox"}
2674
+ *
2675
+ * # get the term frequency
2676
+ * term_freq = tv_term.positions.size
2677
+ *
2678
+ * # get the offsets for a term
2679
+ * offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
2680
+ *
2681
+ * == Note
2682
+ *
2683
+ * +positions+ and +offsets+ can be +nil+ depending on what you set the
2684
+ * +:term_vector+ to when you set the FieldInfo object for the field. Note in
2685
+ * particular that you need to store both positions and offsets if you want
2686
+ * to associate offsets with particular terms.
2687
+ */
2688
+ static void
2689
+ Init_TermVector(void)
2690
+ {
2691
+ const char *tv_class = "TermVector";
2692
+ cTermVector = rb_struct_define(tv_class,
2693
+ "field", "terms", "offsets", NULL);
2694
+ rb_set_class_path(cTermVector, mIndex, tv_class);
2695
+ rb_const_set(mIndex, rb_intern(tv_class), cTermVector);
2696
+
2697
+ Init_TVOffsets();
2698
+ Init_TVTerm();
2699
+ }
2700
+
2701
+ /*
2702
+ * Document-class: Ferret::Index::IndexWriter
2703
+ *
2704
+ * == Summary
2705
+ *
2706
+ * The IndexWriter is the class used to add documents to an index. You can
2707
+ * also delete documents from the index using this class. The indexing
2708
+ * process is highly customizable and the IndexWriter has the following
2709
+ * parameters;
2710
+ *
2711
+ * dir:: This is a Ferret::Store::Directory object. You
2712
+ * should either pass a +:dir+ or a +:path+ when
2713
+ * creating an index.
2714
+ * path:: A string representing the path to the index
2715
+ * directory. If you are creating the index for the
2716
+ * first time the directory will be created if it's
2717
+ * missing. You should not choose a directory which
2718
+ * contains other files as they could be over-written.
2719
+ * To protect against this set +:create_if_missing+ to
2720
+ * false.
2721
+ * create_if_missing:: Default: true. Create the index if no index is
2722
+ * found in the specified directory. Otherwise, use
2723
+ * the existing index.
2724
+ * create:: Default: false. Creates the index, even if one
2725
+ * already exists. That means any existing index will
2726
+ * be deleted. It is probably better to use the
2727
+ * create_if_missing option so that the index is only
2728
+ * created the first time when it doesn't exist.
2729
+ * field_infos:: Default FieldInfos.new. The FieldInfos object to use
2730
+ * when creating a new index if +:create_if_missing+ or
2731
+ * +:create+ is set to true. If an existing index is
2732
+ * opened then this parameter is ignored.
2733
+ * analyzer:: Default: Ferret::Analysis::StandardAnalyzer.
2734
+ * Sets the default analyzer for the index. This is
2735
+ * used by both the IndexWriter and the QueryParser
2736
+ * to tokenize the input. The default is the
2737
+ * StandardAnalyzer.
2738
+ * chunk_size:: Default: 0x100000 or 1Mb. Memory performance tuning
2739
+ * parameter. Sets the default size of chunks of memory
2740
+ * malloced for use during indexing. You can usually
2741
+ * leave this parameter as is.
2742
+ * max_buffer_memory:: Default: 0x1000000 or 16Mb. Memory performance
2743
+ * tuning parameter. Sets the amount of memory to be
2744
+ * used by the indexing process. Set to a larger value
2745
+ * to increase indexing speed. Note that this only
2746
+ * includes memory used by the indexing process, not
2747
+ * the rest of your ruby application.
2748
+ * term_index_interval:: Default: 128. The skip interval between terms in the
2749
+ * term dictionary. A smaller value will possibly
2750
+ * increase search performance while also increasing
2751
+ * memory usage and negatively impacting
2752
+ * indexing performance.
2753
+ * doc_skip_interval:: Default: 16. The skip interval for document numbers
2754
+ * in the index. As with +:term_index_interval+ you
2755
+ * have a trade-off. A smaller number may increase
2756
+ * search performance while also increasing memory
2757
+ * usage and negatively impacting indexing
2758
+ * performance.
2759
+ * merge_factor:: Default: 10. This must never be less than 2.
2760
+ * Specifies the number of segments of a certain size
2761
+ * that must exist before they are merged. A larger
2762
+ * value will improve indexing performance while
2763
+ * slowing search performance.
2764
+ * max_buffered_docs:: Default: 10000. The maximum number of documents that
2765
+ * may be stored in memory before being written to the
2766
+ * index. If you have a lot of memory and are indexing
2767
+ * a large number of small documents (like products in
2768
+ * a product database for example) you may want to set
2769
+ * this to a much higher number (like
2770
+ * Ferret::FIX_INT_MAX). If you are worried about your
2771
+ * application crashing in the middle of indexing you
2772
+ * might set this to a smaller number so that the index
2773
+ * is committed more often. This is like having an
2774
+ * auto-save in a word processor application.
2775
+ * max_merge_docs:: Set this value to limit the number of documents that
2776
+ * go into a single segment. Use this to avoid
2777
+ * extremely long merge times during indexing which can
2778
+ * make your application seem unresponsive. This is
2779
+ * only necessary for very large indexes (millions of
2780
+ * documents).
2781
+ * max_field_length:: Default: 10000. The maximum number of terms added to
2782
+ * a single field. This can be useful to protect the
2783
+ * indexer when indexing documents from the web for
2784
+ * example. Usually the most important terms will occur
2785
+ * early on in a document so you can often safely
2786
+ * ignore the terms in a field after a certain number
2787
+ * of them. If you wanted to speed up indexing and save
2788
+ * space in your index you may only want to index the
2789
+ * first 1000 terms in a field. On the other hand, if
2790
+ * you want to be more thorough and you are indexing
2791
+ * documents from your file-system you may set this
2792
+ * parameter to Ferret::FIX_INT_MAX.
2793
+ * use_compound_file:: Default: true. Uses a compound file to store the
2794
+ * index. This prevents an error being raised for
2795
+ * having too many files open at the same time. The
2796
+ * default is true but performance is better if this is
2797
+ * set to false.
2798
+ *
2799
+ *
2800
+ * === Deleting Documents
2801
+ *
2802
+ * Both IndexReader and IndexWriter allow you to delete documents. You should
2803
+ * use the IndexReader to delete documents by document id and IndexWriter to
2804
+ * delete documents by term which we'll explain now. It is preferable to
2805
+ * delete documents from an index using IndexWriter for performance reasons.
2806
+ * To delete documents using the IndexWriter you should give each document in
2807
+ * the index a unique ID. If you are indexing documents from the file-system
2808
+ * this unique ID will be the full file path. If indexing documents from the
2809
+ * database you should use the primary key as the ID field. You can then
2810
+ * use the delete method to delete a file referenced by the ID. For example;
2811
+ *
2812
+ * index_writer.delete(:id, "/path/to/indexed/file")
2813
+ */
2814
+ void
2815
+ Init_IndexWriter(void)
2816
+ {
2817
+ id_boost = rb_intern("boost");
2818
+
2819
+ sym_create = ID2SYM(rb_intern("create"));
2820
+ sym_create_if_missing = ID2SYM(rb_intern("create_if_missing"));
2821
+ sym_field_infos = ID2SYM(rb_intern("field_infos"));
2822
+
2823
+ sym_chunk_size = ID2SYM(rb_intern("chunk_size"));
2824
+ sym_max_buffer_memory = ID2SYM(rb_intern("max_buffer_memory"));
2825
+ sym_index_interval = ID2SYM(rb_intern("term_index_interval"));
2826
+ sym_skip_interval = ID2SYM(rb_intern("doc_skip_interval"));
2827
+ sym_merge_factor = ID2SYM(rb_intern("merge_factor"));
2828
+ sym_max_buffered_docs = ID2SYM(rb_intern("max_buffered_docs"));
2829
+ sym_max_merge_docs = ID2SYM(rb_intern("max_merge_docs"));
2830
+ sym_max_field_length = ID2SYM(rb_intern("max_field_length"));
2831
+ sym_use_compound_file = ID2SYM(rb_intern("use_compound_file"));
2832
+
2833
+ cIndexWriter = rb_define_class_under(mIndex, "IndexWriter", rb_cObject);
2834
+ rb_define_alloc_func(cIndexWriter, frt_data_alloc);
2835
+
2836
+ rb_define_const(cIndexWriter, "WRITE_LOCK_TIMEOUT", INT2FIX(1));
2837
+ rb_define_const(cIndexWriter, "COMMIT_LOCK_TIMEOUT", INT2FIX(10));
2838
+ rb_define_const(cIndexWriter, "WRITE_LOCK_NAME",
2839
+ rb_str_new2(WRITE_LOCK_NAME));
2840
+ rb_define_const(cIndexWriter, "COMMIT_LOCK_NAME",
2841
+ rb_str_new2(COMMIT_LOCK_NAME));
2842
+ rb_define_const(cIndexWriter, "DEFAULT_CHUNK_SIZE",
2843
+ INT2FIX(default_config.chunk_size));
2844
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_BUFFER_MEMORY",
2845
+ INT2FIX(default_config.max_buffer_memory));
2846
+ rb_define_const(cIndexWriter, "DEFAULT_TERM_INDEX_INTERVAL",
2847
+ INT2FIX(default_config.index_interval));
2848
+ rb_define_const(cIndexWriter, "DEFAULT_DOC_SKIP_INTERVAL",
2849
+ INT2FIX(default_config.skip_interval));
2850
+ rb_define_const(cIndexWriter, "DEFAULT_MERGE_FACTOR",
2851
+ INT2FIX(default_config.merge_factor));
2852
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_BUFFERED_DOCS",
2853
+ INT2FIX(default_config.max_buffered_docs));
2854
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_MERGE_DOCS",
2855
+ INT2FIX(default_config.max_merge_docs));
2856
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_FIELD_LENGTH",
2857
+ INT2FIX(default_config.max_field_length));
2858
+ rb_define_const(cIndexWriter, "DEFAULT_USE_COMPOUND_FILE",
2859
+ default_config.use_compound_file ? Qtrue : Qfalse);
2860
+
2861
+ rb_define_method(cIndexWriter, "initialize", frt_iw_init, -1);
2862
+ rb_define_method(cIndexWriter, "doc_count", frt_iw_get_doc_count, 0);
2863
+ rb_define_method(cIndexWriter, "close", frt_iw_close, 0);
2864
+ rb_define_method(cIndexWriter, "add_document", frt_iw_add_doc, 1);
2865
+ rb_define_method(cIndexWriter, "<<", frt_iw_add_doc, 1);
2866
+ rb_define_method(cIndexWriter, "optimize", frt_iw_optimize, 0);
2867
+ rb_define_method(cIndexWriter, "commit", frt_iw_commit, 0);
2868
+ rb_define_method(cIndexWriter, "add_readers", frt_iw_add_readers, 1);
2869
+ rb_define_method(cIndexWriter, "delete", frt_iw_delete, 2);
2870
+ rb_define_method(cIndexWriter, "field_infos", frt_iw_field_infos, 0);
2871
+ rb_define_method(cIndexWriter, "analyzer", frt_iw_get_analyzer, 0);
2872
+ rb_define_method(cIndexWriter, "analyzer=", frt_iw_set_analyzer, 1);
2873
+
2874
+ rb_define_method(cIndexWriter, "chunk_size",
2875
+ frt_iw_get_chunk_size, 0);
2876
+ rb_define_method(cIndexWriter, "chunk_size=",
2877
+ frt_iw_set_chunk_size, 1);
2878
+
2879
+ rb_define_method(cIndexWriter, "max_buffer_memory",
2880
+ frt_iw_get_max_buffer_memory, 0);
2881
+ rb_define_method(cIndexWriter, "max_buffer_memory=",
2882
+ frt_iw_set_max_buffer_memory, 1);
2883
+
2884
+ rb_define_method(cIndexWriter, "term_index_interval",
2885
+ frt_iw_get_index_interval, 0);
2886
+ rb_define_method(cIndexWriter, "term_index_interval=",
2887
+ frt_iw_set_index_interval, 1);
2888
+
2889
+ rb_define_method(cIndexWriter, "doc_skip_interval",
2890
+ frt_iw_get_skip_interval, 0);
2891
+ rb_define_method(cIndexWriter, "doc_skip_interval=",
2892
+ frt_iw_set_skip_interval, 1);
2893
+
2894
+ rb_define_method(cIndexWriter, "merge_factor",
2895
+ frt_iw_get_merge_factor, 0);
2896
+ rb_define_method(cIndexWriter, "merge_factor=",
2897
+ frt_iw_set_merge_factor, 1);
2898
+
2899
+ rb_define_method(cIndexWriter, "max_buffered_docs",
2900
+ frt_iw_get_max_buffered_docs, 0);
2901
+ rb_define_method(cIndexWriter, "max_buffered_docs=",
2902
+ frt_iw_set_max_buffered_docs, 1);
2903
+
2904
+ rb_define_method(cIndexWriter, "max_merge_docs",
2905
+ frt_iw_get_max_merge_docs, 0);
2906
+ rb_define_method(cIndexWriter, "max_merge_docs=",
2907
+ frt_iw_set_max_merge_docs, 1);
2908
+
2909
+ rb_define_method(cIndexWriter, "max_field_length",
2910
+ frt_iw_get_max_field_length, 0);
2911
+ rb_define_method(cIndexWriter, "max_field_length=",
2912
+ frt_iw_set_max_field_length, 1);
2913
+
2914
+ rb_define_method(cIndexWriter, "use_compound_file",
2915
+ frt_iw_get_use_compound_file, 0);
2916
+ rb_define_method(cIndexWriter, "use_compound_file=",
2917
+ frt_iw_set_use_compound_file, 1);
2918
+
2919
+ }
2920
+
2921
+ /*
2922
+ * Document-class: Ferret::Index::LazyDoc
2923
+ *
2924
+ * == Summary
2925
+ *
2926
+ * When a document is retrieved from the index a LazyDoc is returned.
2927
+ * Actually, LazyDoc is just a modified Hash object which lazily adds fields
2928
+ * to itself when they are accessed. You should note that the keys method
2929
+ * will return nothing until you actually access one of the fields. To see
2930
+ * what fields are available use LazyDoc#fields rather than LazyDoc#keys. To
2931
+ * load all fields use the LazyDoc#load method.
2932
+ *
2933
+ * == Example
2934
+ *
2935
+ * doc = index_reader[0]
2936
+ *
2937
+ * doc.keys #=> []
2938
+ * doc.values #=> []
2939
+ * doc.fields #=> [:title, :content]
2940
+ *
2941
+ * title = doc[:title] #=> "the title"
2942
+ * doc.keys #=> [:title]
2943
+ * doc.values #=> ["the title"]
2944
+ * doc.fields #=> [:title, :content]
2945
+ *
2946
+ * doc.load
2947
+ * doc.keys #=> [:title, :content]
2948
+ * doc.values #=> ["the title", "the content"]
2949
+ * doc.fields #=> [:title, :content]
2950
+ */
2951
+ void
2952
+ Init_LazyDoc(void)
2953
+ {
2954
+ id_fields = rb_intern("@fields");
2955
+
2956
+
2957
+ cLazyDoc = rb_define_class_under(mIndex, "LazyDoc", rb_cHash);
2958
+ rb_define_method(cLazyDoc, "default", frt_lzd_default, 1);
2959
+ rb_define_method(cLazyDoc, "load", frt_lzd_load, 0);
2960
+ rb_define_method(cLazyDoc, "fields", frt_lzd_fields, 0);
2961
+
2962
+ cLazyDocData = rb_define_class_under(cLazyDoc, "LazyDocData", rb_cObject);
2963
+ rb_define_alloc_func(cLazyDocData, frt_data_alloc);
2964
+ }
2965
+
2966
+ /*
2967
+ * Document-class: Ferret::Index::IndexReader
2968
+ *
2969
+ * == Summary
2970
+ *
2971
+ * IndexReader is used for reading data from the index. This class is usually
2972
+ * used directly for more advanced tasks like iterating through terms in an
2973
+ * index, accessing term-vectors or deleting documents by document id. It is
2974
+ * also used internally by IndexSearcher.
2975
+ */
2976
+ void
2977
+ Init_IndexReader(void)
2978
+ {
2979
+ cIndexReader = rb_define_class_under(mIndex, "IndexReader", rb_cObject);
2980
+ rb_define_alloc_func(cIndexReader, frt_data_alloc);
2981
+ rb_define_method(cIndexReader, "initialize", frt_ir_init, 1);
2982
+ rb_define_method(cIndexReader, "set_norm", frt_ir_set_norm, 3);
2983
+ rb_define_method(cIndexReader, "norms", frt_ir_norms, 1);
2984
+ rb_define_method(cIndexReader, "get_norms_into",frt_ir_get_norms_into, 3);
2985
+ rb_define_method(cIndexReader, "commit", frt_ir_commit, 0);
2986
+ rb_define_method(cIndexReader, "close", frt_ir_close, 0);
2987
+ rb_define_method(cIndexReader, "has_deletions?",frt_ir_has_deletions, 0);
2988
+ rb_define_method(cIndexReader, "delete", frt_ir_delete, 1);
2989
+ rb_define_method(cIndexReader, "deleted?", frt_ir_is_deleted, 1);
2990
+ rb_define_method(cIndexReader, "max_doc", frt_ir_max_doc, 0);
2991
+ rb_define_method(cIndexReader, "num_docs", frt_ir_num_docs, 0);
2992
+ rb_define_method(cIndexReader, "undelete_all", frt_ir_undelete_all, 0);
2993
+ rb_define_method(cIndexReader, "latest?", frt_ir_is_latest, 0);
2994
+ rb_define_method(cIndexReader, "get_document", frt_ir_get_doc, 1);
2995
+ rb_define_method(cIndexReader, "[]", frt_ir_get_doc, 1);
2996
+ rb_define_method(cIndexReader, "term_vector", frt_ir_term_vector, 2);
2997
+ rb_define_method(cIndexReader, "term_vectors", frt_ir_term_vectors, 1);
2998
+ rb_define_method(cIndexReader, "term_docs", frt_ir_term_docs, 0);
2999
+ rb_define_method(cIndexReader, "term_positions",frt_ir_term_positions, 0);
3000
+ rb_define_method(cIndexReader, "term_docs_for", frt_ir_term_docs_for, 2);
3001
+ rb_define_method(cIndexReader, "term_positions_for", frt_ir_t_pos_for, 2);
3002
+ rb_define_method(cIndexReader, "doc_freq", frt_ir_doc_freq, 2);
3003
+ rb_define_method(cIndexReader, "terms", frt_ir_terms, 1);
3004
+ rb_define_method(cIndexReader, "terms_from", frt_ir_terms_from, 2);
3005
+ rb_define_method(cIndexReader, "field_names", frt_ir_field_names, 0);
3006
+ rb_define_method(cIndexReader, "field_infos", frt_ir_field_infos, 0);
3007
+ }
3008
+
3009
+ /* rdoc hack
3010
+ extern VALUE mFerret = rb_define_module("Ferret");
3011
+ */
3012
+
3013
+ /*
3014
+ * Document-module: Ferret::Index
3015
+ *
3016
+ * == Summary
3017
+ *
3018
+ * The Index module contains all the classes used for adding to and
3019
+ * retrieving from the index. The important classes to know about are;
3020
+ *
3021
+ * * FieldInfo
3022
+ * * FieldInfos
3023
+ * * IndexWriter
3024
+ * * IndexReader
3025
+ * * LazyDoc
3026
+ *
3027
+ * The other classes in this module are useful for more advanced uses like
3028
+ * building tag clouds, creating more-like-this queries, custom highlighting
3029
+ * etc. They are also useful for index browsers.
3030
+ */
3031
+ void
3032
+ Init_Index(void)
3033
+ {
3034
+ mIndex = rb_define_module_under(mFerret, "Index");
3035
+
3036
+ sym_boost = ID2SYM(rb_intern("boost"));
3037
+ sym_analyzer = ID2SYM(rb_intern("analyzer"));
3038
+ sym_close_dir = ID2SYM(rb_intern("close_dir"));
3039
+
3040
+ Init_TermVector();
3041
+ Init_TermEnum();
3042
+ Init_TermDocEnum();
3043
+
3044
+ Init_FieldInfos();
3045
+
3046
+ Init_LazyDoc();
3047
+ Init_IndexWriter();
3048
+ Init_IndexReader();
3049
+ }