ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/r_index.c ADDED
@@ -0,0 +1,3049 @@
1
+ #include "ferret.h"
2
+ #include "index.h"
3
+ #include <st.h>
4
+
5
+ VALUE mIndex;
6
+
7
+ VALUE cFieldInfo;
8
+ VALUE cFieldInfos;
9
+
10
+ VALUE cTVOffsets;
11
+ VALUE cTVTerm;
12
+ VALUE cTermVector;
13
+
14
+ VALUE cTermEnum;
15
+ VALUE cTermDocEnum;
16
+
17
+ VALUE cLazyDoc;
18
+ VALUE cLazyDocData;
19
+ VALUE cIndexWriter;
20
+ VALUE cIndexReader;
21
+
22
+ VALUE sym_analyzer;
23
+ static VALUE sym_close_dir;
24
+ static VALUE sym_create;
25
+ static VALUE sym_create_if_missing;
26
+
27
+ static VALUE sym_chunk_size;
28
+ static VALUE sym_max_buffer_memory;
29
+ static VALUE sym_index_interval;
30
+ static VALUE sym_skip_interval;
31
+ static VALUE sym_merge_factor;
32
+ static VALUE sym_max_buffered_docs;
33
+ static VALUE sym_max_merge_docs;
34
+ static VALUE sym_max_field_length;
35
+ static VALUE sym_use_compound_file;
36
+
37
+ static VALUE sym_boost;
38
+ static VALUE sym_field_infos;
39
+
40
+ static VALUE sym_store;
41
+ static VALUE sym_index;
42
+ static VALUE sym_term_vector;
43
+
44
+ static VALUE sym_compress;
45
+ static VALUE sym_compressed;
46
+
47
+ static VALUE sym_untokenized;
48
+ static VALUE sym_omit_norms;
49
+ static VALUE sym_untokenized_omit_norms;
50
+
51
+ static VALUE sym_with_positions;
52
+ static VALUE sym_with_offsets;
53
+ static VALUE sym_with_positions_offsets;
54
+
55
+ static ID id_term;
56
+ static ID id_fields;
57
+ static ID id_fld_num_map;
58
+ static ID id_field_num;
59
+ static ID id_boost;
60
+
61
+ extern void frt_set_term(VALUE rterm, Term *t);
62
+ extern Analyzer *frt_get_cwrapped_analyzer(VALUE ranalyzer);
63
+ extern VALUE frt_get_analyzer(Analyzer *a);
64
+
65
+ /****************************************************************************
66
+ *
67
+ * FieldInfo Methods
68
+ *
69
+ ****************************************************************************/
70
+
71
+ static void
72
+ frt_fi_free(void *p)
73
+ {
74
+ object_del(p);
75
+ fi_deref((FieldInfo *)p);
76
+ }
77
+
78
+ static void /* Read :boost, :store, :index and :term_vector from the options hash into the out-parameters. */
79
+ frt_fi_get_params(VALUE roptions,
80
+ enum StoreValues *store,
81
+ enum IndexValues *index,
82
+ enum TermVectorValues *term_vector,
83
+ float *boost)
84
+ {
85
+ VALUE v; /* value of the option currently being examined */
86
+ Check_Type(roptions, T_HASH);
87
+ v = rb_hash_aref(roptions, sym_boost);
88
+ if (Qnil != v) {
89
+ *boost = (float)NUM2DBL(v);
90
+ } else {
91
+ *boost = 1.0f; /* no :boost given */
92
+ }
93
+ v = rb_hash_aref(roptions, sym_store);
94
+ if (Qnil != v && Qtrue != v && Qfalse != v) Check_Type(v, T_SYMBOL); /* true/false are accepted below, so only type-check other values */
95
+ if (v == sym_no || v == sym_false || v == Qfalse) {
96
+ *store = STORE_NO;
97
+ } else if (v == sym_yes || v == sym_true || v == Qtrue || v == Qnil) {
98
+ *store = STORE_YES;
99
+ } else if (v == sym_compress || v == sym_compressed) {
100
+ *store = STORE_COMPRESS;
101
+ } else {
102
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for :store."
103
+ " Please choose from [:yes, :no, :compressed]",
104
+ rb_id2name(SYM2ID(v))); /* v is a Symbol here: every other type was rejected above */
105
+ }
106
+
107
+ v = rb_hash_aref(roptions, sym_index);
108
+ if (Qnil != v && Qtrue != v && Qfalse != v) Check_Type(v, T_SYMBOL); /* true/false are accepted below */
109
+ if (v == sym_no || v == sym_false || v == Qfalse) {
110
+ *index = INDEX_NO;
111
+ } else if (v == sym_yes || v == sym_true || v == Qtrue || v == Qnil) {
112
+ *index = INDEX_YES;
113
+ } else if (v == sym_untokenized) {
114
+ *index = INDEX_UNTOKENIZED;
115
+ } else if (v == sym_omit_norms) {
116
+ *index = INDEX_YES_OMIT_NORMS;
117
+ } else if (v == sym_untokenized_omit_norms) {
118
+ *index = INDEX_UNTOKENIZED_OMIT_NORMS;
119
+ } else {
120
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for :index."
121
+ " Please choose from [:no, :yes, :untokenized, "
122
+ ":omit_norms, :untokenized_omit_norms]",
123
+ rb_id2name(SYM2ID(v)));
124
+ }
125
+
126
+ v = rb_hash_aref(roptions, sym_term_vector);
127
+ if (Qnil != v && Qtrue != v && Qfalse != v) Check_Type(v, T_SYMBOL); /* true/false are accepted below */
128
+ if (v == sym_no || v == sym_false || v == Qfalse) {
129
+ *term_vector = TERM_VECTOR_NO;
130
+ } else if (v == sym_yes || v == sym_true || v == Qtrue) {
131
+ *term_vector = TERM_VECTOR_YES;
132
+ } else if (v == sym_with_positions) {
133
+ *term_vector = TERM_VECTOR_WITH_POSITIONS;
134
+ } else if (v == sym_with_offsets) {
135
+ *term_vector = TERM_VECTOR_WITH_OFFSETS;
136
+ } else if (v == sym_with_positions_offsets || v == Qnil) { /* a missing :term_vector selects positions and offsets */
137
+ *term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
138
+ } else {
139
+ rb_raise(rb_eArgError, ":%s isn't a valid argument for "
140
+ ":term_vector. Please choose from [:no, :yes, "
141
+ ":with_positions, :with_offsets, "
142
+ ":with_positions_offsets]",
143
+ rb_id2name(SYM2ID(v)));
144
+ }
145
+ }
146
+
147
+ static VALUE
148
+ frt_get_field_info(FieldInfo *fi)
149
+ {
150
+
151
+ VALUE rfi = Qnil;
152
+ if (fi) {
153
+ rfi = object_get(fi);
154
+ if (rfi == Qnil) {
155
+ rfi = Data_Wrap_Struct(cFieldInfo, NULL, &frt_fi_free, fi);
156
+ REF(fi);
157
+ object_add(fi, rfi);
158
+ }
159
+ }
160
+ return rfi;
161
+ }
162
+
163
+ /*
164
+ * call-seq:
165
+ * FieldInfo.new(name, options = {}) -> field_info
166
+ *
167
+ * Create a new FieldInfo object with the name +name+ and the properties
168
+ * specified in +options+. The available options are [:store, :index,
169
+ * :term_vector, :boost]. See the description of FieldInfo for more
170
+ * information on these properties.
171
+ */
172
+ static VALUE
173
+ frt_fi_init(int argc, VALUE *argv, VALUE self)
174
+ {
175
+ VALUE roptions, rname;
176
+ FieldInfo *fi;
177
+ enum StoreValues store = STORE_YES;
178
+ enum IndexValues index = INDEX_YES;
179
+ enum TermVectorValues term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
180
+ float boost = 1.0f;
181
+
182
+ rb_scan_args(argc, argv, "11", &rname, &roptions);
183
+ if (argc > 1) {
184
+ frt_fi_get_params(roptions, &store, &index, &term_vector, &boost);
185
+ }
186
+ fi = fi_new(frt_field(rname), store, index, term_vector);
187
+ fi->boost = boost;
188
+ Frt_Wrap_Struct(self, NULL, &frt_fi_free, fi);
189
+ object_add(fi, self);
190
+ return self;
191
+ }
192
+
193
+ /*
194
+ * call-seq:
195
+ * fi.stored? -> bool
196
+ *
197
+ * Return true if the field is stored in the index.
198
+ */
199
+ static VALUE
200
+ frt_fi_is_stored(VALUE self)
201
+ {
202
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
203
+ return fi_is_stored(fi) ? Qtrue : Qfalse;
204
+ }
205
+
206
+ /*
207
+ * call-seq:
208
+ * fi.compressed? -> bool
209
+ *
210
+ * Return true if the field is stored in the index in compressed format.
211
+ */
212
+ static VALUE
213
+ frt_fi_is_compressed(VALUE self)
214
+ {
215
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
216
+ return fi_is_compressed(fi) ? Qtrue : Qfalse;
217
+ }
218
+
219
+ /*
220
+ * call-seq:
221
+ * fi.indexed? -> bool
222
+ *
223
+ * Return true if the field is indexed, ie searchable in the index.
224
+ */
225
+ static VALUE
226
+ frt_fi_is_indexed(VALUE self)
227
+ {
228
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
229
+ return fi_is_indexed(fi) ? Qtrue : Qfalse;
230
+ }
231
+
232
+ /*
233
+ * call-seq:
234
+ * fi.tokenized? -> bool
235
+ *
236
+ * Return true if the field is tokenized. Tokenizing is the process of
237
+ * breaking the field up into tokens. That is "the quick brown fox" becomes
238
+ * ["the", "quick", "brown", "fox"]. This is only possible if the field is
239
+ * indexed.
240
+ */
241
+ static VALUE
242
+ frt_fi_is_tokenized(VALUE self)
243
+ {
244
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
245
+ return fi_is_tokenized(fi) ? Qtrue : Qfalse;
246
+ }
247
+
248
+ /*
249
+ * call-seq:
250
+ * fi.omit_norms? -> bool
251
+ *
252
+ * Return true if the field omits the norm file. The norm file is the file
253
+ * used to store the field boosts for an indexed field. If you do not boost
254
+ * any fields, and you can live without scoring based on field length then
255
+ * you can omit the norms file. This will give the index a slight performance
256
+ * boost and it will use less memory, especially for indexes which have a
257
+ * large number of documents.
258
+ */
259
+ static VALUE
260
+ frt_fi_omit_norms(VALUE self)
261
+ {
262
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
263
+ return fi_omit_norms(fi) ? Qtrue : Qfalse;
264
+ }
265
+
266
+ /*
267
+ * call-seq:
268
+ * fi.store_term_vector? -> bool
269
+ *
270
+ * Return true if the term-vectors are stored for this field.
271
+ */
272
+ static VALUE
273
+ frt_fi_store_term_vector(VALUE self)
274
+ {
275
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
276
+ return fi_store_term_vector(fi) ? Qtrue : Qfalse;
277
+ }
278
+
279
+ /*
280
+ * call-seq:
281
+ * fi.store_positions? -> bool
282
+ *
283
+ * Return true if positions are stored with the term-vectors for this field.
284
+ */
285
+ static VALUE
286
+ frt_fi_store_positions(VALUE self)
287
+ {
288
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
289
+ return fi_store_positions(fi) ? Qtrue : Qfalse;
290
+ }
291
+
292
+ /*
293
+ * call-seq:
294
+ * fi.store_offsets? -> bool
295
+ *
296
+ * Return true if offsets are stored with the term-vectors for this field.
297
+ */
298
+ static VALUE
299
+ frt_fi_store_offsets(VALUE self)
300
+ {
301
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
302
+ return fi_store_offsets(fi) ? Qtrue : Qfalse;
303
+ }
304
+
305
+ /*
306
+ * call-seq:
307
+ * fi.has_norms? -> bool
308
+ *
309
+ * Return true if this field has a norms file. This is the same as calling;
310
+ *
311
+ * fi.indexed? and not fi.omit_norms?
312
+ */
313
+ static VALUE
314
+ frt_fi_has_norms(VALUE self)
315
+ {
316
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
317
+ return fi_has_norms(fi) ? Qtrue : Qfalse;
318
+ }
319
+
320
+ /*
321
+ * call-seq:
322
+ * fi.boost -> boost
323
+ *
324
+ * Return the default boost for this field
325
+ */
326
+ static VALUE
327
+ frt_fi_boost(VALUE self)
328
+ {
329
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
330
+ return rb_float_new((double)fi->boost);
331
+ }
332
+
333
+ /*
334
+ * call-seq:
335
+ * fi.to_s -> string
336
+ *
337
+ * Return a string representation of the FieldInfo object.
338
+ */
339
+ static VALUE
340
+ frt_fi_to_s(VALUE self)
341
+ {
342
+ FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
343
+ char *fi_s = fi_to_s(fi);
344
+ VALUE rfi_s = rb_str_new2(fi_s);
345
+ free(fi_s);
346
+ return rfi_s;
347
+ }
348
+
349
+ /****************************************************************************
350
+ *
351
+ * FieldInfos Methods
352
+ *
353
+ ****************************************************************************/
354
+
355
+ static void
356
+ frt_fis_free(void *p)
357
+ {
358
+ object_del(p);
359
+ fis_deref((FieldInfos *)p);
360
+ }
361
+
362
+ static void
363
+ frt_fis_mark(void *p)
364
+ {
365
+ int i;
366
+ FieldInfos *fis = (FieldInfos *)p;
367
+
368
+ for (i = 0; i < fis->size; i++) {
369
+ frt_gc_mark(fis->fields[i]);
370
+ }
371
+ }
372
+
373
+ static VALUE
374
+ frt_get_field_infos(FieldInfos *fis)
375
+ {
376
+
377
+ VALUE rfis = Qnil;
378
+ if (fis) {
379
+ rfis = object_get(fis);
380
+ if (rfis == Qnil) {
381
+ rfis = Data_Wrap_Struct(cFieldInfos, &frt_fis_mark, &frt_fis_free,
382
+ fis);
383
+ REF(fis);
384
+ object_add(fis, rfis);
385
+ }
386
+ }
387
+ return rfis;
388
+ }
389
+
390
+ /*
391
+ * call-seq:
392
+ * FieldInfos.new(defaults = {}) -> field_infos
393
+ *
394
+ * Create a new FieldInfos object which uses the default values for fields
395
+ * specified in the +defaults+ hash parameter. See FieldInfo for available
396
+ * property values.
397
+ */
398
+ static VALUE
399
+ frt_fis_init(int argc, VALUE *argv, VALUE self)
400
+ {
401
+ VALUE roptions;
402
+ FieldInfos *fis;
403
+ enum StoreValues store = STORE_YES;
404
+ enum IndexValues index = INDEX_YES;
405
+ enum TermVectorValues term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
406
+ float boost;
407
+
408
+ rb_scan_args(argc, argv, "01", &roptions);
409
+ if (argc > 0) {
410
+ frt_fi_get_params(roptions, &store, &index, &term_vector, &boost);
411
+ }
412
+ fis = fis_new(store, index, term_vector);
413
+ Frt_Wrap_Struct(self, &frt_fis_mark, &frt_fis_free, fis);
414
+ object_add(fis, self);
415
+ return self;
416
+ }
417
+
418
+ /*
419
+ * call-seq:
420
+ * fis.to_a -> array
421
+ *
422
+ * Return an array of the FieldInfo objects contained by this FieldInfos
423
+ * object.
424
+ */
425
+ static VALUE
426
+ frt_fis_to_a(VALUE self)
427
+ {
428
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
429
+ VALUE rary = rb_ary_new();
430
+ int i;
431
+
432
+ for (i = 0; i < fis->size; i++) {
433
+ rb_ary_push(rary, frt_get_field_info(fis->fields[i]));
434
+ }
435
+ return rary;
436
+ }
437
+
438
+ /*
439
+ * call-seq:
440
+ * fis[name] -> field_info
441
+ * fis[number] -> field_info
442
+ *
443
+ * Get the FieldInfo object. FieldInfo objects can be referenced by either
444
+ * their field-number or their field-name (a symbol or a string). For
445
+ * example;
446
+ *
447
+ * fi = fis[:name]
448
+ * fi = fis[2]
449
+ */
450
+ static VALUE /* Look up a FieldInfo by Fixnum position (negative counts from the end), Symbol or String name. */
451
+ frt_fis_get(VALUE self, VALUE ridx)
452
+ {
453
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
454
+ VALUE rfi = Qnil;
455
+ switch (TYPE(ridx)) {
456
+ case T_FIXNUM: {
457
+ int index = FIX2INT(ridx);
458
+ if (index < 0) index += fis->size; /* negative index counts back from the last field */
459
+ if (index < 0 || index >= fis->size) {
460
+ rb_raise(rb_eArgError, "index of %d is out of range (0..%d)\n",
461
+ FIX2INT(ridx), fis->size - 1); /* report the index as passed and the last valid position */
462
+ }
463
+ rfi = frt_get_field_info(fis->fields[index]);
464
+ break;
465
+ }
466
+ case T_SYMBOL:
467
+ rfi = frt_get_field_info(fis_get_field(fis, frt_field(ridx)));
468
+ break;
469
+ case T_STRING:
470
+ rfi = frt_get_field_info(fis_get_field(fis, StringValuePtr(ridx)));
471
+ break;
472
+ default:
473
+ rb_raise(rb_eArgError, "Can't index FieldInfos with %s",
474
+ RSTRING(rb_obj_as_string(ridx))->ptr);
475
+ break;
476
+ }
477
+ return rfi; /* frt_get_field_info returns Qnil when handed a NULL FieldInfo */
478
+ }
479
+
480
+ /*
481
+ * call-seq:
482
+ * fis << fi -> fis
483
+ * fis.add(fi) -> fis
484
+ *
485
+ * Add a FieldInfo object. Use the FieldInfos#add_field method where
486
+ * possible.
487
+ */
488
+ static VALUE
489
+ frt_fis_add(VALUE self, VALUE rfi)
490
+ {
491
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
492
+ FieldInfo *fi = (FieldInfo *)frt_rb_data_ptr(rfi);
493
+ fis_add_field(fis, fi);
494
+ REF(fi);
495
+ return self;
496
+ }
497
+
498
+ /*
499
+ * call-seq:
500
+ * fis.add_field(name, properties = {}) -> fis
501
+ *
502
+ * Add a new field to the FieldInfos object. See FieldInfo for a description
503
+ * of the available properties.
504
+ */
505
+ static VALUE
506
+ frt_fis_add_field(int argc, VALUE *argv, VALUE self)
507
+ {
508
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
509
+ FieldInfo *fi;
510
+ enum StoreValues store = STORE_YES;
511
+ enum IndexValues index = INDEX_YES;
512
+ enum TermVectorValues term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
513
+ float boost = 1.0f;
514
+ VALUE rname, roptions;
515
+
516
+ rb_scan_args(argc, argv, "11", &rname, &roptions);
517
+ if (argc > 1) {
518
+ frt_fi_get_params(roptions, &store, &index, &term_vector, &boost);
519
+ }
520
+ fi = fi_new(frt_field(rname), store, index, term_vector);
521
+ fi->boost = boost;
522
+ fis_add_field(fis, fi);
523
+ return self;
524
+ }
525
+
526
+ /*
527
+ * call-seq:
528
+ * fis.each {|fi| do_something } -> fis
529
+ *
530
+ * Iterate through the FieldInfo objects.
531
+ */
532
+ static VALUE
533
+ frt_fis_each(VALUE self)
534
+ {
535
+ int i;
536
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
537
+
538
+ for (i = 0; i < fis->size; i++) {
539
+ rb_yield(frt_get_field_info(fis->fields[i]));
540
+ }
541
+ return self;
542
+ }
543
+
544
+ /*
545
+ * call-seq:
546
+ * fis.to_s -> string
547
+ *
548
+ * Return a string representation of the FieldInfos object.
549
+ */
550
+ static VALUE
551
+ frt_fis_to_s(VALUE self)
552
+ {
553
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
554
+ char *fis_s = fis_to_s(fis);
555
+ VALUE rfis_s = rb_str_new2(fis_s);
556
+ free(fis_s);
557
+ return rfis_s;
558
+ }
559
+
560
+ /*
561
+ * call-seq:
562
+ * fis.create_index(dir) -> self
563
+ *
564
+ * Create a new index in the directory specified. The directory +dir+ can
565
+ * either be a string path representing a directory on the file-system or an
566
+ * actual directory object. Care should be taken when using this method. Any
567
+ * existing index (or other files for that matter) will be deleted from the
568
+ * directory and overwritten by the new index.
569
+ */
570
+ static VALUE
571
+ frt_fis_create_index(VALUE self, VALUE rdir)
572
+ {
573
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
574
+ Store *store = NULL;
575
+ if (TYPE(rdir) == T_DATA) {
576
+ store = DATA_PTR(rdir);
577
+ REF(store);
578
+ } else {
579
+ StringValue(rdir);
580
+ frt_create_dir(rdir);
581
+ store = open_fs_store(RSTRING(rdir)->ptr);
582
+ }
583
+ index_create(store, fis);
584
+ store_deref(store);
585
+ return self;
586
+ }
587
+
588
+ /*
589
+ * call-seq:
590
+ * fis.fields -> symbol array
591
+ *
592
+ * Return a list of the field names (as symbols) in the index.
593
+ */
594
+ static VALUE
595
+ frt_fis_get_fields(VALUE self)
596
+ {
597
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
598
+ VALUE rfield_names = rb_ary_new();
599
+ int i;
600
+ for (i = 0; i < fis->size; i++) {
601
+ rb_ary_push(rfield_names, ID2SYM(rb_intern(fis->fields[i]->name)));
602
+ }
603
+ return rfield_names;
604
+ }
605
+
606
+ /****************************************************************************
607
+ *
608
+ * TermEnum Methods
609
+ *
610
+ ****************************************************************************/
611
+
612
+ static void
613
+ frt_te_free(void *p)
614
+ {
615
+ TermEnum *te = (TermEnum *)p;
616
+ te->close(te);
617
+ }
618
+
619
+ static VALUE
620
+ frt_te_get_set_term(VALUE self, const char *term)
621
+ {
622
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
623
+ VALUE str = term ? rb_str_new(term, te->curr_term_len) : Qnil;
624
+ rb_ivar_set(self, id_term, str);
625
+ return str;
626
+ }
627
+
628
+ static VALUE
629
+ frt_get_te(VALUE rir, TermEnum *te)
630
+ {
631
+ VALUE self = Qnil;
632
+ if (te != NULL) {
633
+ self = Data_Wrap_Struct(cTermEnum, NULL, &frt_te_free, te);
634
+ frt_te_get_set_term(self, te->curr_term);
635
+ rb_ivar_set(self, id_fld_num_map, rb_ivar_get(rir, id_fld_num_map));
636
+ }
637
+ return self;
638
+ }
639
+
640
+ /*
641
+ * call-seq:
642
+ * term_enum.next -> term_string
643
+ *
644
+ * Returns the next term in the enumeration or nil otherwise.
645
+ */
646
+ static VALUE
647
+ frt_te_next(VALUE self)
648
+ {
649
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
650
+ return frt_te_get_set_term(self, te->next(te));
651
+ }
652
+
653
+ /*
654
+ * call-seq:
655
+ * term_enum.term -> term_string
656
+ *
657
+ * Returns the current term pointed to by the enum. This method should only
658
+ * be called after a successful call to TermEnum#next.
659
+ */
660
+ static VALUE
661
+ frt_te_term(VALUE self)
662
+ {
663
+ return rb_ivar_get(self, id_term);
664
+ }
665
+
666
+ /*
667
+ * call-seq:
668
+ * term_enum.doc_freq -> integer
669
+ *
670
+ * Returns the document frequency of the current term pointed to by the enum.
671
+ * That is the number of documents that this term appears in. The method
672
+ * should only be called after a successful call to TermEnum#next.
673
+ */
674
+ static VALUE
675
+ frt_te_doc_freq(VALUE self)
676
+ {
677
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
678
+ return INT2FIX(te->curr_ti.doc_freq);
679
+ }
680
+
681
+ /*
682
+ * call-seq:
683
+ * term_enum.skip_to(target) -> term
684
+ *
685
+ * Skip to term +target+. This method can skip forwards or backwards. If you
686
+ * want to skip back to the start, pass the empty string "". That is;
687
+ *
688
+ * term_enum.skip_to("")
689
+ *
690
+ * Returns the first term greater than or equal to +target+
691
+ */
692
+ static VALUE
693
+ frt_te_skip_to(VALUE self, VALUE rterm)
694
+ {
695
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
696
+ return frt_te_get_set_term(self, te->skip_to(te, frt_field(rterm)));
697
+ }
698
+
699
+ /*
700
+ * call-seq:
701
+ * term_enum.each {|term, doc_freq| do_something() } -> term_count
702
+ *
703
+ * Iterates through all the terms in the field, yielding the term and the
704
+ * document frequency.
705
+ */
706
+ static VALUE
707
+ frt_te_each(VALUE self)
708
+ {
709
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
710
+ char *term;
711
+ int term_cnt = 0;
712
+ VALUE vals = rb_ary_new2(2);
713
+ RARRAY(vals)->len = 2;
714
+
715
+
716
+ /* each is being called so there will be no current term */
717
+ rb_ivar_set(self, id_term, Qnil);
718
+
719
+
720
+ while (NULL != (term = te->next(te))) {
721
+ term_cnt++;
722
+ RARRAY(vals)->ptr[0] = rb_str_new(term, te->curr_term_len);
723
+ RARRAY(vals)->ptr[1] = INT2FIX(te->curr_ti.doc_freq);
724
+ rb_yield(vals);
725
+ }
726
+ return INT2FIX(term_cnt);
727
+ }
728
+
729
+ /*
730
+ * call-seq:
731
+ * term_enum.set_field(field) -> self
732
+ *
733
+ * Set the field for the term_enum. The field value should be a symbol as
734
+ * usual. For example, to scan all title terms you'd do this;
735
+ *
736
+ * term_enum.set_field(:title).each do |term, doc_freq|
737
+ * do_something()
738
+ * end
739
+ */
740
+ static VALUE
741
+ frt_te_set_field(VALUE self, VALUE rfield)
742
+ {
743
+ TermEnum *te = (TermEnum *)DATA_PTR(self);
744
+ int field_num = 0;
745
+ VALUE rfnum_map = rb_ivar_get(self, id_fld_num_map);
746
+ VALUE rfnum = rb_hash_aref(rfnum_map, rfield);
747
+ if (rfnum != Qnil) {
748
+ field_num = FIX2INT(rfnum);
749
+ rb_ivar_set(self, id_field_num, rfnum);
750
+ } else {
751
+ Check_Type(rfield, T_SYMBOL);
752
+ rb_raise(rb_eArgError, "field %s doesn't exist in the index",
753
+ frt_field(rfield));
754
+ }
755
+ te->set_field(te, field_num);
756
+
757
+ return self;
758
+ }
759
+
760
+ /****************************************************************************
761
+ *
762
+ * TermDocEnum Methods
763
+ *
764
+ ****************************************************************************/
765
+
766
+ static void
767
+ frt_tde_free(void *p)
768
+ {
769
+ TermDocEnum *tde = (TermDocEnum *)p;
770
+ tde->close(tde);
771
+ }
772
+
773
+ static VALUE
774
+ frt_get_tde(VALUE rir, TermDocEnum *tde)
775
+ {
776
+ VALUE self = Data_Wrap_Struct(cTermDocEnum, NULL, &frt_tde_free, tde);
777
+ rb_ivar_set(self, id_fld_num_map, rb_ivar_get(rir, id_fld_num_map));
778
+ return self;
779
+ }
780
+
781
+ /*
782
+ * call-seq:
783
+ * term_doc_enum.seek(field, term) -> self
784
+ *
785
+ * Seek the term +term+ in the index for +field+. After you call this method
786
+ * you can call next or each to skip through the documents and positions of
787
+ * this particular term.
788
+ */
789
+ static VALUE
790
+ frt_tde_seek(VALUE self, VALUE rfield, VALUE rterm)
791
+ {
792
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
793
+ char *term;
794
+ VALUE rfnum_map = rb_ivar_get(self, id_fld_num_map);
795
+ VALUE rfnum = rb_hash_aref(rfnum_map, rfield);
796
+ int field_num = -1;
797
+ term = StringValuePtr(rterm);
798
+ if (rfnum != Qnil) {
799
+ field_num = FIX2INT(rfnum);
800
+ } else {
801
+ rb_raise(rb_eArgError, "field %s doesn't exist in the index",
802
+ frt_field(rfield));
803
+ }
804
+ tde->seek(tde, field_num, term);
805
+ return self;
806
+ }
807
+
808
+ /*
809
+ * call-seq:
810
+ * term_doc_enum.seek_term_enum(term_enum) -> self
811
+ *
812
+ * Seek the current term in +term_enum+. You could just use the standard seek
813
+ * method like this;
814
+ *
815
+ * term_doc_enum.seek(field, term_enum.term)
816
+ *
817
+ * However the +seek_term_enum+ method saves an index lookup so should offer
818
+ * a large performance improvement.
819
+ */
820
+ static VALUE
821
+ frt_tde_seek_te(VALUE self, VALUE rterm_enum)
822
+ {
823
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
824
+ TermEnum *te = (TermEnum *)frt_rb_data_ptr(rterm_enum);
825
+ tde->seek_te(tde, te);
826
+ return self;
827
+ }
828
+
829
+ /*
830
+ * call-seq:
831
+ * term_doc_enum.doc -> doc_id
832
+ *
833
+ * Returns the current document number pointed to by the +term_doc_enum+.
834
+ */
835
+ static VALUE
836
+ frt_tde_doc(VALUE self)
837
+ {
838
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
839
+ return INT2FIX(tde->doc_num(tde));
840
+ }
841
+
842
+ /*
843
+ * call-seq:
844
+ * term_doc_enum.freq -> integer
845
+ *
846
+ * Returns the frequency of the current document pointed to by the
847
+ * +term_doc_enum+.
848
+ */
849
+ static VALUE
850
+ frt_tde_freq(VALUE self)
851
+ {
852
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
853
+ return INT2FIX(tde->freq(tde));
854
+ }
855
+
856
+ /*
857
+ * call-seq:
858
+ * term_doc_enum.next -> bool
859
+ *
860
+ * Move forward to the next document in the enumeration. Returns +true+ if
861
+ * there is another document or +false+ otherwise.
862
+ */
863
+ static VALUE
864
+ frt_tde_next(VALUE self)
865
+ {
866
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
867
+ return tde->next(tde) ? Qtrue : Qfalse;
868
+ }
869
+
870
+ /*
871
+ * call-seq:
872
+ * term_doc_enum.next_position -> integer or nil
873
+ *
874
+ * Move forward to the next position of the term in the current document.
874
+ * Returns the position, or +nil+ if there are no more positions.
876
+ */
877
+ static VALUE
878
+ frt_tde_next_position(VALUE self)
879
+ {
880
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
881
+ int pos;
882
+ if (tde->next_position == NULL) {
883
+ rb_raise(rb_eNotImpError, "to scan through positions you must create "
884
+ "the TermDocEnum with Index#term_positions method rather "
885
+ "than the Index#term_docs method");
886
+ }
887
+ pos = tde->next_position(tde);
888
+ return pos >= 0 ? INT2FIX(pos) : Qnil;
889
+ }
890
+
891
+ /*
892
+ * call-seq:
893
+ * term_doc_enum.each {|doc_id, freq| do_something() } -> doc_count
894
+ *
895
+ * Iterate through the documents and document frequencies in the
896
+ * +term_doc_enum+.
897
+ *
898
+ * NOTE: this method can only be called once after each seek. If you need to
899
+ * call +#each+ again then you should call +#seek+ again too.
900
+ */
901
+ static VALUE
902
+ frt_tde_each(VALUE self)
903
+ {
904
+ int doc_cnt = 0;
905
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
906
+ VALUE vals = rb_ary_new2(2);
907
+ RARRAY(vals)->len = 2;
908
+
909
+ while (tde->next(tde)) {
910
+ doc_cnt++;
911
+ RARRAY(vals)->ptr[0] = INT2FIX(tde->doc_num(tde));
912
+ RARRAY(vals)->ptr[1] = INT2FIX(tde->freq(tde));
913
+ rb_yield(vals);
914
+
915
+ }
916
+ return INT2FIX(doc_cnt);
917
+ }
918
+
919
+ /*
920
+ * call-seq:
921
+ * term_doc_enum.each_position {|pos| do_something } -> term_doc_enum
922
+ *
923
+ * Iterate through each of the positions occupied by the current term in the
924
+ * current document. This can only be called once per document. It can be
925
+ * used within the each method. For example, to print the terms documents and
926
+ * positions;
927
+ *
928
+ * tde.each do |doc_id, freq|
929
+ * puts "term appeared #{freq} times in document #{doc_id}:"
930
+ * positions = []
931
+ * tde.each_position {|pos| positions << pos}
932
+ * puts " #{positions.join(', ')}"
933
+ * end
934
+ */
935
+ static VALUE
936
+ frt_tde_each_position(VALUE self)
937
+ {
938
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
939
+ int pos;
940
+ if (tde->next_position == NULL) {
941
+ rb_raise(rb_eNotImpError, "to scan through positions you must create "
942
+ "the TermDocEnum with Index#term_positions method rather "
943
+ "than the Index#term_docs method");
944
+ }
945
+ while (0 <= (pos = tde->next_position(tde))) {
946
+ rb_yield(INT2FIX(pos));
947
+ }
948
+ return self;
949
+ }
950
+
951
+ /*
952
+ * call-seq:
953
+ * term_doc_enum.skip_to(target) -> bool
954
+ *
955
+ * Skip to the required document number +target+ and return true if there is
956
+ * a document >= +target+.
957
+ */
958
+ static VALUE
959
+ frt_tde_skip_to(VALUE self, VALUE rtarget)
960
+ {
961
+ TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
962
+ return tde->skip_to(tde, FIX2INT(rtarget)) ? Qtrue : Qfalse;
963
+ }
964
+
965
+ /****************************************************************************
966
+ *
967
+ * TVOffsets Methods
968
+ *
969
+ ****************************************************************************/
970
+
971
+ static VALUE
972
+ frt_get_tv_offsets(Offset *offset)
973
+ {
974
+ return rb_struct_new(cTVOffsets,
975
+ INT2FIX(offset->start),
976
+ INT2FIX(offset->end),
977
+ NULL);
978
+ }
979
+
980
+ /****************************************************************************
981
+ *
982
+ * TVTerm Methods
983
+ *
984
+ ****************************************************************************/
985
+
986
+ static VALUE
987
+ frt_get_tv_term(TVTerm *tv_term)
988
+ {
989
+ int i;
990
+ const int freq = tv_term->freq;
991
+ VALUE rtext;
992
+ VALUE rpositions = Qnil;
993
+ rtext = rb_str_new2(tv_term->text);
994
+ if (tv_term->positions) {
995
+ VALUE *rpos;
996
+ int *positions = tv_term->positions;
997
+ rpositions = rb_ary_new2(freq);
998
+ rpos = RARRAY(rpositions)->ptr;
999
+ RARRAY(rpositions)->len = freq;
1000
+ for (i = 0; i < freq; i++) {
1001
+ rpos[i] = INT2FIX(positions[i]);
1002
+ }
1003
+ }
1004
+ return rb_struct_new(cTVTerm, rtext, rpositions, NULL);
1005
+ }
1006
+
1007
+ /****************************************************************************
1008
+ *
1009
+ * TermVector Methods
1010
+ *
1011
+ ****************************************************************************/
1012
+
1013
+ static VALUE
1014
+ frt_get_tv(TermVector *tv)
1015
+ {
1016
+ int i;
1017
+ TVTerm *terms = tv->terms;
1018
+ const int t_cnt = tv->term_cnt;
1019
+ const int o_cnt = tv->offset_cnt;
1020
+ VALUE rfield, rterms, *rts;
1021
+ VALUE roffsets = Qnil;
1022
+ rfield = ID2SYM(rb_intern(tv->field));
1023
+
1024
+ rterms = rb_ary_new2(t_cnt);
1025
+ RARRAY(rterms)->len = t_cnt;
1026
+ rts = RARRAY(rterms)->ptr;
1027
+ for (i = 0; i < t_cnt; i++) {
1028
+ rts[i] = frt_get_tv_term(&terms[i]);
1029
+ }
1030
+
1031
+ if (tv->offsets) {
1032
+ VALUE *ros;
1033
+ Offset *offsets = tv->offsets;
1034
+ roffsets = rb_ary_new2(o_cnt);
1035
+ ros = RARRAY(roffsets)->ptr;
1036
+ RARRAY(roffsets)->len = o_cnt;
1037
+ for (i = 0; i < o_cnt; i++) {
1038
+ ros[i] = frt_get_tv_offsets(&offsets[i]);
1039
+ }
1040
+ }
1041
+
1042
+ return rb_struct_new(cTermVector, rfield, rterms, roffsets, NULL);
1043
+ }
1044
+
1045
+ /****************************************************************************
1046
+ *
1047
+ * IndexWriter Methods
1048
+ *
1049
+ ****************************************************************************/
1050
+
1051
+ void
1052
+ frt_iw_free(void *p)
1053
+ {
1054
+ iw_close((IndexWriter *)p);
1055
+ }
1056
+
1057
+ void
1058
+ frt_iw_mark(void *p)
1059
+ {
1060
+ IndexWriter *iw = (IndexWriter *)p;
1061
+ frt_gc_mark(iw->analyzer);
1062
+ frt_gc_mark(iw->store);
1063
+ frt_gc_mark(iw->fis);
1064
+ }
1065
+
1066
+ /*
1067
+ * call-seq:
1068
+ * index_writer.close -> nil
1069
+ *
1070
+ * Close the IndexWriter. This will close and free all resources used
1071
+ * exclusively by the index writer. The garbage collector will do this
1072
+ * automatically if not called explicitly.
1073
+ */
1074
+ static VALUE
1075
+ frt_iw_close(VALUE self)
1076
+ {
1077
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1078
+ Frt_Unwrap_Struct(self);
1079
+ iw_close(iw);
1080
+ return Qnil;
1081
+ }
1082
+
1083
/*
 * Copy the integer option +attr+ from the roptions hash into config.attr when
 * the option is present and truthy. Relies on `roptions`, `rval` and `config`
 * being in scope at the expansion site. NUM2INT is used rather than FIX2INT
 * so that a value that is not an integer raises instead of being misread.
 */
#define SET_INT_ATTR(attr) \
    do {\
        if (RTEST(rval = rb_hash_aref(roptions, sym_##attr))) {\
            config.attr = NUM2INT(rval);\
        }\
    } while (0)
1088
+
1089
+ /*
1090
+ * call-seq:
1091
+ * IndexWriter.new(options = {}) -> index_writer
1092
+ *
1093
+ * Create a new IndexWriter. You should either pass a path or a directory to
1094
+ * this constructor. For example, here are three ways you can create an
1095
+ * IndexWriter;
1096
+ *
1097
+ * dir = RAMDirectory.new()
1098
+ * iw = IndexWriter.new(:dir => dir)
1099
+ *
1100
+ * dir = FSDirectory.new("/path/to/index")
1101
+ * iw = IndexWriter.new(:dir => dir)
1102
+ *
1103
+ * iw = IndexWriter.new(:path => "/path/to/index")
1104
+ *
1105
+ * See IndexWriter for more options.
1106
+ */
1107
+ static VALUE
1108
+ frt_iw_init(int argc, VALUE *argv, VALUE self)
1109
+ {
1110
+ VALUE roptions, rval;
1111
+ bool create = false;
1112
+ bool create_if_missing = true;
1113
+ Store *store = NULL;
1114
+ Analyzer *analyzer = NULL;
1115
+ IndexWriter *volatile iw = NULL;
1116
+ Config config = default_config;
1117
+
1118
+ rb_scan_args(argc, argv, "01", &roptions);
1119
+ if (argc > 0) {
1120
+ Check_Type(roptions, T_HASH);
1121
+
1122
+ if ((rval = rb_hash_aref(roptions, sym_dir)) != Qnil) {
1123
+ Check_Type(rval, T_DATA);
1124
+ store = DATA_PTR(rval);
1125
+ } else if ((rval = rb_hash_aref(roptions, sym_path)) != Qnil) {
1126
+ StringValue(rval);
1127
+ frt_create_dir(rval);
1128
+ store = open_fs_store(RSTRING(rval)->ptr);
1129
+ DEREF(store);
1130
+ }
1131
+
1132
+ /* Let ruby's garbage collector handle the closing of the store
1133
+ if (!close_dir) {
1134
+ close_dir = RTEST(rb_hash_aref(roptions, sym_close_dir));
1135
+ }
1136
+ */
1137
+ /* use_compound_file defaults to true */
1138
+ config.use_compound_file =
1139
+ (rb_hash_aref(roptions, sym_use_compound_file) == Qfalse)
1140
+ ? false
1141
+ : true;
1142
+
1143
+ if ((rval = rb_hash_aref(roptions, sym_analyzer)) != Qnil) {
1144
+ analyzer = frt_get_cwrapped_analyzer(rval);
1145
+ }
1146
+
1147
+ create = RTEST(rb_hash_aref(roptions, sym_create));
1148
+ if ((rval = rb_hash_aref(roptions, sym_create_if_missing)) != Qnil) {
1149
+ create_if_missing = RTEST(rval);
1150
+ }
1151
+ SET_INT_ATTR(chunk_size);
1152
+ SET_INT_ATTR(max_buffer_memory);
1153
+ SET_INT_ATTR(index_interval);
1154
+ SET_INT_ATTR(skip_interval);
1155
+ SET_INT_ATTR(merge_factor);
1156
+ SET_INT_ATTR(max_buffered_docs);
1157
+ SET_INT_ATTR(max_merge_docs);
1158
+ SET_INT_ATTR(max_field_length);
1159
+ }
1160
+ if (NULL == store) {
1161
+ store = open_ram_store();
1162
+ DEREF(store);
1163
+ }
1164
+ if (!create && create_if_missing && !store->exists(store, "segments")) {
1165
+ create = true;
1166
+ }
1167
+ if (create) {
1168
+ FieldInfos *fis;
1169
+ if ((rval = rb_hash_aref(roptions, sym_field_infos)) != Qnil) {
1170
+ Data_Get_Struct(rval, FieldInfos, fis);
1171
+ index_create(store, fis);
1172
+ } else {
1173
+ fis = fis_new(STORE_YES, INDEX_YES,
1174
+ TERM_VECTOR_WITH_POSITIONS_OFFSETS);
1175
+ index_create(store, fis);
1176
+ fis_deref(fis);
1177
+ }
1178
+ }
1179
+
1180
+ iw = iw_open(store, analyzer, &config);
1181
+
1182
+ Frt_Wrap_Struct(self, &frt_iw_mark, &frt_iw_free, iw);
1183
+
1184
+ if (rb_block_given_p()) {
1185
+ rb_yield(self);
1186
+ frt_iw_close(self);
1187
+ return Qnil;
1188
+ } else {
1189
+ return self;
1190
+ }
1191
+ }
1192
+
1193
+ /*
1194
+ * call-seq:
1195
+ * iw.doc_count -> number
1196
+ *
1197
+ * Returns the number of documents in the Index. Note that deletions won't be
1198
+ * taken into account until the IndexWriter has been committed.
1199
+ */
1200
+ static VALUE
1201
+ frt_iw_get_doc_count(VALUE self)
1202
+ {
1203
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1204
+ return INT2FIX(iw_doc_count(iw));
1205
+ }
1206
+
1207
+ static int
1208
+ frt_hash_to_doc_i(VALUE key, VALUE value, VALUE arg)
1209
+ {
1210
+ if (key == Qundef) {
1211
+ return ST_CONTINUE;
1212
+ } else {
1213
+ Document *doc = (Document *)arg;
1214
+ char *field;
1215
+ VALUE val;
1216
+ DocField *df;
1217
+ switch (TYPE(key)) {
1218
+ case T_STRING:
1219
+ field = RSTRING(key)->ptr;
1220
+ break;
1221
+ case T_SYMBOL:
1222
+ field = rb_id2name(SYM2ID(key));
1223
+ break;
1224
+ default:
1225
+ rb_raise(rb_eArgError,
1226
+ "%s cannot be a key to a field. Field keys must "
1227
+ " be symbols.", RSTRING(rb_obj_as_string(key))->ptr);
1228
+ break;
1229
+ }
1230
+ if (NULL == (df = doc_get_field(doc, field))) {
1231
+ df = df_new(field);
1232
+ }
1233
+ if (rb_respond_to(value, id_boost)) {
1234
+ df->boost = (float)NUM2DBL(rb_funcall(value, id_boost, 0));
1235
+ }
1236
+ switch (TYPE(value)) {
1237
+ case T_ARRAY:
1238
+ {
1239
+ int i;
1240
+ for (i = 0; i < RARRAY(value)->len; i++) {
1241
+ val = rb_obj_as_string(RARRAY(value)->ptr[i]);
1242
+ df_add_data_len(df,
1243
+ RSTRING(val)->ptr,
1244
+ RSTRING(val)->len);
1245
+ }
1246
+ }
1247
+ break;
1248
+ default:
1249
+ val = rb_obj_as_string(value);
1250
+ df_add_data_len(df, RSTRING(val)->ptr, RSTRING(val)->len);
1251
+ break;
1252
+ }
1253
+ doc_add_field(doc, df);
1254
+ }
1255
+ return ST_CONTINUE;
1256
+ }
1257
+
1258
+ static Document *
1259
+ frt_get_doc(VALUE rdoc)
1260
+ {
1261
+ VALUE val;
1262
+ Document *doc = doc_new();
1263
+ DocField *df;
1264
+
1265
+ if (rb_respond_to(rdoc, id_boost)) {
1266
+ doc->boost = (float)NUM2DBL(rb_funcall(rdoc, id_boost, 0));
1267
+ }
1268
+
1269
+ switch (TYPE(rdoc)) {
1270
+ case T_HASH:
1271
+ rb_hash_foreach(rdoc, frt_hash_to_doc_i, (VALUE)doc);
1272
+ break;
1273
+ case T_ARRAY:
1274
+ {
1275
+ int i;
1276
+ df = df_new("content");
1277
+ for (i = 0; i < RARRAY(rdoc)->len; i++) {
1278
+ val = rb_obj_as_string(RARRAY(rdoc)->ptr[i]);
1279
+ df_add_data_len(df,
1280
+ RSTRING(val)->ptr,
1281
+ RSTRING(val)->len);
1282
+ }
1283
+ doc_add_field(doc, df);
1284
+ }
1285
+ break;
1286
+ case T_SYMBOL:
1287
+ df = df_add_data(df_new("content"), rb_id2name(SYM2ID(rdoc)));
1288
+ doc_add_field(doc, df);
1289
+ break;
1290
+ case T_STRING:
1291
+ default:
1292
+ val = rb_obj_as_string(rdoc);
1293
+ df = df_add_data_len(df_new("content"),
1294
+ RSTRING(val)->ptr,
1295
+ RSTRING(val)->len);
1296
+ doc_add_field(doc, df);
1297
+ break;
1298
+ }
1299
+ return doc;
1300
+ }
1301
+
1302
+ /*
1303
+ * call-seq:
1304
+ * iw << document -> iw
1305
+ * iw.add_document(document) -> iw
1306
+ *
1307
+ * Add a document to the index. See Document. A document can also be a simple
1308
+ * hash object.
1309
+ */
1310
+ static VALUE
1311
+ frt_iw_add_doc(VALUE self, VALUE rdoc)
1312
+ {
1313
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1314
+ Document *doc = frt_get_doc(rdoc);
1315
+ iw_add_doc(iw, doc);
1316
+ doc_destroy(doc);
1317
+ return self;
1318
+ }
1319
+
1320
+ /*
1321
+ * call-seq:
1322
+ * iw.optimize -> iw
1323
+ *
1324
+ * Optimize the index for searching. This commits any unwritten data to the
1325
+ * index and optimizes the index into a single segment to improve search
1326
+ * performance. This is an expensive operation and should not be called too
1327
+ * often. The best time to call this is at the end of a long batch indexing
1328
+ * process. Note that calling the optimize method does not in any way affect
1329
+ * indexing speed (except for the time taken to complete the optimization
1330
+ * process).
1331
+ */
1332
+ static VALUE
1333
+ frt_iw_optimize(VALUE self)
1334
+ {
1335
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1336
+ iw_optimize(iw);
1337
+ return self;
1338
+ }
1339
+
1340
+ /*
1341
+ * call-seq:
1342
+ * iw.commit -> iw
1343
+ *
1344
+ * Explicitly commit any changes to the index that may be hanging around in
1345
+ * memory. You should call this method if you want to read the latest index
1346
+ * with an IndexWriter.
1347
+ */
1348
+ static VALUE
1349
+ frt_iw_commit(VALUE self)
1350
+ {
1351
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1352
+ iw_commit(iw);
1353
+ return self;
1354
+ }
1355
+
1356
+ /*
1357
+ * call-seq:
1358
+ * iw.add_readers(reader_array) -> iw
1359
+ *
1360
+ * Use this method to merge other indexes into the one being written by
1361
+ * IndexWriter. This is useful for parallel indexing. You can have several
1362
+ * indexing processes running in parallel, possibly even on different
1363
+ * machines. Then you can finish by merging all of the indexes into a single
1364
+ * index.
1365
+ */
1366
+ static VALUE
1367
+ frt_iw_add_readers(VALUE self, VALUE rreaders)
1368
+ {
1369
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1370
+ int i;
1371
+ IndexReader **irs;
1372
+ Check_Type(rreaders, T_ARRAY);
1373
+
1374
+ irs = ALLOC_N(IndexReader *, RARRAY(rreaders)->len);
1375
+ i = RARRAY(rreaders)->len;
1376
+ while (i-- > 0) {
1377
+ IndexReader *ir;
1378
+ Data_Get_Struct(RARRAY(rreaders)->ptr[i], IndexReader, ir);
1379
+ irs[i] = ir;
1380
+ }
1381
+ iw_add_readers(iw, irs, RARRAY(rreaders)->len);
1382
+ free(irs);
1383
+ return self;
1384
+ }
1385
+
1386
+ /*
1387
+ * call-seq:
1388
+ * iw.delete(field, term) -> iw
1389
+ *
1390
+ * Delete all documents in the index with the term +term+ in the field
1391
+ * +field+. You should usually have a unique document id which you use with
1392
+ * this method, rather than deleting all documents with the word "the" in
1393
+ * them. You may however use this method to delete spam.
1394
+ */
1395
+ static VALUE
1396
+ frt_iw_delete(VALUE self, VALUE rfield, VALUE rterm)
1397
+ {
1398
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1399
+ iw_delete_term(iw, frt_field(rfield), StringValuePtr(rterm));
1400
+ return self;
1401
+ }
1402
+
1403
+ /*
1404
+ * call-seq:
1405
+ * index_writer.field_infos -> FieldInfos
1406
+ *
1407
+ * Get the FieldInfos object for this IndexWriter. This is useful if you need
1408
+ * to dynamically add new fields to the index with specific properties.
1409
+ */
1410
+ static VALUE
1411
+ frt_iw_field_infos(VALUE self)
1412
+ {
1413
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1414
+ return frt_get_field_infos(iw->fis);
1415
+ }
1416
+
1417
+ /*
1418
+ * call-seq:
1419
+ * index_writer.analyzer -> Analyzer
1420
+ *
1421
+ * Get the Analyzer for this IndexWriter. This is useful if you need
1422
+ * to use the same analyzer in a QueryParser.
1423
+ */
1424
+ static VALUE
1425
+ frt_iw_get_analyzer(VALUE self)
1426
+ {
1427
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1428
+ return frt_get_analyzer(iw->analyzer);
1429
+ }
1430
+
1431
+ /*
1432
+ * call-seq:
1433
+ * index_writer.analyzer -> Analyzer
1434
+ *
1435
+ * Set the Analyzer for this IndexWriter. This is useful if you need to
1436
+ * change the analyzer for a special document. It is risky though as the
1437
+ * same anlyzer will be used for all documents during search.
1438
+ */
1439
+ static VALUE
1440
+ frt_iw_set_analyzer(VALUE self, VALUE ranalyzer)
1441
+ {
1442
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1443
+
1444
+ a_deref(iw->analyzer);
1445
+ iw->analyzer = frt_get_cwrapped_analyzer(ranalyzer);
1446
+ return ranalyzer;
1447
+ }
1448
+
1449
+ /*
1450
+ * call-seq:
1451
+ * iw.chunk_size -> number
1452
+ *
1453
+ * Return the current value of chunk_size
1454
+ */
1455
+ static VALUE
1456
+ frt_iw_get_chunk_size(VALUE self)
1457
+ {
1458
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1459
+ return INT2FIX(iw->config.chunk_size);
1460
+ }
1461
+
1462
+ /*
1463
+ * call-seq:
1464
+ * iw.chunk_size = chunk_size -> chunk_size
1465
+ *
1466
+ * Set the chunk_size parameter
1467
+ */
1468
+ static VALUE
1469
+ frt_iw_set_chunk_size(VALUE self, VALUE rval)
1470
+ {
1471
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1472
+ iw->config.chunk_size = FIX2INT(rval);
1473
+ return rval;
1474
+ }
1475
+
1476
+ /*
1477
+ * call-seq:
1478
+ * iw.max_buffer_memory -> number
1479
+ *
1480
+ * Return the current value of max_buffer_memory
1481
+ */
1482
+ static VALUE
1483
+ frt_iw_get_max_buffer_memory(VALUE self)
1484
+ {
1485
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1486
+ return INT2FIX(iw->config.max_buffer_memory);
1487
+ }
1488
+
1489
+ /*
1490
+ * call-seq:
1491
+ * iw.max_buffer_memory = max_buffer_memory -> max_buffer_memory
1492
+ *
1493
+ * Set the max_buffer_memory parameter
1494
+ */
1495
+ static VALUE
1496
+ frt_iw_set_max_buffer_memory(VALUE self, VALUE rval)
1497
+ {
1498
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1499
+ iw->config.max_buffer_memory = FIX2INT(rval);
1500
+ return rval;
1501
+ }
1502
+
1503
+ /*
1504
+ * call-seq:
1505
+ * iw.term_index_interval -> number
1506
+ *
1507
+ * Return the current value of term_index_interval
1508
+ */
1509
+ static VALUE
1510
+ frt_iw_get_index_interval(VALUE self)
1511
+ {
1512
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1513
+ return INT2FIX(iw->config.index_interval);
1514
+ }
1515
+
1516
+ /*
1517
+ * call-seq:
1518
+ * iw.term_index_interval = term_index_interval -> term_index_interval
1519
+ *
1520
+ * Set the term_index_interval parameter
1521
+ */
1522
+ static VALUE
1523
+ frt_iw_set_index_interval(VALUE self, VALUE rval)
1524
+ {
1525
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1526
+ iw->config.index_interval = FIX2INT(rval);
1527
+ return rval;
1528
+ }
1529
+
1530
+ /*
1531
+ * call-seq:
1532
+ * iw.doc_skip_interval -> number
1533
+ *
1534
+ * Return the current value of doc_skip_interval
1535
+ */
1536
+ static VALUE
1537
+ frt_iw_get_skip_interval(VALUE self)
1538
+ {
1539
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1540
+ return INT2FIX(iw->config.skip_interval);
1541
+ }
1542
+
1543
+ /*
1544
+ * call-seq:
1545
+ * iw.doc_skip_interval = doc_skip_interval -> doc_skip_interval
1546
+ *
1547
+ * Set the doc_skip_interval parameter
1548
+ */
1549
+ static VALUE
1550
+ frt_iw_set_skip_interval(VALUE self, VALUE rval)
1551
+ {
1552
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1553
+ iw->config.skip_interval = FIX2INT(rval);
1554
+ return rval;
1555
+ }
1556
+
1557
+ /*
1558
+ * call-seq:
1559
+ * iw.merge_factor -> number
1560
+ *
1561
+ * Return the current value of merge_factor
1562
+ */
1563
+ static VALUE
1564
+ frt_iw_get_merge_factor(VALUE self)
1565
+ {
1566
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1567
+ return INT2FIX(iw->config.merge_factor);
1568
+ }
1569
+
1570
+ /*
1571
+ * call-seq:
1572
+ * iw.merge_factor = merge_factor -> merge_factor
1573
+ *
1574
+ * Set the merge_factor parameter
1575
+ */
1576
+ static VALUE
1577
+ frt_iw_set_merge_factor(VALUE self, VALUE rval)
1578
+ {
1579
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1580
+ iw->config.merge_factor = FIX2INT(rval);
1581
+ return rval;
1582
+ }
1583
+
1584
+ /*
1585
+ * call-seq:
1586
+ * iw.max_buffered_docs -> number
1587
+ *
1588
+ * Return the current value of max_buffered_docs
1589
+ */
1590
+ static VALUE
1591
+ frt_iw_get_max_buffered_docs(VALUE self)
1592
+ {
1593
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1594
+ return INT2FIX(iw->config.max_buffered_docs);
1595
+ }
1596
+
1597
+ /*
1598
+ * call-seq:
1599
+ * iw.max_buffered_docs = max_buffered_docs -> max_buffered_docs
1600
+ *
1601
+ * Set the max_buffered_docs parameter
1602
+ */
1603
+ static VALUE
1604
+ frt_iw_set_max_buffered_docs(VALUE self, VALUE rval)
1605
+ {
1606
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1607
+ iw->config.max_buffered_docs = FIX2INT(rval);
1608
+ return rval;
1609
+ }
1610
+
1611
+ /*
1612
+ * call-seq:
1613
+ * iw.max_merge_docs -> number
1614
+ *
1615
+ * Return the current value of max_merge_docs
1616
+ */
1617
+ static VALUE
1618
+ frt_iw_get_max_merge_docs(VALUE self)
1619
+ {
1620
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1621
+ return INT2FIX(iw->config.max_merge_docs);
1622
+ }
1623
+
1624
+ /*
1625
+ * call-seq:
1626
+ * iw.max_merge_docs = max_merge_docs -> max_merge_docs
1627
+ *
1628
+ * Set the max_merge_docs parameter
1629
+ */
1630
+ static VALUE
1631
+ frt_iw_set_max_merge_docs(VALUE self, VALUE rval)
1632
+ {
1633
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1634
+ iw->config.max_merge_docs = FIX2INT(rval);
1635
+ return rval;
1636
+ }
1637
+
1638
+ /*
1639
+ * call-seq:
1640
+ * iw.max_field_length -> number
1641
+ *
1642
+ * Return the current value of max_field_length
1643
+ */
1644
+ static VALUE
1645
+ frt_iw_get_max_field_length(VALUE self)
1646
+ {
1647
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1648
+ return INT2FIX(iw->config.max_field_length);
1649
+ }
1650
+
1651
+ /*
1652
+ * call-seq:
1653
+ * iw.max_field_length = max_field_length -> max_field_length
1654
+ *
1655
+ * Set the max_field_length parameter
1656
+ */
1657
+ static VALUE
1658
+ frt_iw_set_max_field_length(VALUE self, VALUE rval)
1659
+ {
1660
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1661
+ iw->config.max_field_length = FIX2INT(rval);
1662
+ return rval;
1663
+ }
1664
+
1665
+ /*
1666
+ * call-seq:
1667
+ * iw.use_compound_file -> bool
1668
+ *
1669
+ * Return the current value of use_compound_file
1670
+ */
1671
+ static VALUE
1672
+ frt_iw_get_use_compound_file(VALUE self)
1673
+ {
1674
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1675
+ return iw->config.use_compound_file ? Qtrue : Qfalse;
1676
+ }
1677
+
1678
+ /*
1679
+ * call-seq:
1680
+ * iw.use_compound_file = use_compound_file -> use_compound_file
1681
+ *
1682
+ * Set the use_compound_file parameter
1683
+ */
1684
+ static VALUE
1685
+ frt_iw_set_use_compound_file(VALUE self, VALUE rval)
1686
+ {
1687
+ IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
1688
+ iw->config.use_compound_file = RTEST(rval);
1689
+ return rval;
1690
+ }
1691
+
1692
+ /****************************************************************************
1693
+ *
1694
+ * LazyDoc Methods
1695
+ *
1696
+ ****************************************************************************/
1697
+
1698
+ static void
1699
+ frt_lzd_date_free(void *p)
1700
+ {
1701
+ lazy_doc_close((LazyDoc *)p);
1702
+ }
1703
+
1704
+ static VALUE
1705
+ frt_lazy_df_load(VALUE self, VALUE rkey, LazyDocField *lazy_df)
1706
+ {
1707
+ VALUE rdata = Qnil;
1708
+ if (lazy_df) {
1709
+ if (lazy_df->size == 1) {
1710
+ char *data = lazy_df_get_data(lazy_df, 0);
1711
+ rdata = rb_str_new(data, lazy_df->len);
1712
+ } else {
1713
+ int i;
1714
+ rdata = rb_ary_new2(lazy_df->size);
1715
+ for (i = 0; i < lazy_df->size; i++) {
1716
+ char *data = lazy_df_get_data(lazy_df, i);
1717
+ RARRAY(rdata)->ptr[i] =
1718
+ rb_str_new(data, lazy_df->data[i].length);
1719
+ RARRAY(rdata)->len++;
1720
+ }
1721
+ }
1722
+ rb_hash_aset(self, rkey, rdata);
1723
+ }
1724
+ return rdata;
1725
+ }
1726
+
1727
+ /*
1728
+ * call-seq:
1729
+ * lazy_doc.default(key) -> string
1730
+ *
1731
+ * This method is used internally to lazily load fields. You should never
1732
+ * really need to call it yourself.
1733
+ */
1734
+ static VALUE
1735
+ frt_lzd_default(VALUE self, VALUE rkey)
1736
+ {
1737
+ LazyDoc *lazy_doc = (LazyDoc *)DATA_PTR(rb_ivar_get(self, id_data));
1738
+ char *field = NULL;
1739
+ switch (TYPE(rkey)) {
1740
+ case T_STRING:
1741
+ field = RSTRING(rkey)->ptr;
1742
+ rkey = ID2SYM(rb_intern(field));
1743
+ break;
1744
+ case T_SYMBOL:
1745
+ field = frt_field(rkey);
1746
+ break;
1747
+ default:
1748
+ rb_raise(rb_eArgError,
1749
+ "%s cannot be a key to a field. Field keys must "
1750
+ " be symbols.", RSTRING(rb_obj_as_string(rkey))->ptr);
1751
+ break;
1752
+ }
1753
+ return frt_lazy_df_load(self, rkey, h_get(lazy_doc->field_dict, field));
1754
+ }
1755
+
1756
+ /*
1757
+ * call-seq:
1758
+ * lazy_doc.fields -> array of available fields
1759
+ *
1760
+ * Returns the list of fields stored for this particular document. If you try
1761
+ * to access any of these fields in the document the field will be loaded.
1762
+ * Try to access any other field and nil will be returned.
1763
+ */
1764
+ static VALUE
1765
+ frt_lzd_fields(VALUE self)
1766
+ {
1767
+ return rb_ivar_get(self, id_fields);
1768
+ }
1769
+
1770
+ /*
1771
+ * call-seq:
1772
+ * lazy_doc.load -> lazy_doc
1773
+ *
1774
+ * Load all unloaded fields in the document from the index.
1775
+ */
1776
+ static VALUE
1777
+ frt_lzd_load(VALUE self)
1778
+ {
1779
+ LazyDoc *lazy_doc = (LazyDoc *)DATA_PTR(rb_ivar_get(self, id_data));
1780
+ int i;
1781
+ for (i = 0; i < lazy_doc->size; i++) {
1782
+ LazyDocField *lazy_df = lazy_doc->fields[i];
1783
+ frt_lazy_df_load(self, ID2SYM(rb_intern(lazy_df->name)), lazy_df);
1784
+ }
1785
+ return self;
1786
+ }
1787
+
1788
+ VALUE
1789
+ frt_get_lazy_doc(LazyDoc *lazy_doc)
1790
+ {
1791
+ int i;
1792
+ VALUE rfields = rb_ary_new2(lazy_doc->size);
1793
+
1794
+ VALUE self, rdata;
1795
+ self = rb_hash_new();
1796
+ OBJSETUP(self, cLazyDoc, T_HASH);
1797
+
1798
+ rdata = Data_Wrap_Struct(cLazyDocData, NULL, &frt_lzd_date_free, lazy_doc);
1799
+ rb_ivar_set(self, id_data, rdata);
1800
+
1801
+ for (i = 0; i < lazy_doc->size; i++) {
1802
+ RARRAY(rfields)->ptr[i] = rb_intern(lazy_doc->fields[i]->name);
1803
+ RARRAY(rfields)->len++;
1804
+ }
1805
+ rb_ivar_set(self, id_fields, rfields);
1806
+
1807
+ return self;
1808
+ }
1809
+
1810
+ /****************************************************************************
1811
+ *
1812
+ * IndexReader Methods
1813
+ *
1814
+ ****************************************************************************/
1815
+
1816
+ void
1817
+ frt_ir_free(void *p)
1818
+ {
1819
+ object_del(p);
1820
+ ir_close((IndexReader *)p);
1821
+ }
1822
+
1823
+ void
1824
+ frt_ir_mark(void *p)
1825
+ {
1826
+ IndexReader *ir = (IndexReader *)p;
1827
+ frt_gc_mark(ir->store);
1828
+ }
1829
+
1830
+ /*
1831
+ * call-seq:
1832
+ * IndexReader.new(dir) -> index_reader
1833
+ *
1834
+ * Create a new IndexReader. You can either pass a string path to a
1835
+ * file-system directory or an actual Ferret::Store::Directory object. For
1836
+ * example;
1837
+ *
1838
+ * dir = RAMDirectory.new()
1839
+ * iw = IndexReader.new(dir)
1840
+ *
1841
+ * dir = FSDirectory.new("/path/to/index")
1842
+ * iw = IndexReader.new(dir)
1843
+ *
1844
+ * iw = IndexReader.new("/path/to/index")
1845
+ */
1846
+ static VALUE
1847
+ frt_ir_init(VALUE self, VALUE rdir)
1848
+ {
1849
+ Store *store = NULL;
1850
+ IndexReader *ir;
1851
+ int i;
1852
+ FieldInfos *fis;
1853
+ VALUE rfield_num_map = rb_hash_new();
1854
+
1855
+ if (TYPE(rdir) == T_ARRAY) {
1856
+ const int reader_cnt = RARRAY(rdir)->len;
1857
+ IndexReader **sub_readers = ALLOC_N(IndexReader *, reader_cnt);
1858
+ int i;
1859
+ for (i = 0; i < reader_cnt; i++) {
1860
+ Data_Get_Struct(RARRAY(rdir)->ptr[i], IndexReader, sub_readers[i]);
1861
+ REF(sub_readers[i]);
1862
+ }
1863
+ ir = mr_open(sub_readers, reader_cnt);
1864
+ } else {
1865
+ switch (TYPE(rdir)) {
1866
+ case T_DATA:
1867
+ store = DATA_PTR(rdir);
1868
+ break;
1869
+ case T_STRING:
1870
+ frt_create_dir(rdir);
1871
+ store = open_fs_store(RSTRING(rdir)->ptr);
1872
+ DEREF(store);
1873
+ break;
1874
+ default:
1875
+ rb_raise(rb_eArgError, "%s isn't a valid directory argument. "
1876
+ "You should use either a String or a Directory",
1877
+ RSTRING(rb_obj_as_string(rdir))->ptr);
1878
+ break;
1879
+ }
1880
+ ir = ir_open(store);
1881
+ }
1882
+ Frt_Wrap_Struct(self, &frt_ir_mark, &frt_ir_free, ir);
1883
+ object_add(ir, self);
1884
+
1885
+ fis = ir->fis;
1886
+ for (i = 0; i < fis->size; i++) {
1887
+ FieldInfo *fi = fis->fields[i];
1888
+ rb_hash_aset(rfield_num_map,
1889
+ ID2SYM(rb_intern(fi->name)),
1890
+ INT2FIX(fi->number));
1891
+ }
1892
+ rb_ivar_set(self, id_fld_num_map, rfield_num_map);
1893
+
1894
+ return self;
1895
+ }
1896
+
1897
+ /*
1898
+ * call-seq:
1899
+ * index_reader.set_norm(doc_id, field, val)
1900
+ *
1901
+ * Expert: change the boost value for a +field+ in document at +doc_id+.
1902
+ * +val+ should be an integer in the range 0..255 which corresponds to an
1903
+ * encoded float value.
1904
+ */
1905
+ static VALUE
1906
+ frt_ir_set_norm(VALUE self, VALUE rdoc_id, VALUE rfield, VALUE rval)
1907
+ {
1908
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
1909
+ ir_set_norm(ir, FIX2INT(rdoc_id), frt_field(rfield), NUM2CHR(rval));
1910
+ return self;
1911
+ }
1912
+
1913
+ /*
1914
+ * call-seq:
1915
+ * index_reader.norms(field) -> string
1916
+ *
1917
+ * Expert: Returns a string containing the norm values for a field. The
1918
+ * string length will be equal to the number of documents in the index and it
1919
+ * could have null bytes.
1920
+ */
1921
+ static VALUE
1922
+ frt_ir_norms(VALUE self, VALUE rfield)
1923
+ {
1924
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
1925
+ uchar *norms;
1926
+ norms = ir_get_norms(ir, frt_field(rfield));
1927
+ if (norms) {
1928
+ return rb_str_new((char *)norms, ir->max_doc(ir));
1929
+ } else {
1930
+ return Qnil;
1931
+ }
1932
+ }
1933
+
1934
+ /*
1935
+ * call-seq:
1936
+ * index_reader.get_norms_into(field, buffer, offset) -> buffer
1937
+ *
1938
+ * Expert: Get the norm values into a string +buffer+ starting at +offset+.
1939
+ */
1940
+ static VALUE
1941
+ frt_ir_get_norms_into(VALUE self, VALUE rfield, VALUE rnorms, VALUE roffset)
1942
+ {
1943
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
1944
+ int offset;
1945
+ offset = FIX2INT(roffset);
1946
+ Check_Type(rnorms, T_STRING);
1947
+ if (RSTRING(rnorms)->len < offset + ir->max_doc(ir)) {
1948
+ rb_raise(rb_eArgError, "supplied a string of length:%d to "
1949
+ "IndexReader#get_norms_into but needed a string of length "
1950
+ "offset:%d + maxdoc:%d",
1951
+ RSTRING(rnorms)->len, offset, ir->max_doc(ir));
1952
+ }
1953
+
1954
+ ir_get_norms_into(ir, frt_field(rfield),
1955
+ (uchar *)RSTRING(rnorms)->ptr + offset);
1956
+ return rnorms;
1957
+ }
1958
+
1959
+ /*
1960
+ * call-seq:
1961
+ * index_reader.commit -> index_reader
1962
+ *
1963
+ * Commit any deletes made by this particular IndexReader to the index. This
1964
+ * will open a Commit lock.
1965
+ */
1966
+ static VALUE
1967
+ frt_ir_commit(VALUE self)
1968
+ {
1969
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
1970
+ ir_commit(ir);
1971
+ return self;
1972
+ }
1973
+
1974
+ /*
1975
+ * call-seq:
1976
+ * index_reader.close -> index_reader
1977
+ *
1978
+ * Close the IndexReader. This method also commits any deletions made by this
1979
+ * IndexReader. This method will be called implicitly by the garbage
1980
+ * collector but you should call it explicitly to commit any changes as soon
1981
+ * as possible and to close any locks held by the object to prevent locking
1982
+ * errors.
1983
+ */
1984
+ static VALUE
1985
+ frt_ir_close(VALUE self)
1986
+ {
1987
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
1988
+ object_del(ir);
1989
+ Frt_Unwrap_Struct(self);
1990
+ ir_close(ir);
1991
+ return self;
1992
+ }
1993
+
1994
+ /*
1995
+ * call-seq:
1996
+ * index_reader.has_deletions? -> bool
1997
+ *
1998
+ * Return true if the index has any deletions, either uncommitted by this
1999
+ * IndexReader or committed by any other IndexReader.
2000
+ */
2001
+ static VALUE
2002
+ frt_ir_has_deletions(VALUE self)
2003
+ {
2004
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2005
+ return ir->has_deletions(ir) ? Qtrue : Qfalse;
2006
+ }
2007
+
2008
+ /*
2009
+ * call-seq:
2010
+ * index_reader.delete(doc_id) -> index_reader
2011
+ *
2012
+ * Delete document referenced internally by document id +doc_id+. The
2013
+ * document_id is the number used to reference documents in the index and is
2014
+ * returned by search methods.
2015
+ */
2016
+ static VALUE
2017
+ frt_ir_delete(VALUE self, VALUE rdoc_id)
2018
+ {
2019
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2020
+ ir_delete_doc(ir, FIX2INT(rdoc_id));
2021
+ return self;
2022
+ }
2023
+
2024
+ /*
2025
+ * call-seq:
2026
+ * index_reader.deleted?(doc_id) -> bool
2027
+ *
2028
+ * Returns true if the document at +doc_id+ has been deleted.
2029
+ */
2030
+ static VALUE
2031
+ frt_ir_is_deleted(VALUE self, VALUE rdoc_id)
2032
+ {
2033
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2034
+ return ir->is_deleted(ir, FIX2INT(rdoc_id)) ? Qtrue : Qfalse;
2035
+ }
2036
+
2037
+ /*
2038
+ * call-seq:
2039
+ * index_reader.max_doc -> number
2040
+ *
2041
+ * Returns 1 + the maximum document id in the index. It is the
2042
+ * document_id that will be used by the next document added to the index. If
2043
+ * there are no deletions, this number also refers to the number of documents
2044
+ * in the index.
2045
+ */
2046
+ static VALUE
2047
+ frt_ir_max_doc(VALUE self)
2048
+ {
2049
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2050
+ return INT2FIX(ir->max_doc(ir));
2051
+ }
2052
+
2053
+ /*
2054
+ * call-seq:
2055
+ * index_reader.num_docs -> number
2056
+ *
2057
+ * Returns the number of accessible (not deleted) documents in the index.
2058
+ * This will be equal to IndexReader#max_doc if there have been no documents
2059
+ * deleted from the index.
2060
+ */
2061
+ static VALUE
2062
+ frt_ir_num_docs(VALUE self)
2063
+ {
2064
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2065
+ return INT2FIX(ir->num_docs(ir));
2066
+ }
2067
+
2068
+ /*
2069
+ * call-seq:
2070
+ * index_reader.undelete_all -> index_reader
2071
+ *
2072
+ * Undelete all deleted documents in the index. This is kind of like a
2073
+ * rollback feature. Note that once an index is committed or a merge happens
2074
+ * during indexing, deletions will be committed and undelete_all will have no
2075
+ * effect on these documents.
2076
+ */
2077
+ static VALUE
2078
+ frt_ir_undelete_all(VALUE self)
2079
+ {
2080
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2081
+ ir_undelete_all(ir);
2082
+ return self;
2083
+ }
2084
+
2085
+ /*
2086
+ * call-seq:
2087
+ * index_reader.get_document(doc_id) -> LazyDoc
2088
+ * index_reader[doc_id] -> LazyDoc
2089
+ *
2090
+ * Retrieve a document from the index. See LazyDoc for more details on the
2091
+ * document returned. Documents are referenced internally by document ids
2092
+ * which are returned by the Searcher's search methods.
2093
+ */
2094
+ static VALUE
2095
+ frt_ir_get_doc(VALUE self, VALUE rdoc_id)
2096
+ {
2097
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2098
+ return frt_get_lazy_doc(ir->get_lazy_doc(ir, FIX2INT(rdoc_id)));
2099
+ }
2100
+
2101
+ /*
2102
+ * call-seq:
2103
+ * index_reader.is_latest? -> bool
2104
+ *
2105
+ * Return true if the index version referenced by this IndexReader is the
2106
+ * latest version of the index. If it isn't you should close and reopen the
2107
+ * index to search the latest documents added to the index.
2108
+ */
2109
+ static VALUE
2110
+ frt_ir_is_latest(VALUE self)
2111
+ {
2112
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2113
+ return ir_is_latest(ir) ? Qtrue : Qfalse;
2114
+ }
2115
+
2116
+ /*
2117
+ * call-seq:
2118
+ * index_reader.term_vector(doc_id, field) -> TermVector
2119
+ *
2120
+ * Return the TermVector for the field +field+ in the document at +doc_id+ in
2121
+ * the index. See TermVector.
2122
+ */
2123
+ static VALUE
2124
+ frt_ir_term_vector(VALUE self, VALUE rdoc_id, VALUE rfield)
2125
+ {
2126
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2127
+ TermVector *tv;
2128
+ VALUE rtv;
2129
+ tv = ir->term_vector(ir, FIX2INT(rdoc_id), frt_field(rfield));
2130
+ rtv = frt_get_tv(tv);
2131
+ tv_destroy(tv);
2132
+ return rtv;
2133
+ }
2134
+
2135
+ static void
2136
+ frt_add_each_tv(void *key, void *value, void *rtvs)
2137
+ {
2138
+ rb_hash_aset((VALUE)rtvs, ID2SYM(rb_intern(key)), frt_get_tv(value));
2139
+ }
2140
+
2141
+ /*
2142
+ * call-seq:
2143
+ * index_reader.term_vectors(doc_id) -> hash of TermVector
2144
+ *
2145
+ * Return the TermVectors for the document at +doc_id+ in the index. The
2146
+ * value returned is a hash of the TermVectors for each field in the document
2147
+ * and they are referenced by field names (as symbols).
2148
+ */
2149
+ static VALUE
2150
+ frt_ir_term_vectors(VALUE self, VALUE rdoc_id)
2151
+ {
2152
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2153
+ HashTable *tvs = ir->term_vectors(ir, FIX2INT(rdoc_id));
2154
+ VALUE rtvs = rb_hash_new();
2155
+ h_each(tvs, &frt_add_each_tv, (void *)rtvs);
2156
+ h_destroy(tvs);
2157
+
2158
+ return rtvs;
2159
+ }
2160
+
2161
+ /*
2162
+ * call-seq:
2163
+ * index_reader.term_docs -> TermDocEnum
2164
+ *
2165
+ * Builds a TermDocEnum (term-document enumerator) for the index. You can use
2166
+ * this object to iterate through the documents in which certain terms occur.
2167
+ * See TermDocEnum for more info.
2168
+ */
2169
+ static VALUE
2170
+ frt_ir_term_docs(VALUE self)
2171
+ {
2172
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2173
+ return frt_get_tde(self, ir->term_docs(ir));
2174
+ }
2175
+
2176
+ /*
2177
+ * call-seq:
2178
+ * index_reader.term_docs_for(field, term) -> TermDocEnum
2179
+ *
2180
+ * Builds a TermDocEnum to iterate through the documents that contain the
2181
+ * term +term+ in the field +field+. See TermDocEnum for more info.
2182
+ */
2183
+ static VALUE
2184
+ frt_ir_term_docs_for(VALUE self, VALUE rfield, VALUE rterm)
2185
+ {
2186
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2187
+ return frt_get_tde(self, ir_term_docs_for(ir,
2188
+ frt_field(rfield),
2189
+ StringValuePtr(rterm)));
2190
+ }
2191
+
2192
+ /*
2193
+ * call-seq:
2194
+ * index_reader.term_positions -> TermDocEnum
2195
+ *
2196
+ * Same as IndexReader#term_docs except the TermDocEnum will also allow you
2197
+ * to scan through the positions at which a term occurs. See TermDocEnum for
2198
+ * more info.
2199
+ */
2200
+ static VALUE
2201
+ frt_ir_term_positions(VALUE self)
2202
+ {
2203
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2204
+ return frt_get_tde(self, ir->term_positions(ir));
2205
+ }
2206
+
2207
+ /*
2208
+ * call-seq:
2209
+ * index_reader.term_positions_for(field, term) -> TermDocEnum
2210
+ *
2211
+ * Same as IndexReader#term_docs_for(field, term) except the TermDocEnum will
2212
+ * also allow you to scan through the positions at which a term occurs. See
2213
+ * TermDocEnum for more info.
2214
+ */
2215
+ static VALUE
2216
+ frt_ir_t_pos_for(VALUE self, VALUE rfield, VALUE rterm)
2217
+ {
2218
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2219
+ return frt_get_tde(self, ir_term_positions_for(ir,
2220
+ frt_field(rfield),
2221
+ StringValuePtr(rterm)));
2222
+ }
2223
+
2224
+ /*
2225
+ * call-seq:
2226
+ * index_reader.doc_freq(field, term) -> integer
2227
+ *
2228
+ * Return the number of documents in which the term +term+ appears in the
2229
+ * field +field+.
2230
+ */
2231
+ static VALUE
2232
+ frt_ir_doc_freq(VALUE self, VALUE rfield, VALUE rterm)
2233
+ {
2234
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2235
+ return INT2FIX(ir_doc_freq(ir,
2236
+ frt_field(rfield),
2237
+ StringValuePtr(rterm)));
2238
+ }
2239
+
2240
+ /*
2241
+ * call-seq:
2242
+ * index_reader.terms(field) -> TermEnum
2243
+ *
2244
+ * Returns a term enumerator which allows you to iterate through all the
2245
+ * terms in the field +field+ in the index.
2246
+ */
2247
+ static VALUE
2248
+ frt_ir_terms(VALUE self, VALUE rfield)
2249
+ {
2250
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2251
+ return frt_get_te(self, ir_terms(ir, frt_field(rfield)));
2252
+ }
2253
+
2254
+ /*
2255
+ * call-seq:
2256
+ * index_reader.terms_from(field, term) -> TermEnum
2257
+ *
2258
+ * Same as IndexReader#terms(field) except that it starts the enumerator off
2259
+ * at term +term+.
2260
+ */
2261
+ static VALUE
2262
+ frt_ir_terms_from(VALUE self, VALUE rfield, VALUE rterm)
2263
+ {
2264
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2265
+ return frt_get_te(self, ir_terms_from(ir,
2266
+ frt_field(rfield),
2267
+ StringValuePtr(rterm)));
2268
+ }
2269
+
2270
+ /*
2271
+ * call-seq:
2272
+ * index_reader.field_names -> array of field-names
2273
+ *
2274
+ * Returns an array of field names in the index. This can be used to pass to
2275
+ * the QueryParser so that the QueryParser knows how to expand the "*"
2276
+ * wild-card to all fields in the index. A list of field names can also be
2277
+ * gathered from the FieldInfos object.
2278
+ */
2279
+ static VALUE
2280
+ frt_ir_field_names(VALUE self)
2281
+ {
2282
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2283
+ FieldInfos *fis = ir->fis;
2284
+ VALUE rfield_names = rb_ary_new();
2285
+ int i;
2286
+ for (i = 0; i < fis->size; i++) {
2287
+ rb_ary_push(rfield_names, ID2SYM(rb_intern(fis->fields[i]->name)));
2288
+ }
2289
+ return rfield_names;
2290
+ }
2291
+
2292
+ /*
2293
+ * call-seq:
2294
+ * index_reader.field_infos -> FieldInfos
2295
+ *
2296
+ * Get the FieldInfos object for this IndexReader.
2297
+ */
2298
+ static VALUE
2299
+ frt_ir_field_infos(VALUE self)
2300
+ {
2301
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2302
+ return frt_get_field_infos(ir->fis);
2303
+ }
2304
+
2305
+ /****************************************************************************
2306
+ *
2307
+ * Init Functions
2308
+ *
2309
+ ****************************************************************************/
2310
+
2311
+
2312
+ /*
2313
+ * Document-class: Ferret::Index::FieldInfo
2314
+ *
2315
+ * == Summary
2316
+ *
2317
+ * The FieldInfo class is the field descriptor for the index. It specifies
2318
+ * whether a field is compressed or not or whether it should be indexed and
2319
+ * tokenized. Every field has a name which must be a symbol. There are three
2320
+ * properties that you can set, +:store+, +:index+ and +:term_vector+. You
2321
+ * can also set the default +:boost+ for a field as well.
2322
+ *
2323
+ * == Properties
2324
+ *
2325
+ * === :store
2326
+ *
2327
+ * The +:store+ property allows you to specify how a field is stored. You can
2328
+ * leave a field unstored (+:no+), store it in its original format (+:yes+)
2329
+ * or store it in compressed format (+:compressed+). By default the document
2330
+ * is stored in its original format. If the field is large and it is stored
2331
+ * elsewhere where it is easily accessible you might want to leave it
2332
+ * unstored. This will keep the index size a lot smaller and make the
2333
+ * indexing process a lot faster. For example, you should probably leave the
2334
+ * +:content+ field unstored when indexing all the documents in your
2335
+ * file-system.
2336
+ *
2337
+ * === :index
2338
+ *
2339
+ * The +:index+ property allows you to specify how a field is indexed. A
2340
+ * field must be indexed to be searchable. However, a field doesn't need to
2341
+ * be indexed to be stored in the Ferret index. You may want to use the index
2342
+ * as a simple database and store things like images or MP3s in the index. By
2343
+ * default each field is indexed and tokenized (split into tokens) (+:yes+).
2344
+ * If you don't want to index the field use +:no+. If you want the field
2345
+ * indexed but not tokenized, use +:untokenized+. Do this for the fields you
2346
+ * wish to sort by. There are two other values for +:index+; +:omit_norms+
2347
+ * and +:untokenized_omit_norms+. These values correspond to +:yes+ and
2348
+ * +:untokenized+ respectively and are useful if you are not boosting any
2349
+ * fields and you'd like to speed up the index. The norms file is the file
2350
+ * which contains the boost values for each document for a particular field.
2351
+ *
2352
+ * === :term_vector
2353
+ *
2354
+ * See TermVector for a description of term-vectors. You can specify whether
2355
+ * or not you would like to store term-vectors. The available options are
2356
+ * +:no+, +:yes+, +:with_positions+, +:with_offsets+ and
2357
+ * +:with_positions_offsets+. Note that you need to store the positions to
2358
+ * associate offsets with individual terms in the term_vector.
2359
+ *
2360
+ * == Property Table
2361
+ *
2362
+ * Property Value Description
2363
+ * ------------------------------------------------------------------------
2364
+ * :store | :no | Don't store field
2365
+ * | |
2366
+ * | :yes (default) | Store field in its original
2367
+ * | | format. Use this value if you
2368
+ * | | want to highlight matches
2369
+ * | | or print match excerpts a la
2370
+ * | | Google search.
2371
+ * | |
2372
+ * | :compressed | Store field in compressed
2373
+ * | | format.
2374
+ * -------------|-------------------------|------------------------------
2375
+ * :index | :no | Do not make this field
2376
+ * | | searchable.
2377
+ * | |
2378
+ * | :yes (default) | Make this field searchable and
2379
+ * | | tokenize its contents.
2380
+ * | |
2381
+ * | :untokenized | Make this field searchable but
2382
+ * | | do not tokenize its contents.
2383
+ * | | use this value for fields you
2384
+ * | | wish to sort by.
2385
+ * | |
2386
+ * | :omit_norms | Same as :yes except omit the
2387
+ * | | norms file. The norms file can
2388
+ * | | be omitted if you don't boost
2389
+ * | | any fields and you don't need
2390
+ * | | scoring based on field length.
2391
+ * | |
2392
+ * | :untokenized_omit_norms | Same as :untokenized except omit
2393
+ * | | the norms file. Norms files can
2394
+ * | | be omitted if you don't boost
2395
+ * | | any fields and you don't need
2396
+ * | | scoring based on field length.
2397
+ * | |
2398
+ * -------------|-------------------------|------------------------------
2399
+ * :term_vector | :no | Don't store term-vectors
2400
+ * | |
2401
+ * | :yes | Store term-vectors without
2402
+ * | | storing positions or offsets.
2403
+ * | |
2404
+ * | :with_positions | Store term-vectors with
2405
+ * | | positions.
2406
+ * | |
2407
+ * | :with_offsets | Store term-vectors with
2408
+ * | | offsets.
2409
+ * | |
2410
+ * | :with_positions_offsets | Store term-vectors with
2411
+ * | (default) | positions and offsets.
2412
+ *
2413
+ * == Examples
2414
+ *
2415
+ * fi = FieldInfo.new(:title, :index => :untokenized, :term_vector => :no,
2416
+ * :boost => 10.0)
2417
+ *
2418
+ * fi = FieldInfo.new(:content)
2419
+ *
2420
+ * fi = FieldInfo.new(:created_on, :index => :untokenized_omit_norms,
2421
+ * :term_vector => :no)
2422
+ *
2423
+ * fi = FieldInfo.new(:image, :store => :compressed, :index => :no,
2424
+ * :term_vector => :no)
2425
+ */
2426
+ static void
2427
+ Init_FieldInfo(void)
2428
+ {
2429
+ sym_store = ID2SYM(rb_intern("store"));
2430
+ sym_index = ID2SYM(rb_intern("index"));
2431
+ sym_term_vector = ID2SYM(rb_intern("term_vector"));
2432
+
2433
+ sym_compress = ID2SYM(rb_intern("compress"));
2434
+ sym_compressed = ID2SYM(rb_intern("compressed"));
2435
+
2436
+ sym_untokenized = ID2SYM(rb_intern("untokenized"));
2437
+ sym_omit_norms = ID2SYM(rb_intern("omit_norms"));
2438
+ sym_untokenized_omit_norms = ID2SYM(rb_intern("untokenized_omit_norms"));
2439
+
2440
+ sym_with_positions = ID2SYM(rb_intern("with_positions"));
2441
+ sym_with_offsets = ID2SYM(rb_intern("with_offsets"));
2442
+ sym_with_positions_offsets = ID2SYM(rb_intern("with_positions_offsets"));
2443
+
2444
+ cFieldInfo = rb_define_class_under(mIndex, "FieldInfo", rb_cObject);
2445
+ rb_define_alloc_func(cFieldInfo, frt_data_alloc);
2446
+
2447
+ rb_define_method(cFieldInfo, "initialize", frt_fi_init, -1);
2448
+ rb_define_method(cFieldInfo, "stored?", frt_fi_is_stored, 0);
2449
+ rb_define_method(cFieldInfo, "compressed?", frt_fi_is_compressed, 0);
2450
+ rb_define_method(cFieldInfo, "indexed?", frt_fi_is_indexed, 0);
2451
+ rb_define_method(cFieldInfo, "tokenized?", frt_fi_is_tokenized, 0);
2452
+ rb_define_method(cFieldInfo, "omit_norms?", frt_fi_omit_norms, 0);
2453
+ rb_define_method(cFieldInfo, "store_term_vector?",
2454
+ frt_fi_store_term_vector, 0);
2455
+ rb_define_method(cFieldInfo, "store_positions?",
2456
+ frt_fi_store_positions, 0);
2457
+ rb_define_method(cFieldInfo, "store_offsets?",
2458
+ frt_fi_store_offsets, 0);
2459
+ rb_define_method(cFieldInfo, "has_norms?", frt_fi_has_norms, 0);
2460
+ rb_define_method(cFieldInfo, "boost", frt_fi_boost, 0);
2461
+ rb_define_method(cFieldInfo, "to_s", frt_fi_to_s, 0);
2462
+ }
2463
+
2464
+ /*
2465
+ * Document-class: Ferret::Index::FieldInfos
2466
+ *
2467
+ * == Summary
2468
+ *
2469
+ * The FieldInfos class holds all the field descriptors for an index. It is
2470
+ * this class that is used to create a new index using the
2471
+ * FieldInfos#create_index method. If you are happy with the default
2472
+ * properties for FieldInfo then you don't need to worry about this class.
2473
+ * IndexWriter can create the index for you. Otherwise you should set up the
2474
+ * index like in the example;
2475
+ *
2476
+ * == Example
2477
+ *
2478
+ * field_infos = FieldInfos.new(:term_vector => :no)
2479
+ *
2480
+ * field_infos.add_field(:title, :index => :untokenized, :term_vector => :no,
2481
+ * :boost => 10.0)
2482
+ *
2483
+ * field_infos.add_field(:content)
2484
+ *
2485
+ * field_infos.add_field(:created_on, :index => :untokenized_omit_norms,
2486
+ * :term_vector => :no)
2487
+ *
2488
+ * field_infos.add_field(:image, :store => :compressed, :index => :no,
2489
+ * :term_vector => :no)
2490
+ *
2491
+ * field_infos.create_index("/path/to/index")
2492
+ *
2493
+ * == Default Properties
2494
+ *
2495
+ * See FieldInfo for the available field property values.
2496
+ *
2497
+ * When you create the FieldInfos object you specify the default properties
2498
+ * for the fields. Often you'll specify all of the fields in the index before
2499
+ * you create the index so the default values won't come into play. However,
2500
+ * it is possible to continue to dynamically add fields as indexing goes
2501
+ * along. If you add a document to the index which has fields that the index
2502
+ * doesn't know about then the default properties are used for the new field.
2503
+ */
2504
+ static void
2505
+ Init_FieldInfos(void)
2506
+ {
2507
+ Init_FieldInfo();
2508
+
2509
+ cFieldInfos = rb_define_class_under(mIndex, "FieldInfos", rb_cObject);
2510
+ rb_define_alloc_func(cFieldInfos, frt_data_alloc);
2511
+
2512
+ rb_define_method(cFieldInfos, "initialize", frt_fis_init, -1);
2513
+ rb_define_method(cFieldInfos, "to_a", frt_fis_to_a, 0);
2514
+ rb_define_method(cFieldInfos, "[]", frt_fis_get, 1);
2515
+ rb_define_method(cFieldInfos, "add", frt_fis_add, 1);
2516
+ rb_define_method(cFieldInfos, "<<", frt_fis_add, 1);
2517
+ rb_define_method(cFieldInfos, "add_field", frt_fis_add_field, -1);
2518
+ rb_define_method(cFieldInfos, "each", frt_fis_each, 0);
2519
+ rb_define_method(cFieldInfos, "to_s", frt_fis_to_s, 0);
2520
+ rb_define_method(cFieldInfos, "create_index",
2521
+ frt_fis_create_index, 1);
2522
+ rb_define_method(cFieldInfos, "fields", frt_fis_get_fields, -1);
2523
+ }
2524
+
2525
+ /*
2526
+ * Document-class: Ferret::Index::TermEnum
2527
+ *
2528
+ * == Summary
2529
+ *
2530
+ * The TermEnum object is used to iterate through the terms in a field. To
2531
+ * get a TermEnum you need to use the IndexReader#terms(field) method.
2532
+ *
2533
+ * == Example
2534
+ *
2535
+ * te = index_reader.terms(:content)
2536
+ *
2537
+ * te.each {|term, doc_freq| puts "#{term} occurred #{doc_freq} times" }
2538
+ *
2539
+ * # or you could do it like this;
2540
+ * te = index_reader.terms(:content)
2541
+ *
2542
+ * while te.next?
2543
+ * puts "#{te.term} occurred in #{te.doc_freq} documents in the index"
2544
+ * end
2545
+ */
2546
+ static void
2547
+ Init_TermEnum(void)
2548
+ {
2549
+ id_term = rb_intern("@term");
2550
+
2551
+ cTermEnum = rb_define_class_under(mIndex, "TermEnum", rb_cObject);
2552
+ rb_define_alloc_func(cTermEnum, frt_data_alloc);
2553
+
2554
+ rb_define_method(cTermEnum, "next?", frt_te_next, 0);
2555
+ rb_define_method(cTermEnum, "term", frt_te_term, 0);
2556
+ rb_define_method(cTermEnum, "doc_freq", frt_te_doc_freq, 0);
2557
+ rb_define_method(cTermEnum, "skip_to", frt_te_skip_to, 1);
2558
+ rb_define_method(cTermEnum, "each", frt_te_each, 0);
2559
+ rb_define_method(cTermEnum, "field=", frt_te_set_field, 1);
2560
+ rb_define_method(cTermEnum, "set_field",frt_te_set_field, 1);
2561
+ }
2562
+
2563
+ /*
2564
+ * Document-class: Ferret::Index::TermDocEnum
2565
+ *
2566
+ * == Summary
2567
+ *
2568
+ * Use a TermDocEnum to iterate through the documents that contain a
2569
+ * particular term. You can also iterate through the positions at which the term
2570
+ * occurs in a document.
2571
+ *
2572
+ *
2573
+ * == Example
2574
+ *
2575
+ * tde = index_reader.term_docs_for(:content, "fox")
2576
+ *
2577
+ * tde.each do |doc_id, freq|
2578
+ * puts "fox appeared #{freq} times in document #{doc_id}:"
2579
+ * positions = []
2580
+ * tde.each_position {|pos| positions << pos}
2581
+ * puts " #{positions.join(', ')}"
2582
+ * end
2583
+ *
2584
+ * # or you can do it like this;
2585
+ * tde.seek(:title, "red")
2586
+ * while tde.next?
2587
+ * puts "red appeared #{tde.freq} times in document #{tde.doc}:"
2588
+ * positions = []
2589
+ * while pos = tde.next_position
2590
+ * positions << pos
2591
+ * end
2592
+ * puts " #{positions.join(', ')}"
2593
+ * end
2594
+ */
2595
+ static void
2596
+ Init_TermDocEnum(void)
2597
+ {
2598
+ id_fld_num_map = rb_intern("@field_num_map");
2599
+ id_field_num = rb_intern("@field_num");
2600
+
2601
+ cTermDocEnum = rb_define_class_under(mIndex, "TermDocEnum", rb_cObject);
2602
+ rb_define_alloc_func(cTermDocEnum, frt_data_alloc);
2603
+ rb_define_method(cTermDocEnum, "seek", frt_tde_seek, 2);
2604
+ rb_define_method(cTermDocEnum, "seek_term_enum", frt_tde_seek_te, 1);
2605
+ rb_define_method(cTermDocEnum, "doc", frt_tde_doc, 0);
2606
+ rb_define_method(cTermDocEnum, "freq", frt_tde_freq, 0);
2607
+ rb_define_method(cTermDocEnum, "next?", frt_tde_next, 0);
2608
+ rb_define_method(cTermDocEnum, "next_position", frt_tde_next_position, 0);
2609
+ rb_define_method(cTermDocEnum, "each", frt_tde_each, 0);
2610
+ rb_define_method(cTermDocEnum, "each_position", frt_tde_each_position, 0);
2611
+ rb_define_method(cTermDocEnum, "skip_to", frt_tde_skip_to, 1);
2612
+ }
2613
+
2614
+ /*
2615
+ * Document-class: Ferret::Index::TermVector::TVOffsets
2616
+ *
2617
+ * == Summary
2618
+ *
2619
+ * Holds the start and end byte-offsets of a term in a field. For example, if
2620
+ * the field was "the quick brown fox" then the start and end offsets of
2621
+ * ["the", "quick", "brown", "fox"] would be [(0,3), (4,9), (10,15), (16,19)]
2622
+ * respectively. See the Analysis module for more information on setting the
2623
+ * offsets.
2624
+ */
2625
+ static void
2626
+ Init_TVOffsets(void)
2627
+ { /* Defines the TVOffsets Struct class with members "start" and "end". */
2628
+ const char *tv_offsets_class = "TVOffsets";
2629
+ cTVOffsets = rb_struct_define(tv_offsets_class, "start", "end", NULL); /* NULL terminates the member-name list */
2630
+ rb_set_class_path(cTVOffsets, cTermVector, tv_offsets_class); /* reads cTermVector, so Init_TermVector must have assigned it already */
2631
+ rb_const_set(mIndex, rb_intern(tv_offsets_class), cTVOffsets); /* NOTE(review): the constant is set on mIndex while the class path above is under cTermVector; confirm this is intended */
2632
+ }
2633
+
2634
+ /*
2635
+ * Document-class: Ferret::Index::TermVector::TVTerm
2636
+ *
2637
+ * == Summary
2638
+ *
2639
+ * The TVTerm class holds the term information for each term in a TermVector.
2640
+ * That is, it holds the term's text and its positions in the document. You
2641
+ * can use those positions to reference the offsets for the term.
2642
+ *
2643
+ * == Example
2644
+ *
2645
+ * tv = index_reader.term_vector(:content)
2646
+ * tv_term = tv.terms.find {|tvt| tvt.text == "fox"}
2647
+ * offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
2648
+ */
2649
+ static void
2650
+ Init_TVTerm(void)
2651
+ { /* Defines the TVTerm Struct class with members "text" and "positions". */
2652
+ const char *tv_term_class = "TVTerm";
2653
+ cTVTerm = rb_struct_define(tv_term_class, "text", "positions", NULL); /* NULL terminates the member-name list */
2654
+ rb_set_class_path(cTVTerm, cTermVector, tv_term_class); /* reads cTermVector, so Init_TermVector must have assigned it already */
2655
+ rb_const_set(mIndex, rb_intern(tv_term_class), cTVTerm); /* NOTE(review): the constant is set on mIndex while the class path above is under cTermVector; confirm this is intended */
2656
+ }
2657
+
2658
+ /*
2659
+ * Document-class: Ferret::Index::TermVector
2660
+ *
2661
+ * == Summary
2662
+ *
2663
+ * TermVectors are most commonly used for creating search result excerpts and
2664
+ * highlighting search matches in results. This is all done internally so you
2665
+ * won't need to worry about the TermVector object. There are some other
2666
+ * reasons you may want to use the TermVectors object however. For example,
2667
+ * you may wish to see which terms are the most commonly occurring terms in a
2668
+ * document to implement a MoreLikeThis search.
2669
+ *
2670
+ * == Example
2671
+ *
2672
+ * tv = index_reader.term_vector(:content)
2673
+ * tv_term = tv.terms.find {|tvt| tvt.text == "fox"}
2674
+ *
2675
+ * # get the term frequency
2676
+ * term_freq = tv_term.positions.size
2677
+ *
2678
+ * # get the offsets for a term
2679
+ * offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
2680
+ *
2681
+ * == Note
2682
+ *
2683
+ * +positions+ and +offsets+ can be +nil+ depending on what you set the
2684
+ * +:term_vector+ to when you set the FieldInfo object for the field. Note in
2685
+ * particular that you need to store both positions and offsets if you want
2686
+ * to associate offsets with particular terms.
2687
+ */
2688
+ static void
2689
+ Init_TermVector(void)
2690
+ { /* Defines the TermVector Struct class (members "field", "terms", "offsets"), then its two helper Struct classes. */
2691
+ const char *tv_class = "TermVector";
2692
+ cTermVector = rb_struct_define(tv_class,
2693
+ "field", "terms", "offsets", NULL);
2694
+ rb_set_class_path(cTermVector, mIndex, tv_class);
2695
+ rb_const_set(mIndex, rb_intern(tv_class), cTermVector);
2696
+
2697
+ Init_TVOffsets(); /* both helpers read cTermVector, so they are called only after it is assigned above */
2698
+ Init_TVTerm();
2699
+ }
2700
+
2701
+ /*
2702
+ * Document-class: Ferret::Index::IndexWriter
2703
+ *
2704
+ * == Summary
2705
+ *
2706
+ * The IndexWriter is the class used to add documents to an index. You can
2707
+ * also delete documents from the index using this class. The indexing
2708
+ * process is highly customizable and the IndexWriter has the following
2709
+ * parameters;
2710
+ *
2711
+ * dir:: This is a Ferret::Store::Directory object. You
2712
+ * should either pass a +:dir+ or a +:path+ when
2713
+ * creating an index.
2714
+ * path:: A string representing the path to the index
2715
+ * directory. If you are creating the index for the
2716
+ * first time the directory will be created if it's
2717
+ * missing. You should not choose a directory which
2718
+ * contains other files as they could be over-written.
2719
+ * To protect against this set +:create_if_missing+ to
2720
+ * false.
2721
+ * create_if_missing:: Default: true. Create the index if no index is
2722
+ * found in the specified directory. Otherwise, use
2723
+ * the existing index.
2724
+ * create:: Default: false. Creates the index, even if one
2725
+ * already exists. That means any existing index will
2726
+ * be deleted. It is probably better to use the
2727
+ * create_if_missing option so that the index is only
2728
+ * created the first time when it doesn't exist.
2729
+ * field_infos:: Default: FieldInfos.new. The FieldInfos object to use
2730
+ * when creating a new index if +:create_if_missing+ or
2731
+ * +:create+ is set to true. If an existing index is
2732
+ * opened then this parameter is ignored.
2733
+ * analyzer:: Default: Ferret::Analysis::StandardAnalyzer.
2734
+ * Sets the default analyzer for the index. This is
2735
+ * used by both the IndexWriter and the QueryParser
2736
+ * to tokenize the input. The default is the
2737
+ * StandardAnalyzer.
2738
+ * chunk_size:: Default: 0x100000 or 1Mb. Memory performance tuning
2739
+ * parameter. Sets the default size of chunks of memory
2740
+ * malloced for use during indexing. You can usually
2741
+ * leave this parameter as is.
2742
+ * max_buffer_memory:: Default: 0x1000000 or 16Mb. Memory performance
2743
+ * tuning parameter. Sets the amount of memory to be
2744
+ * used by the indexing process. Set to a larger value
2745
+ * to increase indexing speed. Note that this only
2746
+ * includes memory used by the indexing process, not
2747
+ * the rest of your ruby application.
2748
+ * term_index_interval:: Default: 128. The skip interval between terms in the
2749
+ * term dictionary. A smaller value will possibly
2750
+ * increase search performance while also increasing
2751
+ * memory usage and negatively impacting
2752
+ * indexing performance.
2753
+ * doc_skip_interval:: Default: 16. The skip interval for document numbers
2754
+ * in the index. As with +:term_index_interval+ you
2755
+ * have a trade-off. A smaller number may increase
2756
+ * search performance while also increasing memory
2757
+ * usage and negatively impacting indexing
2758
+ * performance.
2759
+ * merge_factor:: Default: 10. This must never be less than 2.
2760
+ * Specifies the number of segments of a certain size
2761
+ * that must exist before they are merged. A larger
2762
+ * value will improve indexing performance while
2763
+ * slowing search performance.
2764
+ * max_buffered_docs:: Default: 10000. The maximum number of documents that
2765
+ * may be stored in memory before being written to the
2766
+ * index. If you have a lot of memory and are indexing
2767
+ * a large number of small documents (like products in
2768
+ * a product database for example) you may want to set
2769
+ * this to a much higher number (like
2770
+ * Ferret::FIX_INT_MAX). If you are worried about your
2771
+ * application crashing in the middle of indexing you
2772
+ * might set this to a smaller number so that the index
2773
+ * is committed more often. This is like having an
2774
+ * auto-save in a word processor application.
2775
+ * max_merge_docs:: Set this value to limit the number of documents that
2776
+ * go into a single segment. Use this to avoid
2777
+ * extremely long merge times during indexing which can
2778
+ * make your application seem unresponsive. This is
2779
+ * only necessary for very large indexes (millions of
2780
+ * documents).
2781
+ * max_field_length:: Default: 10000. The maximum number of terms added to
2782
+ * a single field. This can be useful to protect the
2783
+ * indexer when indexing documents from the web for
2784
+ * example. Usually the most important terms will occur
2785
+ * early on in a document so you can often safely
2786
+ * ignore the terms in a field after a certain number
2787
+ * of them. If you wanted to speed up indexing and save
2788
+ * space in your index you may only want to index the
2789
+ * first 1000 terms in a field. On the other hand, if
2790
+ * you want to be more thorough and you are indexing
2791
+ * documents from your file-system you may set this
2792
+ * parameter to Ferret::FIX_INT_MAX.
2793
+ * use_compound_file:: Default: true. Uses a compound file to store the
2794
+ * index. This prevents an error being raised for
2795
+ * having too many files open at the same time. The
2796
+ * default is true but performance is better if this is
2797
+ * set to false.
2798
+ *
2799
+ *
2800
+ * === Deleting Documents
2801
+ *
2802
+ * Both IndexReader and IndexWriter allow you to delete documents. You should
2803
+ * use the IndexReader to delete documents by document id and IndexWriter to
2804
+ * delete documents by term which we'll explain now. It is preferable to
2805
+ * delete documents from an index using IndexWriter for performance reasons.
2806
+ * To delete documents using the IndexWriter you should give each document in
2807
+ * the index a unique ID. If you are indexing documents from the file-system
2808
+ * this unique ID will be the full file path. If indexing documents from the
2809
+ * database you should use the primary key as the ID field. You can then
2810
+ * use the delete method to delete a file referenced by the ID. For example;
2811
+ *
2812
+ * index_writer.delete(:id, "/path/to/indexed/file")
2813
+ */
2814
+ void
2815
+ Init_IndexWriter(void)
2816
+ { /* Defines the IndexWriter class under mIndex: interns option symbols, publishes constants, binds methods. */
2817
+ id_boost = rb_intern("boost");
2818
+
2819
+ sym_create = ID2SYM(rb_intern("create")); /* Ruby symbols, stored in file-level variables */
2820
+ sym_create_if_missing = ID2SYM(rb_intern("create_if_missing"));
2821
+ sym_field_infos = ID2SYM(rb_intern("field_infos"));
2822
+
2823
+ sym_chunk_size = ID2SYM(rb_intern("chunk_size"));
2824
+ sym_max_buffer_memory = ID2SYM(rb_intern("max_buffer_memory"));
2825
+ sym_index_interval = ID2SYM(rb_intern("term_index_interval")); /* C name is shorter than the Ruby symbol :term_index_interval */
2826
+ sym_skip_interval = ID2SYM(rb_intern("doc_skip_interval")); /* C name is shorter than the Ruby symbol :doc_skip_interval */
2827
+ sym_merge_factor = ID2SYM(rb_intern("merge_factor"));
2828
+ sym_max_buffered_docs = ID2SYM(rb_intern("max_buffered_docs"));
2829
+ sym_max_merge_docs = ID2SYM(rb_intern("max_merge_docs"));
2830
+ sym_max_field_length = ID2SYM(rb_intern("max_field_length"));
2831
+ sym_use_compound_file = ID2SYM(rb_intern("use_compound_file"));
2832
+
2833
+ cIndexWriter = rb_define_class_under(mIndex, "IndexWriter", rb_cObject);
2834
+ rb_define_alloc_func(cIndexWriter, frt_data_alloc);
2835
+
2836
+ rb_define_const(cIndexWriter, "WRITE_LOCK_TIMEOUT", INT2FIX(1)); /* literal values; units are not stated here */
2837
+ rb_define_const(cIndexWriter, "COMMIT_LOCK_TIMEOUT", INT2FIX(10));
2838
+ rb_define_const(cIndexWriter, "WRITE_LOCK_NAME",
2839
+ rb_str_new2(WRITE_LOCK_NAME));
2840
+ rb_define_const(cIndexWriter, "COMMIT_LOCK_NAME",
2841
+ rb_str_new2(COMMIT_LOCK_NAME));
2842
+ rb_define_const(cIndexWriter, "DEFAULT_CHUNK_SIZE", /* the DEFAULT_* constants mirror fields of default_config */
2843
+ INT2FIX(default_config.chunk_size));
2844
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_BUFFER_MEMORY",
2845
+ INT2FIX(default_config.max_buffer_memory));
2846
+ rb_define_const(cIndexWriter, "DEFAULT_TERM_INDEX_INTERVAL",
2847
+ INT2FIX(default_config.index_interval));
2848
+ rb_define_const(cIndexWriter, "DEFAULT_DOC_SKIP_INTERVAL",
2849
+ INT2FIX(default_config.skip_interval));
2850
+ rb_define_const(cIndexWriter, "DEFAULT_MERGE_FACTOR",
2851
+ INT2FIX(default_config.merge_factor));
2852
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_BUFFERED_DOCS",
2853
+ INT2FIX(default_config.max_buffered_docs));
2854
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_MERGE_DOCS",
2855
+ INT2FIX(default_config.max_merge_docs));
2856
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_FIELD_LENGTH",
2857
+ INT2FIX(default_config.max_field_length));
2858
+ rb_define_const(cIndexWriter, "DEFAULT_USE_COMPOUND_FILE",
2859
+ default_config.use_compound_file ? Qtrue : Qfalse); /* C truth value mapped to Ruby true/false */
2860
+
2861
+ rb_define_method(cIndexWriter, "initialize", frt_iw_init, -1); /* -1: variable argument count, passed to the C function as argc/argv */
2862
+ rb_define_method(cIndexWriter, "doc_count", frt_iw_get_doc_count, 0);
2863
+ rb_define_method(cIndexWriter, "close", frt_iw_close, 0);
2864
+ rb_define_method(cIndexWriter, "add_document", frt_iw_add_doc, 1);
2865
+ rb_define_method(cIndexWriter, "<<", frt_iw_add_doc, 1); /* bound to the same C function as add_document */
2866
+ rb_define_method(cIndexWriter, "optimize", frt_iw_optimize, 0);
2867
+ rb_define_method(cIndexWriter, "commit", frt_iw_commit, 0);
2868
+ rb_define_method(cIndexWriter, "add_readers", frt_iw_add_readers, 1);
2869
+ rb_define_method(cIndexWriter, "delete", frt_iw_delete, 2);
2870
+ rb_define_method(cIndexWriter, "field_infos", frt_iw_field_infos, 0);
2871
+ rb_define_method(cIndexWriter, "analyzer", frt_iw_get_analyzer, 0);
2872
+ rb_define_method(cIndexWriter, "analyzer=", frt_iw_set_analyzer, 1);
2873
+
2874
+ rb_define_method(cIndexWriter, "chunk_size", /* from here on: one reader/writer method pair per configuration option */
2875
+ frt_iw_get_chunk_size, 0);
2876
+ rb_define_method(cIndexWriter, "chunk_size=",
2877
+ frt_iw_set_chunk_size, 1);
2878
+
2879
+ rb_define_method(cIndexWriter, "max_buffer_memory",
2880
+ frt_iw_get_max_buffer_memory, 0);
2881
+ rb_define_method(cIndexWriter, "max_buffer_memory=",
2882
+ frt_iw_set_max_buffer_memory, 1);
2883
+
2884
+ rb_define_method(cIndexWriter, "term_index_interval",
2885
+ frt_iw_get_index_interval, 0);
2886
+ rb_define_method(cIndexWriter, "term_index_interval=",
2887
+ frt_iw_set_index_interval, 1);
2888
+
2889
+ rb_define_method(cIndexWriter, "doc_skip_interval",
2890
+ frt_iw_get_skip_interval, 0);
2891
+ rb_define_method(cIndexWriter, "doc_skip_interval=",
2892
+ frt_iw_set_skip_interval, 1);
2893
+
2894
+ rb_define_method(cIndexWriter, "merge_factor",
2895
+ frt_iw_get_merge_factor, 0);
2896
+ rb_define_method(cIndexWriter, "merge_factor=",
2897
+ frt_iw_set_merge_factor, 1);
2898
+
2899
+ rb_define_method(cIndexWriter, "max_buffered_docs",
2900
+ frt_iw_get_max_buffered_docs, 0);
2901
+ rb_define_method(cIndexWriter, "max_buffered_docs=",
2902
+ frt_iw_set_max_buffered_docs, 1);
2903
+
2904
+ rb_define_method(cIndexWriter, "max_merge_docs",
2905
+ frt_iw_get_max_merge_docs, 0);
2906
+ rb_define_method(cIndexWriter, "max_merge_docs=",
2907
+ frt_iw_set_max_merge_docs, 1);
2908
+
2909
+ rb_define_method(cIndexWriter, "max_field_length",
2910
+ frt_iw_get_max_field_length, 0);
2911
+ rb_define_method(cIndexWriter, "max_field_length=",
2912
+ frt_iw_set_max_field_length, 1);
2913
+
2914
+ rb_define_method(cIndexWriter, "use_compound_file",
2915
+ frt_iw_get_use_compound_file, 0);
2916
+ rb_define_method(cIndexWriter, "use_compound_file=",
2917
+ frt_iw_set_use_compound_file, 1);
2918
+
2919
+ }
2920
+
2921
+ /*
2922
+ * Document-class: Ferret::Index::LazyDoc
2923
+ *
2924
+ * == Summary
2925
+ *
2926
+ * When a document is retrieved from the index a LazyDoc is returned.
2927
+ * Actually, LazyDoc is just a modified Hash object which lazily adds fields
2928
+ * to itself when they are accessed. You should note that the keys method
2929
+ * will return nothing until you actually access one of the fields. To see
2930
+ * what fields are available use LazyDoc#fields rather than LazyDoc#keys. To
2931
+ * load all fields use the LazyDoc#load method.
2932
+ *
2933
+ * == Example
2934
+ *
2935
+ * doc = index_reader[0]
2936
+ *
2937
+ * doc.keys #=> []
2938
+ * doc.values #=> []
2939
+ * doc.fields #=> [:title, :content]
2940
+ *
2941
+ * title = doc[:title] #=> "the title"
2942
+ * doc.keys #=> [:title]
2943
+ * doc.values #=> ["the title"]
2944
+ * doc.fields #=> [:title, :content]
2945
+ *
2946
+ * doc.load
2947
+ * doc.keys #=> [:title, :content]
2948
+ * doc.values #=> ["the title", "the content"]
2949
+ * doc.fields #=> [:title, :content]
2950
+ */
2951
+ void
2952
+ Init_LazyDoc(void)
2953
+ { /* Defines LazyDoc (a subclass of Hash) under mIndex, plus the nested LazyDocData class. */
2954
+ id_fields = rb_intern("@fields");
2955
+
2956
+
2957
+ cLazyDoc = rb_define_class_under(mIndex, "LazyDoc", rb_cHash);
2958
+ rb_define_method(cLazyDoc, "default", frt_lzd_default, 1); /* presumably the hook that loads a missing field on access; confirm in frt_lzd_default */
2959
+ rb_define_method(cLazyDoc, "load", frt_lzd_load, 0);
2960
+ rb_define_method(cLazyDoc, "fields", frt_lzd_fields, 0);
2961
+
2962
+ cLazyDocData = rb_define_class_under(cLazyDoc, "LazyDocData", rb_cObject); /* nested under cLazyDoc, not mIndex */
2963
+ rb_define_alloc_func(cLazyDocData, frt_data_alloc);
2964
+ }
2965
+
2966
+ /*
2967
+ * Document-class: Ferret::Index::IndexReader
2968
+ *
2969
+ * == Summary
2970
+ *
2971
+ * IndexReader is used for reading data from the index. This class is usually
2972
+ * used directly for more advanced tasks like iterating through terms in an
2973
+ * index, accessing term-vectors or deleting documents by document id. It is
2974
+ * also used internally by IndexSearcher.
2975
+ */
2976
+ void
2977
+ Init_IndexReader(void)
2978
+ { /* Defines the IndexReader class under mIndex and binds its instance methods to C functions. */
2979
+ cIndexReader = rb_define_class_under(mIndex, "IndexReader", rb_cObject);
2980
+ rb_define_alloc_func(cIndexReader, frt_data_alloc);
2981
+ rb_define_method(cIndexReader, "initialize", frt_ir_init, 1); /* final argument of rb_define_method is the Ruby-level argument count */
2982
+ rb_define_method(cIndexReader, "set_norm", frt_ir_set_norm, 3);
2983
+ rb_define_method(cIndexReader, "norms", frt_ir_norms, 1);
2984
+ rb_define_method(cIndexReader, "get_norms_into",frt_ir_get_norms_into, 3);
2985
+ rb_define_method(cIndexReader, "commit", frt_ir_commit, 0);
2986
+ rb_define_method(cIndexReader, "close", frt_ir_close, 0);
2987
+ rb_define_method(cIndexReader, "has_deletions?",frt_ir_has_deletions, 0);
2988
+ rb_define_method(cIndexReader, "delete", frt_ir_delete, 1);
2989
+ rb_define_method(cIndexReader, "deleted?", frt_ir_is_deleted, 1);
2990
+ rb_define_method(cIndexReader, "max_doc", frt_ir_max_doc, 0);
2991
+ rb_define_method(cIndexReader, "num_docs", frt_ir_num_docs, 0);
2992
+ rb_define_method(cIndexReader, "undelete_all", frt_ir_undelete_all, 0);
2993
+ rb_define_method(cIndexReader, "latest?", frt_ir_is_latest, 0);
2994
+ rb_define_method(cIndexReader, "get_document", frt_ir_get_doc, 1);
2995
+ rb_define_method(cIndexReader, "[]", frt_ir_get_doc, 1); /* bound to the same C function as get_document */
2996
+ rb_define_method(cIndexReader, "term_vector", frt_ir_term_vector, 2);
2997
+ rb_define_method(cIndexReader, "term_vectors", frt_ir_term_vectors, 1);
2998
+ rb_define_method(cIndexReader, "term_docs", frt_ir_term_docs, 0);
2999
+ rb_define_method(cIndexReader, "term_positions",frt_ir_term_positions, 0);
3000
+ rb_define_method(cIndexReader, "term_docs_for", frt_ir_term_docs_for, 2);
3001
+ rb_define_method(cIndexReader, "term_positions_for", frt_ir_t_pos_for, 2);
3002
+ rb_define_method(cIndexReader, "doc_freq", frt_ir_doc_freq, 2);
3003
+ rb_define_method(cIndexReader, "terms", frt_ir_terms, 1);
3004
+ rb_define_method(cIndexReader, "terms_from", frt_ir_terms_from, 2);
3005
+ rb_define_method(cIndexReader, "field_names", frt_ir_field_names, 0);
3006
+ rb_define_method(cIndexReader, "field_infos", frt_ir_field_infos, 0);
3007
+ }
3008
+
3009
+ /* rdoc hack
3010
+ extern VALUE mFerret = rb_define_module("Ferret");
3011
+ */
3012
+
3013
+ /*
3014
+ * Document-module: Ferret::Index
3015
+ *
3016
+ * == Summary
3017
+ *
3018
+ * The Index module contains all the classes used for adding to and
3019
+ * retrieving from the index. The important classes to know about are;
3020
+ *
3021
+ * * FieldInfo
3022
+ * * FieldInfos
3023
+ * * IndexWriter
3024
+ * * IndexReader
3025
+ * * LazyDoc
3026
+ *
3027
+ * The other classes in this module are useful for more advanced uses like
3028
+ * building tag clouds, creating more-like-this queries, custom highlighting
3029
+ * etc. They are also useful for index browsers.
3030
+ */
3031
+ void
3032
+ Init_Index(void)
3033
+ { /* Defines the Index module under mFerret, interns shared symbols, then runs the per-class Init_* functions. */
3034
+ mIndex = rb_define_module_under(mFerret, "Index"); /* assigned first: the Init_* functions called below (e.g. Init_TermVector, Init_IndexWriter) read mIndex */
3035
+
3036
+ sym_boost = ID2SYM(rb_intern("boost"));
3037
+ sym_analyzer = ID2SYM(rb_intern("analyzer"));
3038
+ sym_close_dir = ID2SYM(rb_intern("close_dir"));
3039
+
3040
+ Init_TermVector(); /* also initializes TVOffsets and TVTerm */
3041
+ Init_TermEnum();
3042
+ Init_TermDocEnum();
3043
+
3044
+ Init_FieldInfos();
3045
+
3046
+ Init_LazyDoc();
3047
+ Init_IndexWriter();
3048
+ Init_IndexReader();
3049
+ }