ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/ext/r_index.c
ADDED
@@ -0,0 +1,3049 @@
|
|
1
|
+
#include "ferret.h"
|
2
|
+
#include "index.h"
|
3
|
+
#include <st.h>
|
4
|
+
|
5
|
+
VALUE mIndex;
|
6
|
+
|
7
|
+
VALUE cFieldInfo;
|
8
|
+
VALUE cFieldInfos;
|
9
|
+
|
10
|
+
VALUE cTVOffsets;
|
11
|
+
VALUE cTVTerm;
|
12
|
+
VALUE cTermVector;
|
13
|
+
|
14
|
+
VALUE cTermEnum;
|
15
|
+
VALUE cTermDocEnum;
|
16
|
+
|
17
|
+
VALUE cLazyDoc;
|
18
|
+
VALUE cLazyDocData;
|
19
|
+
VALUE cIndexWriter;
|
20
|
+
VALUE cIndexReader;
|
21
|
+
|
22
|
+
VALUE sym_analyzer;
|
23
|
+
static VALUE sym_close_dir;
|
24
|
+
static VALUE sym_create;
|
25
|
+
static VALUE sym_create_if_missing;
|
26
|
+
|
27
|
+
static VALUE sym_chunk_size;
|
28
|
+
static VALUE sym_max_buffer_memory;
|
29
|
+
static VALUE sym_index_interval;
|
30
|
+
static VALUE sym_skip_interval;
|
31
|
+
static VALUE sym_merge_factor;
|
32
|
+
static VALUE sym_max_buffered_docs;
|
33
|
+
static VALUE sym_max_merge_docs;
|
34
|
+
static VALUE sym_max_field_length;
|
35
|
+
static VALUE sym_use_compound_file;
|
36
|
+
|
37
|
+
static VALUE sym_boost;
|
38
|
+
static VALUE sym_field_infos;
|
39
|
+
|
40
|
+
static VALUE sym_store;
|
41
|
+
static VALUE sym_index;
|
42
|
+
static VALUE sym_term_vector;
|
43
|
+
|
44
|
+
static VALUE sym_compress;
|
45
|
+
static VALUE sym_compressed;
|
46
|
+
|
47
|
+
static VALUE sym_untokenized;
|
48
|
+
static VALUE sym_omit_norms;
|
49
|
+
static VALUE sym_untokenized_omit_norms;
|
50
|
+
|
51
|
+
static VALUE sym_with_positions;
|
52
|
+
static VALUE sym_with_offsets;
|
53
|
+
static VALUE sym_with_positions_offsets;
|
54
|
+
|
55
|
+
static ID id_term;
|
56
|
+
static ID id_fields;
|
57
|
+
static ID id_fld_num_map;
|
58
|
+
static ID id_field_num;
|
59
|
+
static ID id_boost;
|
60
|
+
|
61
|
+
extern void frt_set_term(VALUE rterm, Term *t);
|
62
|
+
extern Analyzer *frt_get_cwrapped_analyzer(VALUE ranalyzer);
|
63
|
+
extern VALUE frt_get_analyzer(Analyzer *a);
|
64
|
+
|
65
|
+
/****************************************************************************
|
66
|
+
*
|
67
|
+
* FieldInfo Methods
|
68
|
+
*
|
69
|
+
****************************************************************************/
|
70
|
+
|
71
|
+
static void
|
72
|
+
frt_fi_free(void *p)
|
73
|
+
{
|
74
|
+
object_del(p);
|
75
|
+
fi_deref((FieldInfo *)p);
|
76
|
+
}
|
77
|
+
|
78
|
+
static void
|
79
|
+
frt_fi_get_params(VALUE roptions,
|
80
|
+
enum StoreValues *store,
|
81
|
+
enum IndexValues *index,
|
82
|
+
enum TermVectorValues *term_vector,
|
83
|
+
float *boost)
|
84
|
+
{
|
85
|
+
VALUE v;
|
86
|
+
Check_Type(roptions, T_HASH);
|
87
|
+
v = rb_hash_aref(roptions, sym_boost);
|
88
|
+
if (Qnil != v) {
|
89
|
+
*boost = (float)NUM2DBL(v);
|
90
|
+
} else {
|
91
|
+
*boost = 1.0f;
|
92
|
+
}
|
93
|
+
v = rb_hash_aref(roptions, sym_store);
|
94
|
+
if (Qnil != v) Check_Type(v, T_SYMBOL);
|
95
|
+
if (v == sym_no || v == sym_false || v == Qfalse) {
|
96
|
+
*store = STORE_NO;
|
97
|
+
} else if (v == sym_yes || v == sym_true || v == Qtrue || v == Qnil) {
|
98
|
+
*store = STORE_YES;
|
99
|
+
} else if (v == sym_compress || v == sym_compressed) {
|
100
|
+
*store = STORE_COMPRESS;
|
101
|
+
} else {
|
102
|
+
rb_raise(rb_eArgError, ":%s isn't a valid argument for :store."
|
103
|
+
" Please choose from [:yes, :no, :compressed]",
|
104
|
+
rb_id2name(SYM2ID(v)));
|
105
|
+
}
|
106
|
+
|
107
|
+
v = rb_hash_aref(roptions, sym_index);
|
108
|
+
if (Qnil != v) Check_Type(v, T_SYMBOL);
|
109
|
+
if (v == sym_no || v == sym_false || v == Qfalse) {
|
110
|
+
*index = INDEX_NO;
|
111
|
+
} else if (v == sym_yes || v == sym_true || v == Qtrue || v == Qnil) {
|
112
|
+
*index = INDEX_YES;
|
113
|
+
} else if (v == sym_untokenized) {
|
114
|
+
*index = INDEX_UNTOKENIZED;
|
115
|
+
} else if (v == sym_omit_norms) {
|
116
|
+
*index = INDEX_YES_OMIT_NORMS;
|
117
|
+
} else if (v == sym_untokenized_omit_norms) {
|
118
|
+
*index = INDEX_UNTOKENIZED_OMIT_NORMS;
|
119
|
+
} else {
|
120
|
+
rb_raise(rb_eArgError, ":%s isn't a valid argument for :index."
|
121
|
+
" Please choose from [:no, :yes, :untokenized, "
|
122
|
+
":omit_norms, :untokenized_omit_norms]",
|
123
|
+
rb_id2name(SYM2ID(v)));
|
124
|
+
}
|
125
|
+
|
126
|
+
v = rb_hash_aref(roptions, sym_term_vector);
|
127
|
+
if (Qnil != v) Check_Type(v, T_SYMBOL);
|
128
|
+
if (v == sym_no || v == sym_false || v == Qfalse) {
|
129
|
+
*term_vector = TERM_VECTOR_NO;
|
130
|
+
} else if (v == sym_yes || v == sym_true || v == Qtrue) {
|
131
|
+
*term_vector = TERM_VECTOR_YES;
|
132
|
+
} else if (v == sym_with_positions) {
|
133
|
+
*term_vector = TERM_VECTOR_WITH_POSITIONS;
|
134
|
+
} else if (v == sym_with_offsets) {
|
135
|
+
*term_vector = TERM_VECTOR_WITH_OFFSETS;
|
136
|
+
} else if (v == sym_with_positions_offsets || v == Qnil) {
|
137
|
+
*term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
|
138
|
+
} else {
|
139
|
+
rb_raise(rb_eArgError, ":%s isn't a valid argument for "
|
140
|
+
":term_vector. Please choose from [:no, :yes, "
|
141
|
+
":with_positions, :with_offsets, "
|
142
|
+
":with_positions_offsets]",
|
143
|
+
rb_id2name(SYM2ID(v)));
|
144
|
+
}
|
145
|
+
}
|
146
|
+
|
147
|
+
static VALUE
|
148
|
+
frt_get_field_info(FieldInfo *fi)
|
149
|
+
{
|
150
|
+
|
151
|
+
VALUE rfi = Qnil;
|
152
|
+
if (fi) {
|
153
|
+
rfi = object_get(fi);
|
154
|
+
if (rfi == Qnil) {
|
155
|
+
rfi = Data_Wrap_Struct(cFieldInfo, NULL, &frt_fi_free, fi);
|
156
|
+
REF(fi);
|
157
|
+
object_add(fi, rfi);
|
158
|
+
}
|
159
|
+
}
|
160
|
+
return rfi;
|
161
|
+
}
|
162
|
+
|
163
|
+
/*
|
164
|
+
* call-seq:
|
165
|
+
* FieldInfo.new(name, options = {}) -> field_info
|
166
|
+
*
|
167
|
+
* Create a new FieldInfo object with the name +name+ and the properties
|
168
|
+
* specified in +options+. The available options are [:store, :index,
|
169
|
+
* :term_vector, :boost]. See the description of FieldInfo for more
|
170
|
+
* information on these properties.
|
171
|
+
*/
|
172
|
+
static VALUE
|
173
|
+
frt_fi_init(int argc, VALUE *argv, VALUE self)
|
174
|
+
{
|
175
|
+
VALUE roptions, rname;
|
176
|
+
FieldInfo *fi;
|
177
|
+
enum StoreValues store = STORE_YES;
|
178
|
+
enum IndexValues index = INDEX_YES;
|
179
|
+
enum TermVectorValues term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
|
180
|
+
float boost = 1.0f;
|
181
|
+
|
182
|
+
rb_scan_args(argc, argv, "11", &rname, &roptions);
|
183
|
+
if (argc > 1) {
|
184
|
+
frt_fi_get_params(roptions, &store, &index, &term_vector, &boost);
|
185
|
+
}
|
186
|
+
fi = fi_new(frt_field(rname), store, index, term_vector);
|
187
|
+
fi->boost = boost;
|
188
|
+
Frt_Wrap_Struct(self, NULL, &frt_fi_free, fi);
|
189
|
+
object_add(fi, self);
|
190
|
+
return self;
|
191
|
+
}
|
192
|
+
|
193
|
+
/*
|
194
|
+
* call-seq:
|
195
|
+
* fi.stored? -> bool
|
196
|
+
*
|
197
|
+
* Return true if the field is stored in the index.
|
198
|
+
*/
|
199
|
+
static VALUE
|
200
|
+
frt_fi_is_stored(VALUE self)
|
201
|
+
{
|
202
|
+
FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
|
203
|
+
return fi_is_stored(fi) ? Qtrue : Qfalse;
|
204
|
+
}
|
205
|
+
|
206
|
+
/*
|
207
|
+
* call-seq:
|
208
|
+
* fi.compressed? -> bool
|
209
|
+
*
|
210
|
+
* Return true if the field is stored in the index in compressed format.
|
211
|
+
*/
|
212
|
+
static VALUE
|
213
|
+
frt_fi_is_compressed(VALUE self)
|
214
|
+
{
|
215
|
+
FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
|
216
|
+
return fi_is_compressed(fi) ? Qtrue : Qfalse;
|
217
|
+
}
|
218
|
+
|
219
|
+
/*
|
220
|
+
* call-seq:
|
221
|
+
* fi.indexed? -> bool
|
222
|
+
*
|
223
|
+
* Return true if the field is indexed, ie searchable in the index.
|
224
|
+
*/
|
225
|
+
static VALUE
|
226
|
+
frt_fi_is_indexed(VALUE self)
|
227
|
+
{
|
228
|
+
FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
|
229
|
+
return fi_is_indexed(fi) ? Qtrue : Qfalse;
|
230
|
+
}
|
231
|
+
|
232
|
+
/*
|
233
|
+
* call-seq:
|
234
|
+
* fi.tokenized? -> bool
|
235
|
+
*
|
236
|
+
* Return true if the field is tokenized. Tokenizing is the process of
|
237
|
+
* breaking the field up into tokens. That is "the quick brown fox" becomes
|
238
|
+
* ["the", "quick", "brown", "fox"]. This is only possible if the field is
|
239
|
+
* indexed.
|
240
|
+
*/
|
241
|
+
static VALUE
|
242
|
+
frt_fi_is_tokenized(VALUE self)
|
243
|
+
{
|
244
|
+
FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
|
245
|
+
return fi_is_tokenized(fi) ? Qtrue : Qfalse;
|
246
|
+
}
|
247
|
+
|
248
|
+
/*
|
249
|
+
* call-seq:
|
250
|
+
* fi.omit_norms? -> bool
|
251
|
+
*
|
252
|
+
* Return true if the field omits the norm file. The norm file is the file
|
253
|
+
* used to store the field boosts for an indexed field. If you do not boost
|
254
|
+
* any fields, and you can live without scoring based on field length then
|
255
|
+
* you can omit the norms file. This will give the index a slight performance
|
256
|
+
* boost and it will use less memory, especially for indexes which have a
|
257
|
+
* large number of documents.
|
258
|
+
*/
|
259
|
+
static VALUE
|
260
|
+
frt_fi_omit_norms(VALUE self)
|
261
|
+
{
|
262
|
+
FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
|
263
|
+
return fi_omit_norms(fi) ? Qtrue : Qfalse;
|
264
|
+
}
|
265
|
+
|
266
|
+
/*
|
267
|
+
* call-seq:
|
268
|
+
* fi.store_term_vector? -> bool
|
269
|
+
*
|
270
|
+
* Return true if the term-vectors are stored for this field.
|
271
|
+
*/
|
272
|
+
static VALUE
|
273
|
+
frt_fi_store_term_vector(VALUE self)
|
274
|
+
{
|
275
|
+
FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
|
276
|
+
return fi_store_term_vector(fi) ? Qtrue : Qfalse;
|
277
|
+
}
|
278
|
+
|
279
|
+
/*
|
280
|
+
* call-seq:
|
281
|
+
* fi.store_positions? -> bool
|
282
|
+
*
|
283
|
+
* Return true if positions are stored with the term-vectors for this field.
|
284
|
+
*/
|
285
|
+
static VALUE
|
286
|
+
frt_fi_store_positions(VALUE self)
|
287
|
+
{
|
288
|
+
FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
|
289
|
+
return fi_store_positions(fi) ? Qtrue : Qfalse;
|
290
|
+
}
|
291
|
+
|
292
|
+
/*
|
293
|
+
* call-seq:
|
294
|
+
* fi.store_offsets? -> bool
|
295
|
+
*
|
296
|
+
* Return true if offsets are stored with the term-vectors for this field.
|
297
|
+
*/
|
298
|
+
static VALUE
|
299
|
+
frt_fi_store_offsets(VALUE self)
|
300
|
+
{
|
301
|
+
FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
|
302
|
+
return fi_store_offsets(fi) ? Qtrue : Qfalse;
|
303
|
+
}
|
304
|
+
|
305
|
+
/*
|
306
|
+
* call-seq:
|
307
|
+
* fi.has_norms? -> bool
|
308
|
+
*
|
309
|
+
* Return true if this field has a norms file. This is the same as calling;
|
310
|
+
*
|
311
|
+
* fi.indexed? and not fi.omit_norms?
|
312
|
+
*/
|
313
|
+
static VALUE
|
314
|
+
frt_fi_has_norms(VALUE self)
|
315
|
+
{
|
316
|
+
FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
|
317
|
+
return fi_has_norms(fi) ? Qtrue : Qfalse;
|
318
|
+
}
|
319
|
+
|
320
|
+
/*
|
321
|
+
* call-seq:
|
322
|
+
* fi.boost -> boost
|
323
|
+
*
|
324
|
+
* Return the default boost for this field
|
325
|
+
*/
|
326
|
+
static VALUE
|
327
|
+
frt_fi_boost(VALUE self)
|
328
|
+
{
|
329
|
+
FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
|
330
|
+
return rb_float_new((double)fi->boost);
|
331
|
+
}
|
332
|
+
|
333
|
+
/*
|
334
|
+
* call-seq:
|
335
|
+
* fi.to_s -> string
|
336
|
+
*
|
337
|
+
* Return a string representation of the FieldInfo object.
|
338
|
+
*/
|
339
|
+
static VALUE
|
340
|
+
frt_fi_to_s(VALUE self)
|
341
|
+
{
|
342
|
+
FieldInfo *fi = (FieldInfo *)DATA_PTR(self);
|
343
|
+
char *fi_s = fi_to_s(fi);
|
344
|
+
VALUE rfi_s = rb_str_new2(fi_s);
|
345
|
+
free(fi_s);
|
346
|
+
return rfi_s;
|
347
|
+
}
|
348
|
+
|
349
|
+
/****************************************************************************
|
350
|
+
*
|
351
|
+
* FieldInfos Methods
|
352
|
+
*
|
353
|
+
****************************************************************************/
|
354
|
+
|
355
|
+
static void
|
356
|
+
frt_fis_free(void *p)
|
357
|
+
{
|
358
|
+
object_del(p);
|
359
|
+
fis_deref((FieldInfos *)p);
|
360
|
+
}
|
361
|
+
|
362
|
+
static void
|
363
|
+
frt_fis_mark(void *p)
|
364
|
+
{
|
365
|
+
int i;
|
366
|
+
FieldInfos *fis = (FieldInfos *)p;
|
367
|
+
|
368
|
+
for (i = 0; i < fis->size; i++) {
|
369
|
+
frt_gc_mark(fis->fields[i]);
|
370
|
+
}
|
371
|
+
}
|
372
|
+
|
373
|
+
static VALUE
|
374
|
+
frt_get_field_infos(FieldInfos *fis)
|
375
|
+
{
|
376
|
+
|
377
|
+
VALUE rfis = Qnil;
|
378
|
+
if (fis) {
|
379
|
+
rfis = object_get(fis);
|
380
|
+
if (rfis == Qnil) {
|
381
|
+
rfis = Data_Wrap_Struct(cFieldInfos, &frt_fis_mark, &frt_fis_free,
|
382
|
+
fis);
|
383
|
+
REF(fis);
|
384
|
+
object_add(fis, rfis);
|
385
|
+
}
|
386
|
+
}
|
387
|
+
return rfis;
|
388
|
+
}
|
389
|
+
|
390
|
+
/*
|
391
|
+
* call-seq:
|
392
|
+
* FieldInfos.new(defaults = {}) -> field_infos
|
393
|
+
*
|
394
|
+
* Create a new FieldInfos object which uses the default values for fields
|
395
|
+
* specified in the +defaults+ hash parameter. See FieldInfo for available
|
396
|
+
* property values.
|
397
|
+
*/
|
398
|
+
static VALUE
|
399
|
+
frt_fis_init(int argc, VALUE *argv, VALUE self)
|
400
|
+
{
|
401
|
+
VALUE roptions;
|
402
|
+
FieldInfos *fis;
|
403
|
+
enum StoreValues store = STORE_YES;
|
404
|
+
enum IndexValues index = INDEX_YES;
|
405
|
+
enum TermVectorValues term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
|
406
|
+
float boost;
|
407
|
+
|
408
|
+
rb_scan_args(argc, argv, "01", &roptions);
|
409
|
+
if (argc > 0) {
|
410
|
+
frt_fi_get_params(roptions, &store, &index, &term_vector, &boost);
|
411
|
+
}
|
412
|
+
fis = fis_new(store, index, term_vector);
|
413
|
+
Frt_Wrap_Struct(self, &frt_fis_mark, &frt_fis_free, fis);
|
414
|
+
object_add(fis, self);
|
415
|
+
return self;
|
416
|
+
}
|
417
|
+
|
418
|
+
/*
|
419
|
+
* call-seq:
|
420
|
+
* fis.to_a -> array
|
421
|
+
*
|
422
|
+
* Return an array of the FieldInfo objects contained by this FieldInfos
|
423
|
+
* object.
|
424
|
+
*/
|
425
|
+
static VALUE
|
426
|
+
frt_fis_to_a(VALUE self)
|
427
|
+
{
|
428
|
+
FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
|
429
|
+
VALUE rary = rb_ary_new();
|
430
|
+
int i;
|
431
|
+
|
432
|
+
for (i = 0; i < fis->size; i++) {
|
433
|
+
rb_ary_push(rary, frt_get_field_info(fis->fields[i]));
|
434
|
+
}
|
435
|
+
return rary;
|
436
|
+
}
|
437
|
+
|
438
|
+
/*
|
439
|
+
* call-seq:
|
440
|
+
* fis[name] -> field_info
|
441
|
+
* fis[number] -> field_info
|
442
|
+
*
|
443
|
+
* Get the FieldInfo object. FieldInfo objects can be referenced by either
|
444
|
+
* their field-number or the field-name (which must be a symbol). For
|
445
|
+
* example;
|
446
|
+
*
|
447
|
+
* fi = fis[:name]
|
448
|
+
* fi = fis[2]
|
449
|
+
*/
|
450
|
+
static VALUE
|
451
|
+
frt_fis_get(VALUE self, VALUE ridx)
|
452
|
+
{
|
453
|
+
FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
|
454
|
+
VALUE rfi = Qnil;
|
455
|
+
switch (TYPE(ridx)) {
|
456
|
+
case T_FIXNUM: {
|
457
|
+
int index = FIX2INT(ridx);
|
458
|
+
if (index < 0) index += fis->size;
|
459
|
+
if (index < 0 || index >= fis->size) {
|
460
|
+
rb_raise(rb_eArgError, "index of %d is out of range (0..%d)\n",
|
461
|
+
index, fis->size);
|
462
|
+
}
|
463
|
+
rfi = frt_get_field_info(fis->fields[index]);
|
464
|
+
break;
|
465
|
+
}
|
466
|
+
case T_SYMBOL:
|
467
|
+
rfi = frt_get_field_info(fis_get_field(fis, frt_field(ridx)));
|
468
|
+
break;
|
469
|
+
case T_STRING:
|
470
|
+
rfi = frt_get_field_info(fis_get_field(fis, StringValuePtr(ridx)));
|
471
|
+
break;
|
472
|
+
default:
|
473
|
+
rb_raise(rb_eArgError, "Can't index FieldInfos with %s",
|
474
|
+
RSTRING(rb_obj_as_string(ridx))->ptr);
|
475
|
+
break;
|
476
|
+
}
|
477
|
+
return rfi;
|
478
|
+
}
|
479
|
+
|
480
|
+
/*
|
481
|
+
* call-seq:
|
482
|
+
* fis << fi -> fis
|
483
|
+
* fis.add(fi) -> fis
|
484
|
+
*
|
485
|
+
* Add a FieldInfo object. Use the FieldInfos#add_field method where
|
486
|
+
* possible.
|
487
|
+
*/
|
488
|
+
static VALUE
|
489
|
+
frt_fis_add(VALUE self, VALUE rfi)
|
490
|
+
{
|
491
|
+
FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
|
492
|
+
FieldInfo *fi = (FieldInfo *)frt_rb_data_ptr(rfi);
|
493
|
+
fis_add_field(fis, fi);
|
494
|
+
REF(fi);
|
495
|
+
return self;
|
496
|
+
}
|
497
|
+
|
498
|
+
/*
|
499
|
+
* call-seq:
|
500
|
+
* fis.add_field(name, properties = {}) -> fis
|
501
|
+
*
|
502
|
+
* Add a new field to the FieldInfos object. See FieldInfo for a description
|
503
|
+
* of the available properties.
|
504
|
+
*/
|
505
|
+
static VALUE
|
506
|
+
frt_fis_add_field(int argc, VALUE *argv, VALUE self)
|
507
|
+
{
|
508
|
+
FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
|
509
|
+
FieldInfo *fi;
|
510
|
+
enum StoreValues store = STORE_YES;
|
511
|
+
enum IndexValues index = INDEX_YES;
|
512
|
+
enum TermVectorValues term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
|
513
|
+
float boost = 1.0f;
|
514
|
+
VALUE rname, roptions;
|
515
|
+
|
516
|
+
rb_scan_args(argc, argv, "11", &rname, &roptions);
|
517
|
+
if (argc > 1) {
|
518
|
+
frt_fi_get_params(roptions, &store, &index, &term_vector, &boost);
|
519
|
+
}
|
520
|
+
fi = fi_new(frt_field(rname), store, index, term_vector);
|
521
|
+
fi->boost = boost;
|
522
|
+
fis_add_field(fis, fi);
|
523
|
+
return self;
|
524
|
+
}
|
525
|
+
|
526
|
+
/*
|
527
|
+
* call-seq:
|
528
|
+
* fis.each {|fi| do_something } -> fis
|
529
|
+
*
|
530
|
+
* Iterate through the FieldInfo objects.
|
531
|
+
*/
|
532
|
+
static VALUE
|
533
|
+
frt_fis_each(VALUE self)
|
534
|
+
{
|
535
|
+
int i;
|
536
|
+
FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
|
537
|
+
|
538
|
+
for (i = 0; i < fis->size; i++) {
|
539
|
+
rb_yield(frt_get_field_info(fis->fields[i]));
|
540
|
+
}
|
541
|
+
return self;
|
542
|
+
}
|
543
|
+
|
544
|
+
/*
|
545
|
+
* call-seq:
|
546
|
+
* fis.to_s -> string
|
547
|
+
*
|
548
|
+
* Return a string representation of the FieldInfos object.
|
549
|
+
*/
|
550
|
+
static VALUE
|
551
|
+
frt_fis_to_s(VALUE self)
|
552
|
+
{
|
553
|
+
FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
|
554
|
+
char *fis_s = fis_to_s(fis);
|
555
|
+
VALUE rfis_s = rb_str_new2(fis_s);
|
556
|
+
free(fis_s);
|
557
|
+
return rfis_s;
|
558
|
+
}
|
559
|
+
|
560
|
+
/*
|
561
|
+
* call-seq:
|
562
|
+
* fis.create_index(dir) -> self
|
563
|
+
*
|
564
|
+
* Create a new index in the directory specified. The directory +dir+ can
|
565
|
+
* either be a string path representing a directory on the file-system or an
|
566
|
+
* actual directory object. Care should be taken when using this method. Any
|
567
|
+
* existing index (or other files for that matter) will be deleted from the
|
568
|
+
* directory and overwritten by the new index.
|
569
|
+
*/
|
570
|
+
static VALUE
|
571
|
+
frt_fis_create_index(VALUE self, VALUE rdir)
|
572
|
+
{
|
573
|
+
FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
|
574
|
+
Store *store = NULL;
|
575
|
+
if (TYPE(rdir) == T_DATA) {
|
576
|
+
store = DATA_PTR(rdir);
|
577
|
+
REF(store);
|
578
|
+
} else {
|
579
|
+
StringValue(rdir);
|
580
|
+
frt_create_dir(rdir);
|
581
|
+
store = open_fs_store(RSTRING(rdir)->ptr);
|
582
|
+
}
|
583
|
+
index_create(store, fis);
|
584
|
+
store_deref(store);
|
585
|
+
return self;
|
586
|
+
}
|
587
|
+
|
588
|
+
/*
|
589
|
+
* call-seq:
|
590
|
+
* fis.fields -> symbol array
|
591
|
+
*
|
592
|
+
* Return a list of the field names (as symbols) in the index.
|
593
|
+
*/
|
594
|
+
static VALUE
|
595
|
+
frt_fis_get_fields(VALUE self)
|
596
|
+
{
|
597
|
+
FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
|
598
|
+
VALUE rfield_names = rb_ary_new();
|
599
|
+
int i;
|
600
|
+
for (i = 0; i < fis->size; i++) {
|
601
|
+
rb_ary_push(rfield_names, ID2SYM(rb_intern(fis->fields[i]->name)));
|
602
|
+
}
|
603
|
+
return rfield_names;
|
604
|
+
}
|
605
|
+
|
606
|
+
/****************************************************************************
|
607
|
+
*
|
608
|
+
* TermEnum Methods
|
609
|
+
*
|
610
|
+
****************************************************************************/
|
611
|
+
|
612
|
+
static void
|
613
|
+
frt_te_free(void *p)
|
614
|
+
{
|
615
|
+
TermEnum *te = (TermEnum *)p;
|
616
|
+
te->close(te);
|
617
|
+
}
|
618
|
+
|
619
|
+
static VALUE
|
620
|
+
frt_te_get_set_term(VALUE self, const char *term)
|
621
|
+
{
|
622
|
+
TermEnum *te = (TermEnum *)DATA_PTR(self);
|
623
|
+
VALUE str = term ? rb_str_new(term, te->curr_term_len) : Qnil;
|
624
|
+
rb_ivar_set(self, id_term, str);
|
625
|
+
return str;
|
626
|
+
}
|
627
|
+
|
628
|
+
static VALUE
|
629
|
+
frt_get_te(VALUE rir, TermEnum *te)
|
630
|
+
{
|
631
|
+
VALUE self = Qnil;
|
632
|
+
if (te != NULL) {
|
633
|
+
self = Data_Wrap_Struct(cTermEnum, NULL, &frt_te_free, te);
|
634
|
+
frt_te_get_set_term(self, te->curr_term);
|
635
|
+
rb_ivar_set(self, id_fld_num_map, rb_ivar_get(rir, id_fld_num_map));
|
636
|
+
}
|
637
|
+
return self;
|
638
|
+
}
|
639
|
+
|
640
|
+
/*
|
641
|
+
* call-seq:
|
642
|
+
* term_enum.next -> term_string
|
643
|
+
*
|
644
|
+
* Returns the next term in the enumeration or nil otherwise.
|
645
|
+
*/
|
646
|
+
static VALUE
|
647
|
+
frt_te_next(VALUE self)
|
648
|
+
{
|
649
|
+
TermEnum *te = (TermEnum *)DATA_PTR(self);
|
650
|
+
return frt_te_get_set_term(self, te->next(te));
|
651
|
+
}
|
652
|
+
|
653
|
+
/*
|
654
|
+
* call-seq:
|
655
|
+
* term_enum.term -> term_string
|
656
|
+
*
|
657
|
+
* Returns the current term pointed to by the enum. This method should only
|
658
|
+
* be called after a successful call to TermEnum#next.
|
659
|
+
*/
|
660
|
+
static VALUE
|
661
|
+
frt_te_term(VALUE self)
|
662
|
+
{
|
663
|
+
return rb_ivar_get(self, id_term);
|
664
|
+
}
|
665
|
+
|
666
|
+
/*
|
667
|
+
* call-seq:
|
668
|
+
* term_enum.doc_freq -> integer
|
669
|
+
*
|
670
|
+
* Returns the document frequency of the current term pointed to by the enum.
|
671
|
+
* That is the number of documents that this term appears in. The method
|
672
|
+
* should only be called after a successful call to TermEnum#next.
|
673
|
+
*/
|
674
|
+
static VALUE
|
675
|
+
frt_te_doc_freq(VALUE self)
|
676
|
+
{
|
677
|
+
TermEnum *te = (TermEnum *)DATA_PTR(self);
|
678
|
+
return INT2FIX(te->curr_ti.doc_freq);
|
679
|
+
}
|
680
|
+
|
681
|
+
/*
|
682
|
+
* call-seq:
|
683
|
+
* term_enum.skip_to(target) -> term
|
684
|
+
*
|
685
|
+
* Skip to term +target+. This method can skip forwards or backwards. If you
|
686
|
+
* want to skip back to the start, pass the empty string "". That is;
|
687
|
+
*
|
688
|
+
* term_enum.skip_to("")
|
689
|
+
*
|
690
|
+
* Returns the first term greater than or equal to +target+
|
691
|
+
*/
|
692
|
+
static VALUE
|
693
|
+
frt_te_skip_to(VALUE self, VALUE rterm)
|
694
|
+
{
|
695
|
+
TermEnum *te = (TermEnum *)DATA_PTR(self);
|
696
|
+
return frt_te_get_set_term(self, te->skip_to(te, frt_field(rterm)));
|
697
|
+
}
|
698
|
+
|
699
|
+
/*
|
700
|
+
* call-seq:
|
701
|
+
* term_enum.each {|term, doc_freq| do_something() } -> term_count
|
702
|
+
*
|
703
|
+
* Iterates through all the terms in the field, yielding the term and the
|
704
|
+
* document frequency.
|
705
|
+
*/
|
706
|
+
static VALUE
|
707
|
+
frt_te_each(VALUE self)
|
708
|
+
{
|
709
|
+
TermEnum *te = (TermEnum *)DATA_PTR(self);
|
710
|
+
char *term;
|
711
|
+
int term_cnt = 0;
|
712
|
+
VALUE vals = rb_ary_new2(2);
|
713
|
+
RARRAY(vals)->len = 2;
|
714
|
+
|
715
|
+
|
716
|
+
/* each is being called so there will be no current term */
|
717
|
+
rb_ivar_set(self, id_term, Qnil);
|
718
|
+
|
719
|
+
|
720
|
+
while (NULL != (term = te->next(te))) {
|
721
|
+
term_cnt++;
|
722
|
+
RARRAY(vals)->ptr[0] = rb_str_new(term, te->curr_term_len);
|
723
|
+
RARRAY(vals)->ptr[1] = INT2FIX(te->curr_ti.doc_freq);
|
724
|
+
rb_yield(vals);
|
725
|
+
}
|
726
|
+
return INT2FIX(term_cnt);
|
727
|
+
}
|
728
|
+
|
729
|
+
/*
|
730
|
+
* call-seq:
|
731
|
+
* term_enum.set_field(field) -> self
|
732
|
+
*
|
733
|
+
* Set the field for the term_enum. The field value should be a symbol as
|
734
|
+
* usual. For example, to scan all title terms you'd do this;
|
735
|
+
*
|
736
|
+
* term_enum.set_field(:title).each do |term, doc_freq|
|
737
|
+
* do_something()
|
738
|
+
* end
|
739
|
+
*/
|
740
|
+
static VALUE
|
741
|
+
frt_te_set_field(VALUE self, VALUE rfield)
|
742
|
+
{
|
743
|
+
TermEnum *te = (TermEnum *)DATA_PTR(self);
|
744
|
+
int field_num = 0;
|
745
|
+
VALUE rfnum_map = rb_ivar_get(self, id_fld_num_map);
|
746
|
+
VALUE rfnum = rb_hash_aref(rfnum_map, rfield);
|
747
|
+
if (rfnum != Qnil) {
|
748
|
+
field_num = FIX2INT(rfnum);
|
749
|
+
rb_ivar_set(self, id_field_num, rfnum);
|
750
|
+
} else {
|
751
|
+
Check_Type(rfield, T_SYMBOL);
|
752
|
+
rb_raise(rb_eArgError, "field %s doesn't exist in the index",
|
753
|
+
frt_field(rfield));
|
754
|
+
}
|
755
|
+
te->set_field(te, field_num);
|
756
|
+
|
757
|
+
return self;
|
758
|
+
}
|
759
|
+
|
760
|
+
/****************************************************************************
|
761
|
+
*
|
762
|
+
* TermDocEnum Methods
|
763
|
+
*
|
764
|
+
****************************************************************************/
|
765
|
+
|
766
|
+
static void
|
767
|
+
frt_tde_free(void *p)
|
768
|
+
{
|
769
|
+
TermDocEnum *tde = (TermDocEnum *)p;
|
770
|
+
tde->close(tde);
|
771
|
+
}
|
772
|
+
|
773
|
+
static VALUE
|
774
|
+
frt_get_tde(VALUE rir, TermDocEnum *tde)
|
775
|
+
{
|
776
|
+
VALUE self = Data_Wrap_Struct(cTermDocEnum, NULL, &frt_tde_free, tde);
|
777
|
+
rb_ivar_set(self, id_fld_num_map, rb_ivar_get(rir, id_fld_num_map));
|
778
|
+
return self;
|
779
|
+
}
|
780
|
+
|
781
|
+
/*
|
782
|
+
* call-seq:
|
783
|
+
* term_doc_enum.seek(field, term) -> self
|
784
|
+
*
|
785
|
+
* Seek the term +term+ in the index for +field+. After you call this method
|
786
|
+
* you can call next or each to skip through the documents and positions of
|
787
|
+
* this particular term.
|
788
|
+
*/
|
789
|
+
static VALUE
|
790
|
+
frt_tde_seek(VALUE self, VALUE rfield, VALUE rterm)
|
791
|
+
{
|
792
|
+
TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
|
793
|
+
char *term;
|
794
|
+
VALUE rfnum_map = rb_ivar_get(self, id_fld_num_map);
|
795
|
+
VALUE rfnum = rb_hash_aref(rfnum_map, rfield);
|
796
|
+
int field_num = -1;
|
797
|
+
term = StringValuePtr(rterm);
|
798
|
+
if (rfnum != Qnil) {
|
799
|
+
field_num = FIX2INT(rfnum);
|
800
|
+
} else {
|
801
|
+
rb_raise(rb_eArgError, "field %s doesn't exist in the index",
|
802
|
+
frt_field(rfield));
|
803
|
+
}
|
804
|
+
tde->seek(tde, field_num, term);
|
805
|
+
return self;
|
806
|
+
}
|
807
|
+
|
808
|
+
/*
|
809
|
+
* call-seq:
|
810
|
+
* term_doc_enum.seek_term_enum(term_enum) -> self
|
811
|
+
*
|
812
|
+
* Seek the current term in +term_enum+. You could just use the standard seek
|
813
|
+
* method like this;
|
814
|
+
*
|
815
|
+
* term_doc_enum.seek(term_enum.term)
|
816
|
+
*
|
817
|
+
* However the +seek_term_enum+ method saves an index lookup so should offer
|
818
|
+
* a large performance improvement.
|
819
|
+
*/
|
820
|
+
static VALUE
|
821
|
+
frt_tde_seek_te(VALUE self, VALUE rterm_enum)
|
822
|
+
{
|
823
|
+
TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
|
824
|
+
TermEnum *te = (TermEnum *)frt_rb_data_ptr(rterm_enum);
|
825
|
+
tde->seek_te(tde, te);
|
826
|
+
return self;
|
827
|
+
}
|
828
|
+
|
829
|
+
/*
|
830
|
+
* call-seq:
|
831
|
+
* term_doc_enum.doc -> doc_id
|
832
|
+
*
|
833
|
+
* Returns the current document number pointed to by the +term_doc_enum+.
|
834
|
+
*/
|
835
|
+
static VALUE
|
836
|
+
frt_tde_doc(VALUE self)
|
837
|
+
{
|
838
|
+
TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
|
839
|
+
return INT2FIX(tde->doc_num(tde));
|
840
|
+
}
|
841
|
+
|
842
|
+
/*
|
843
|
+
* call-seq:
|
844
|
+
* term_doc_enum.doc -> doc_id
|
845
|
+
*
|
846
|
+
* Returns the frequency of the current document pointed to by the
|
847
|
+
* +term_doc_enum+.
|
848
|
+
*/
|
849
|
+
static VALUE
|
850
|
+
frt_tde_freq(VALUE self)
|
851
|
+
{
|
852
|
+
TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
|
853
|
+
return INT2FIX(tde->freq(tde));
|
854
|
+
}
|
855
|
+
|
856
|
+
/*
|
857
|
+
* call-seq:
|
858
|
+
* term_doc_enum.doc -> doc_id
|
859
|
+
*
|
860
|
+
* Move forward to the next document in the enumeration. Returns +true+ if
|
861
|
+
* there is another document or +false+ otherwise.
|
862
|
+
*/
|
863
|
+
static VALUE
|
864
|
+
frt_tde_next(VALUE self)
|
865
|
+
{
|
866
|
+
TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
|
867
|
+
return tde->next(tde) ? Qtrue : Qfalse;
|
868
|
+
}
|
869
|
+
|
870
|
+
/*
|
871
|
+
* call-seq:
|
872
|
+
* term_doc_enum.doc -> doc_id
|
873
|
+
*
|
874
|
+
* Move forward to the next document in the enumeration. Returns +true+ if
|
875
|
+
* there is another document or +false+ otherwise.
|
876
|
+
*/
|
877
|
+
static VALUE
|
878
|
+
frt_tde_next_position(VALUE self)
|
879
|
+
{
|
880
|
+
TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
|
881
|
+
int pos;
|
882
|
+
if (tde->next_position == NULL) {
|
883
|
+
rb_raise(rb_eNotImpError, "to scan through positions you must create "
|
884
|
+
"the TermDocEnum with Index#term_positions method rather "
|
885
|
+
"than the Index#term_docs method");
|
886
|
+
}
|
887
|
+
pos = tde->next_position(tde);
|
888
|
+
return pos >= 0 ? INT2FIX(pos) : Qnil;
|
889
|
+
}
|
890
|
+
|
891
|
+
/*
|
892
|
+
* call-seq:
|
893
|
+
* term_doc_enum.each {|doc_id, freq| do_something() } -> doc_count
|
894
|
+
*
|
895
|
+
* Iterate through the documents and document frequencies in the
|
896
|
+
* +term_doc_enum+.
|
897
|
+
*
|
898
|
+
* NOTE: this method can only be called once after each seek. If you need to
|
899
|
+
* call +#each+ again then you should call +#seek+ again too.
|
900
|
+
*/
|
901
|
+
static VALUE
|
902
|
+
frt_tde_each(VALUE self)
|
903
|
+
{
|
904
|
+
int doc_cnt = 0;
|
905
|
+
TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
|
906
|
+
VALUE vals = rb_ary_new2(2);
|
907
|
+
RARRAY(vals)->len = 2;
|
908
|
+
|
909
|
+
while (tde->next(tde)) {
|
910
|
+
doc_cnt++;
|
911
|
+
RARRAY(vals)->ptr[0] = INT2FIX(tde->doc_num(tde));
|
912
|
+
RARRAY(vals)->ptr[1] = INT2FIX(tde->freq(tde));
|
913
|
+
rb_yield(vals);
|
914
|
+
|
915
|
+
}
|
916
|
+
return INT2FIX(doc_cnt);
|
917
|
+
}
|
918
|
+
|
919
|
+
/*
|
920
|
+
* call-seq:
|
921
|
+
* term_doc_enum.each_position {|pos| do_something } -> term_doc_enum
|
922
|
+
*
|
923
|
+
* Iterate through each of the positions occupied by the current term in the
|
924
|
+
* current document. This can only be called once per document. It can be
|
925
|
+
* used within the each method. For example, to print the terms documents and
|
926
|
+
* positions;
|
927
|
+
*
|
928
|
+
* tde.each do |doc_id, freq|
|
929
|
+
* puts "term appeared #{freq} times in document #{doc_id}:"
|
930
|
+
* positions = []
|
931
|
+
* tde.each_position {|pos| positions << pos}
|
932
|
+
* puts " #{positions.join(', ')}"
|
933
|
+
* end
|
934
|
+
*/
|
935
|
+
static VALUE
|
936
|
+
frt_tde_each_position(VALUE self)
|
937
|
+
{
|
938
|
+
TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
|
939
|
+
int pos;
|
940
|
+
if (tde->next_position == NULL) {
|
941
|
+
rb_raise(rb_eNotImpError, "to scan through positions you must create "
|
942
|
+
"the TermDocEnum with Index#term_positions method rather "
|
943
|
+
"than the Index#term_docs method");
|
944
|
+
}
|
945
|
+
while (0 <= (pos = tde->next_position(tde))) {
|
946
|
+
rb_yield(INT2FIX(pos));
|
947
|
+
}
|
948
|
+
return self;
|
949
|
+
}
|
950
|
+
|
951
|
+
/*
|
952
|
+
* call-seq:
|
953
|
+
* term_doc_enum.skip_to(target) -> bool
|
954
|
+
*
|
955
|
+
* Skip to the required document number +target+ and return true if there is
|
956
|
+
* a document >= +target+.
|
957
|
+
*/
|
958
|
+
static VALUE
|
959
|
+
frt_tde_skip_to(VALUE self, VALUE rtarget)
|
960
|
+
{
|
961
|
+
TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
|
962
|
+
return tde->skip_to(tde, FIX2INT(rtarget)) ? Qtrue : Qfalse;
|
963
|
+
}
|
964
|
+
|
965
|
+
/****************************************************************************
|
966
|
+
*
|
967
|
+
* TVOffsets Methods
|
968
|
+
*
|
969
|
+
****************************************************************************/
|
970
|
+
|
971
|
+
static VALUE
|
972
|
+
frt_get_tv_offsets(Offset *offset)
|
973
|
+
{
|
974
|
+
return rb_struct_new(cTVOffsets,
|
975
|
+
INT2FIX(offset->start),
|
976
|
+
INT2FIX(offset->end),
|
977
|
+
NULL);
|
978
|
+
}
|
979
|
+
|
980
|
+
/****************************************************************************
|
981
|
+
*
|
982
|
+
* TVTerm Methods
|
983
|
+
*
|
984
|
+
****************************************************************************/
|
985
|
+
|
986
|
+
static VALUE
|
987
|
+
frt_get_tv_term(TVTerm *tv_term)
|
988
|
+
{
|
989
|
+
int i;
|
990
|
+
const int freq = tv_term->freq;
|
991
|
+
VALUE rtext;
|
992
|
+
VALUE rpositions = Qnil;
|
993
|
+
rtext = rb_str_new2(tv_term->text);
|
994
|
+
if (tv_term->positions) {
|
995
|
+
VALUE *rpos;
|
996
|
+
int *positions = tv_term->positions;
|
997
|
+
rpositions = rb_ary_new2(freq);
|
998
|
+
rpos = RARRAY(rpositions)->ptr;
|
999
|
+
RARRAY(rpositions)->len = freq;
|
1000
|
+
for (i = 0; i < freq; i++) {
|
1001
|
+
rpos[i] = INT2FIX(positions[i]);
|
1002
|
+
}
|
1003
|
+
}
|
1004
|
+
return rb_struct_new(cTVTerm, rtext, rpositions, NULL);
|
1005
|
+
}
|
1006
|
+
|
1007
|
+
/****************************************************************************
|
1008
|
+
*
|
1009
|
+
* TermVector Methods
|
1010
|
+
*
|
1011
|
+
****************************************************************************/
|
1012
|
+
|
1013
|
+
static VALUE
|
1014
|
+
frt_get_tv(TermVector *tv)
|
1015
|
+
{
|
1016
|
+
int i;
|
1017
|
+
TVTerm *terms = tv->terms;
|
1018
|
+
const int t_cnt = tv->term_cnt;
|
1019
|
+
const int o_cnt = tv->offset_cnt;
|
1020
|
+
VALUE rfield, rterms, *rts;
|
1021
|
+
VALUE roffsets = Qnil;
|
1022
|
+
rfield = ID2SYM(rb_intern(tv->field));
|
1023
|
+
|
1024
|
+
rterms = rb_ary_new2(t_cnt);
|
1025
|
+
RARRAY(rterms)->len = t_cnt;
|
1026
|
+
rts = RARRAY(rterms)->ptr;
|
1027
|
+
for (i = 0; i < t_cnt; i++) {
|
1028
|
+
rts[i] = frt_get_tv_term(&terms[i]);
|
1029
|
+
}
|
1030
|
+
|
1031
|
+
if (tv->offsets) {
|
1032
|
+
VALUE *ros;
|
1033
|
+
Offset *offsets = tv->offsets;
|
1034
|
+
roffsets = rb_ary_new2(o_cnt);
|
1035
|
+
ros = RARRAY(roffsets)->ptr;
|
1036
|
+
RARRAY(roffsets)->len = o_cnt;
|
1037
|
+
for (i = 0; i < o_cnt; i++) {
|
1038
|
+
ros[i] = frt_get_tv_offsets(&offsets[i]);
|
1039
|
+
}
|
1040
|
+
}
|
1041
|
+
|
1042
|
+
return rb_struct_new(cTermVector, rfield, rterms, roffsets, NULL);
|
1043
|
+
}
|
1044
|
+
|
1045
|
+
/****************************************************************************
|
1046
|
+
*
|
1047
|
+
* IndexWriter Methods
|
1048
|
+
*
|
1049
|
+
****************************************************************************/
|
1050
|
+
|
1051
|
+
void
|
1052
|
+
frt_iw_free(void *p)
|
1053
|
+
{
|
1054
|
+
iw_close((IndexWriter *)p);
|
1055
|
+
}
|
1056
|
+
|
1057
|
+
void
|
1058
|
+
frt_iw_mark(void *p)
|
1059
|
+
{
|
1060
|
+
IndexWriter *iw = (IndexWriter *)p;
|
1061
|
+
frt_gc_mark(iw->analyzer);
|
1062
|
+
frt_gc_mark(iw->store);
|
1063
|
+
frt_gc_mark(iw->fis);
|
1064
|
+
}
|
1065
|
+
|
1066
|
+
/*
|
1067
|
+
* call-seq:
|
1068
|
+
* index_writer.close -> nil
|
1069
|
+
*
|
1070
|
+
* Close the IndexWriter. This will close and free all resources used
|
1071
|
+
* exclusively by the index writer. The garbage collector will do this
|
1072
|
+
* automatically if not called explicitly.
|
1073
|
+
*/
|
1074
|
+
static VALUE
|
1075
|
+
frt_iw_close(VALUE self)
|
1076
|
+
{
|
1077
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1078
|
+
Frt_Unwrap_Struct(self);
|
1079
|
+
iw_close(iw);
|
1080
|
+
return Qnil;
|
1081
|
+
}
|
1082
|
+
|
1083
|
+
/*
 * Copy the integer option +attr+ from the +roptions+ hash into +config+ when
 * the option is present and truthy. Expects +roptions+, +rval+ and +config+
 * to be declared where the macro is expanded, and a symbol sym_<attr> to
 * exist for the option.
 */
#define SET_INT_ATTR(attr) \
    do {\
        rval = rb_hash_aref(roptions, sym_##attr);\
        if (RTEST(rval)) {\
            config.attr = FIX2INT(rval);\
        }\
    } while (0)
|
1088
|
+
|
1089
|
+
/*
|
1090
|
+
* call-seq:
|
1091
|
+
* IndexWriter.new(options = {}) -> index_writer
|
1092
|
+
*
|
1093
|
+
* Create a new IndexWriter. You should either pass a path or a directory to
|
1094
|
+
* this constructor. For example, here are three ways you can create an
|
1095
|
+
* IndexWriter;
|
1096
|
+
*
|
1097
|
+
* dir = RAMDirectory.new()
|
1098
|
+
* iw = IndexWriter.new(:dir => dir)
|
1099
|
+
*
|
1100
|
+
* dir = FSDirectory.new("/path/to/index")
|
1101
|
+
* iw = IndexWriter.new(:dir => dir)
|
1102
|
+
*
|
1103
|
+
* iw = IndexWriter.new(:path => "/path/to/index")
|
1104
|
+
*
|
1105
|
+
* See IndexWriter for more options.
|
1106
|
+
*/
|
1107
|
+
static VALUE
|
1108
|
+
frt_iw_init(int argc, VALUE *argv, VALUE self)
|
1109
|
+
{
|
1110
|
+
VALUE roptions, rval;
|
1111
|
+
bool create = false;
|
1112
|
+
bool create_if_missing = true;
|
1113
|
+
Store *store = NULL;
|
1114
|
+
Analyzer *analyzer = NULL;
|
1115
|
+
IndexWriter *volatile iw = NULL;
|
1116
|
+
Config config = default_config;
|
1117
|
+
|
1118
|
+
rb_scan_args(argc, argv, "01", &roptions);
|
1119
|
+
if (argc > 0) {
|
1120
|
+
Check_Type(roptions, T_HASH);
|
1121
|
+
|
1122
|
+
if ((rval = rb_hash_aref(roptions, sym_dir)) != Qnil) {
|
1123
|
+
Check_Type(rval, T_DATA);
|
1124
|
+
store = DATA_PTR(rval);
|
1125
|
+
} else if ((rval = rb_hash_aref(roptions, sym_path)) != Qnil) {
|
1126
|
+
StringValue(rval);
|
1127
|
+
frt_create_dir(rval);
|
1128
|
+
store = open_fs_store(RSTRING(rval)->ptr);
|
1129
|
+
DEREF(store);
|
1130
|
+
}
|
1131
|
+
|
1132
|
+
/* Let ruby's garbage collector handle the closing of the store
|
1133
|
+
if (!close_dir) {
|
1134
|
+
close_dir = RTEST(rb_hash_aref(roptions, sym_close_dir));
|
1135
|
+
}
|
1136
|
+
*/
|
1137
|
+
/* use_compound_file defaults to true */
|
1138
|
+
config.use_compound_file =
|
1139
|
+
(rb_hash_aref(roptions, sym_use_compound_file) == Qfalse)
|
1140
|
+
? false
|
1141
|
+
: true;
|
1142
|
+
|
1143
|
+
if ((rval = rb_hash_aref(roptions, sym_analyzer)) != Qnil) {
|
1144
|
+
analyzer = frt_get_cwrapped_analyzer(rval);
|
1145
|
+
}
|
1146
|
+
|
1147
|
+
create = RTEST(rb_hash_aref(roptions, sym_create));
|
1148
|
+
if ((rval = rb_hash_aref(roptions, sym_create_if_missing)) != Qnil) {
|
1149
|
+
create_if_missing = RTEST(rval);
|
1150
|
+
}
|
1151
|
+
SET_INT_ATTR(chunk_size);
|
1152
|
+
SET_INT_ATTR(max_buffer_memory);
|
1153
|
+
SET_INT_ATTR(index_interval);
|
1154
|
+
SET_INT_ATTR(skip_interval);
|
1155
|
+
SET_INT_ATTR(merge_factor);
|
1156
|
+
SET_INT_ATTR(max_buffered_docs);
|
1157
|
+
SET_INT_ATTR(max_merge_docs);
|
1158
|
+
SET_INT_ATTR(max_field_length);
|
1159
|
+
}
|
1160
|
+
if (NULL == store) {
|
1161
|
+
store = open_ram_store();
|
1162
|
+
DEREF(store);
|
1163
|
+
}
|
1164
|
+
if (!create && create_if_missing && !store->exists(store, "segments")) {
|
1165
|
+
create = true;
|
1166
|
+
}
|
1167
|
+
if (create) {
|
1168
|
+
FieldInfos *fis;
|
1169
|
+
if ((rval = rb_hash_aref(roptions, sym_field_infos)) != Qnil) {
|
1170
|
+
Data_Get_Struct(rval, FieldInfos, fis);
|
1171
|
+
index_create(store, fis);
|
1172
|
+
} else {
|
1173
|
+
fis = fis_new(STORE_YES, INDEX_YES,
|
1174
|
+
TERM_VECTOR_WITH_POSITIONS_OFFSETS);
|
1175
|
+
index_create(store, fis);
|
1176
|
+
fis_deref(fis);
|
1177
|
+
}
|
1178
|
+
}
|
1179
|
+
|
1180
|
+
iw = iw_open(store, analyzer, &config);
|
1181
|
+
|
1182
|
+
Frt_Wrap_Struct(self, &frt_iw_mark, &frt_iw_free, iw);
|
1183
|
+
|
1184
|
+
if (rb_block_given_p()) {
|
1185
|
+
rb_yield(self);
|
1186
|
+
frt_iw_close(self);
|
1187
|
+
return Qnil;
|
1188
|
+
} else {
|
1189
|
+
return self;
|
1190
|
+
}
|
1191
|
+
}
|
1192
|
+
|
1193
|
+
/*
|
1194
|
+
* call-seq:
|
1195
|
+
* iw.doc_count -> number
|
1196
|
+
*
|
1197
|
+
* Returns the number of documents in the Index. Note that deletions won't be
|
1198
|
+
* taken into account until the IndexWriter has been commited.
|
1199
|
+
*/
|
1200
|
+
static VALUE
|
1201
|
+
frt_iw_get_doc_count(VALUE self)
|
1202
|
+
{
|
1203
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1204
|
+
return INT2FIX(iw_doc_count(iw));
|
1205
|
+
}
|
1206
|
+
|
1207
|
+
static int
|
1208
|
+
frt_hash_to_doc_i(VALUE key, VALUE value, VALUE arg)
|
1209
|
+
{
|
1210
|
+
if (key == Qundef) {
|
1211
|
+
return ST_CONTINUE;
|
1212
|
+
} else {
|
1213
|
+
Document *doc = (Document *)arg;
|
1214
|
+
char *field;
|
1215
|
+
VALUE val;
|
1216
|
+
DocField *df;
|
1217
|
+
switch (TYPE(key)) {
|
1218
|
+
case T_STRING:
|
1219
|
+
field = RSTRING(key)->ptr;
|
1220
|
+
break;
|
1221
|
+
case T_SYMBOL:
|
1222
|
+
field = rb_id2name(SYM2ID(key));
|
1223
|
+
break;
|
1224
|
+
default:
|
1225
|
+
rb_raise(rb_eArgError,
|
1226
|
+
"%s cannot be a key to a field. Field keys must "
|
1227
|
+
" be symbols.", RSTRING(rb_obj_as_string(key))->ptr);
|
1228
|
+
break;
|
1229
|
+
}
|
1230
|
+
if (NULL == (df = doc_get_field(doc, field))) {
|
1231
|
+
df = df_new(field);
|
1232
|
+
}
|
1233
|
+
if (rb_respond_to(value, id_boost)) {
|
1234
|
+
df->boost = (float)NUM2DBL(rb_funcall(value, id_boost, 0));
|
1235
|
+
}
|
1236
|
+
switch (TYPE(value)) {
|
1237
|
+
case T_ARRAY:
|
1238
|
+
{
|
1239
|
+
int i;
|
1240
|
+
for (i = 0; i < RARRAY(value)->len; i++) {
|
1241
|
+
val = rb_obj_as_string(RARRAY(value)->ptr[i]);
|
1242
|
+
df_add_data_len(df,
|
1243
|
+
RSTRING(val)->ptr,
|
1244
|
+
RSTRING(val)->len);
|
1245
|
+
}
|
1246
|
+
}
|
1247
|
+
break;
|
1248
|
+
default:
|
1249
|
+
val = rb_obj_as_string(value);
|
1250
|
+
df_add_data_len(df, RSTRING(val)->ptr, RSTRING(val)->len);
|
1251
|
+
break;
|
1252
|
+
}
|
1253
|
+
doc_add_field(doc, df);
|
1254
|
+
}
|
1255
|
+
return ST_CONTINUE;
|
1256
|
+
}
|
1257
|
+
|
1258
|
+
static Document *
|
1259
|
+
frt_get_doc(VALUE rdoc)
|
1260
|
+
{
|
1261
|
+
VALUE val;
|
1262
|
+
Document *doc = doc_new();
|
1263
|
+
DocField *df;
|
1264
|
+
|
1265
|
+
if (rb_respond_to(rdoc, id_boost)) {
|
1266
|
+
doc->boost = (float)NUM2DBL(rb_funcall(rdoc, id_boost, 0));
|
1267
|
+
}
|
1268
|
+
|
1269
|
+
switch (TYPE(rdoc)) {
|
1270
|
+
case T_HASH:
|
1271
|
+
rb_hash_foreach(rdoc, frt_hash_to_doc_i, (VALUE)doc);
|
1272
|
+
break;
|
1273
|
+
case T_ARRAY:
|
1274
|
+
{
|
1275
|
+
int i;
|
1276
|
+
df = df_new("content");
|
1277
|
+
for (i = 0; i < RARRAY(rdoc)->len; i++) {
|
1278
|
+
val = rb_obj_as_string(RARRAY(rdoc)->ptr[i]);
|
1279
|
+
df_add_data_len(df,
|
1280
|
+
RSTRING(val)->ptr,
|
1281
|
+
RSTRING(val)->len);
|
1282
|
+
}
|
1283
|
+
doc_add_field(doc, df);
|
1284
|
+
}
|
1285
|
+
break;
|
1286
|
+
case T_SYMBOL:
|
1287
|
+
df = df_add_data(df_new("content"), rb_id2name(SYM2ID(rdoc)));
|
1288
|
+
doc_add_field(doc, df);
|
1289
|
+
break;
|
1290
|
+
case T_STRING:
|
1291
|
+
default:
|
1292
|
+
val = rb_obj_as_string(rdoc);
|
1293
|
+
df = df_add_data_len(df_new("content"),
|
1294
|
+
RSTRING(val)->ptr,
|
1295
|
+
RSTRING(val)->len);
|
1296
|
+
doc_add_field(doc, df);
|
1297
|
+
break;
|
1298
|
+
}
|
1299
|
+
return doc;
|
1300
|
+
}
|
1301
|
+
|
1302
|
+
/*
|
1303
|
+
* call-seq:
|
1304
|
+
* iw << document -> iw
|
1305
|
+
* iw.add_document(document) -> iw
|
1306
|
+
*
|
1307
|
+
* Add a document to the index. See Document. A document can also be a simple
|
1308
|
+
* hash object.
|
1309
|
+
*/
|
1310
|
+
static VALUE
|
1311
|
+
frt_iw_add_doc(VALUE self, VALUE rdoc)
|
1312
|
+
{
|
1313
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1314
|
+
Document *doc = frt_get_doc(rdoc);
|
1315
|
+
iw_add_doc(iw, doc);
|
1316
|
+
doc_destroy(doc);
|
1317
|
+
return self;
|
1318
|
+
}
|
1319
|
+
|
1320
|
+
/*
|
1321
|
+
* call-seq:
|
1322
|
+
* iw.optimize -> iw
|
1323
|
+
*
|
1324
|
+
* Optimize the index for searching. This commits any unwritten data to the
|
1325
|
+
* index and optimizes the index into a single segment to improve search
|
1326
|
+
* performance. This is an expensive operation and should not be called too
|
1327
|
+
* often. The best time to call this is at the end of a long batch indexing
|
1328
|
+
* process. Note that calling the optimize method do not in any way effect
|
1329
|
+
* indexing speed (except for the time taken to complete the optimization
|
1330
|
+
* process).
|
1331
|
+
*/
|
1332
|
+
static VALUE
|
1333
|
+
frt_iw_optimize(VALUE self)
|
1334
|
+
{
|
1335
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1336
|
+
iw_optimize(iw);
|
1337
|
+
return self;
|
1338
|
+
}
|
1339
|
+
|
1340
|
+
/*
|
1341
|
+
* call-seq:
|
1342
|
+
* iw.commit -> iw
|
1343
|
+
*
|
1344
|
+
* Explicitly commit any changes to the index that may be hanging around in
|
1345
|
+
* memory. You should call this method if you want to read the latest index
|
1346
|
+
* with an IndexWriter.
|
1347
|
+
*/
|
1348
|
+
static VALUE
|
1349
|
+
frt_iw_commit(VALUE self)
|
1350
|
+
{
|
1351
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1352
|
+
iw_commit(iw);
|
1353
|
+
return self;
|
1354
|
+
}
|
1355
|
+
|
1356
|
+
/*
|
1357
|
+
* call-seq:
|
1358
|
+
* iw.add_readers(reader_array) -> iw
|
1359
|
+
*
|
1360
|
+
* Use this method to merge other indexes into the one being written by
|
1361
|
+
* IndexWriter. This is useful for parallel indexing. You can have several
|
1362
|
+
* indexing processes running in parallel, possibly even on different
|
1363
|
+
* machines. Then you can finish by merging all of the indexes into a single
|
1364
|
+
* index.
|
1365
|
+
*/
|
1366
|
+
static VALUE
|
1367
|
+
frt_iw_add_readers(VALUE self, VALUE rreaders)
|
1368
|
+
{
|
1369
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1370
|
+
int i;
|
1371
|
+
IndexReader **irs;
|
1372
|
+
Check_Type(rreaders, T_ARRAY);
|
1373
|
+
|
1374
|
+
irs = ALLOC_N(IndexReader *, RARRAY(rreaders)->len);
|
1375
|
+
i = RARRAY(rreaders)->len;
|
1376
|
+
while (i-- > 0) {
|
1377
|
+
IndexReader *ir;
|
1378
|
+
Data_Get_Struct(RARRAY(rreaders)->ptr[i], IndexReader, ir);
|
1379
|
+
irs[i] = ir;
|
1380
|
+
}
|
1381
|
+
iw_add_readers(iw, irs, RARRAY(rreaders)->len);
|
1382
|
+
free(irs);
|
1383
|
+
return self;
|
1384
|
+
}
|
1385
|
+
|
1386
|
+
/*
|
1387
|
+
* call-seq:
|
1388
|
+
* iw.delete(field, term) -> iw
|
1389
|
+
*
|
1390
|
+
* Delete all documents in the index with the term +term+ in the field
|
1391
|
+
* +field+. You should usually have a unique document id which you use with
|
1392
|
+
* this method, rather then deleting all documents with the word "the" in
|
1393
|
+
* them. You may however use this method to delete spam.
|
1394
|
+
*/
|
1395
|
+
static VALUE
|
1396
|
+
frt_iw_delete(VALUE self, VALUE rfield, VALUE rterm)
|
1397
|
+
{
|
1398
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1399
|
+
iw_delete_term(iw, frt_field(rfield), StringValuePtr(rterm));
|
1400
|
+
return self;
|
1401
|
+
}
|
1402
|
+
|
1403
|
+
/*
|
1404
|
+
* call-seq:
|
1405
|
+
* index_writer.field_infos -> FieldInfos
|
1406
|
+
*
|
1407
|
+
* Get the FieldInfos object for this IndexWriter. This is useful if you need
|
1408
|
+
* to dynamically add new fields to the index with specific properties.
|
1409
|
+
*/
|
1410
|
+
static VALUE
|
1411
|
+
frt_iw_field_infos(VALUE self)
|
1412
|
+
{
|
1413
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1414
|
+
return frt_get_field_infos(iw->fis);
|
1415
|
+
}
|
1416
|
+
|
1417
|
+
/*
|
1418
|
+
* call-seq:
|
1419
|
+
* index_writer.analyzer -> Analyzer
|
1420
|
+
*
|
1421
|
+
* Get the Analyzer for this IndexWriter. This is useful if you need
|
1422
|
+
* to use the same analyzer in a QueryParser.
|
1423
|
+
*/
|
1424
|
+
static VALUE
|
1425
|
+
frt_iw_get_analyzer(VALUE self)
|
1426
|
+
{
|
1427
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1428
|
+
return frt_get_analyzer(iw->analyzer);
|
1429
|
+
}
|
1430
|
+
|
1431
|
+
/*
|
1432
|
+
* call-seq:
|
1433
|
+
* index_writer.analyzer -> Analyzer
|
1434
|
+
*
|
1435
|
+
* Set the Analyzer for this IndexWriter. This is useful if you need to
|
1436
|
+
* change the analyzer for a special document. It is risky though as the
|
1437
|
+
* same anlyzer will be used for all documents during search.
|
1438
|
+
*/
|
1439
|
+
static VALUE
|
1440
|
+
frt_iw_set_analyzer(VALUE self, VALUE ranalyzer)
|
1441
|
+
{
|
1442
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1443
|
+
|
1444
|
+
a_deref(iw->analyzer);
|
1445
|
+
iw->analyzer = frt_get_cwrapped_analyzer(ranalyzer);
|
1446
|
+
return ranalyzer;
|
1447
|
+
}
|
1448
|
+
|
1449
|
+
/*
|
1450
|
+
* call-seq:
|
1451
|
+
* iw.chunk_size -> number
|
1452
|
+
*
|
1453
|
+
* Return the current value of chunk_size
|
1454
|
+
*/
|
1455
|
+
static VALUE
|
1456
|
+
frt_iw_get_chunk_size(VALUE self)
|
1457
|
+
{
|
1458
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1459
|
+
return INT2FIX(iw->config.chunk_size);
|
1460
|
+
}
|
1461
|
+
|
1462
|
+
/*
|
1463
|
+
* call-seq:
|
1464
|
+
* iw.chunk_size = chunk_size -> chunk_size
|
1465
|
+
*
|
1466
|
+
* Set the chunk_size parameter
|
1467
|
+
*/
|
1468
|
+
static VALUE
|
1469
|
+
frt_iw_set_chunk_size(VALUE self, VALUE rval)
|
1470
|
+
{
|
1471
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1472
|
+
iw->config.chunk_size = FIX2INT(rval);
|
1473
|
+
return rval;
|
1474
|
+
}
|
1475
|
+
|
1476
|
+
/*
|
1477
|
+
* call-seq:
|
1478
|
+
* iw.max_buffer_memory -> number
|
1479
|
+
*
|
1480
|
+
* Return the current value of max_buffer_memory
|
1481
|
+
*/
|
1482
|
+
static VALUE
|
1483
|
+
frt_iw_get_max_buffer_memory(VALUE self)
|
1484
|
+
{
|
1485
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1486
|
+
return INT2FIX(iw->config.max_buffer_memory);
|
1487
|
+
}
|
1488
|
+
|
1489
|
+
/*
|
1490
|
+
* call-seq:
|
1491
|
+
* iw.max_buffer_memory = max_buffer_memory -> max_buffer_memory
|
1492
|
+
*
|
1493
|
+
* Set the max_buffer_memory parameter
|
1494
|
+
*/
|
1495
|
+
static VALUE
|
1496
|
+
frt_iw_set_max_buffer_memory(VALUE self, VALUE rval)
|
1497
|
+
{
|
1498
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1499
|
+
iw->config.max_buffer_memory = FIX2INT(rval);
|
1500
|
+
return rval;
|
1501
|
+
}
|
1502
|
+
|
1503
|
+
/*
|
1504
|
+
* call-seq:
|
1505
|
+
* iw.term_index_interval -> number
|
1506
|
+
*
|
1507
|
+
* Return the current value of term_index_interval
|
1508
|
+
*/
|
1509
|
+
static VALUE
|
1510
|
+
frt_iw_get_index_interval(VALUE self)
|
1511
|
+
{
|
1512
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1513
|
+
return INT2FIX(iw->config.index_interval);
|
1514
|
+
}
|
1515
|
+
|
1516
|
+
/*
|
1517
|
+
* call-seq:
|
1518
|
+
* iw.term_index_interval = term_index_interval -> term_index_interval
|
1519
|
+
*
|
1520
|
+
* Set the term_index_interval parameter
|
1521
|
+
*/
|
1522
|
+
static VALUE
|
1523
|
+
frt_iw_set_index_interval(VALUE self, VALUE rval)
|
1524
|
+
{
|
1525
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1526
|
+
iw->config.index_interval = FIX2INT(rval);
|
1527
|
+
return rval;
|
1528
|
+
}
|
1529
|
+
|
1530
|
+
/*
|
1531
|
+
* call-seq:
|
1532
|
+
* iw.doc_skip_interval -> number
|
1533
|
+
*
|
1534
|
+
* Return the current value of doc_skip_interval
|
1535
|
+
*/
|
1536
|
+
static VALUE
|
1537
|
+
frt_iw_get_skip_interval(VALUE self)
|
1538
|
+
{
|
1539
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1540
|
+
return INT2FIX(iw->config.skip_interval);
|
1541
|
+
}
|
1542
|
+
|
1543
|
+
/*
|
1544
|
+
* call-seq:
|
1545
|
+
* iw.doc_skip_interval = doc_skip_interval -> doc_skip_interval
|
1546
|
+
*
|
1547
|
+
* Set the doc_skip_interval parameter
|
1548
|
+
*/
|
1549
|
+
static VALUE
|
1550
|
+
frt_iw_set_skip_interval(VALUE self, VALUE rval)
|
1551
|
+
{
|
1552
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1553
|
+
iw->config.skip_interval = FIX2INT(rval);
|
1554
|
+
return rval;
|
1555
|
+
}
|
1556
|
+
|
1557
|
+
/*
|
1558
|
+
* call-seq:
|
1559
|
+
* iw.merge_factor -> number
|
1560
|
+
*
|
1561
|
+
* Return the current value of merge_factor
|
1562
|
+
*/
|
1563
|
+
static VALUE
|
1564
|
+
frt_iw_get_merge_factor(VALUE self)
|
1565
|
+
{
|
1566
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1567
|
+
return INT2FIX(iw->config.merge_factor);
|
1568
|
+
}
|
1569
|
+
|
1570
|
+
/*
|
1571
|
+
* call-seq:
|
1572
|
+
* iw.merge_factor = merge_factor -> merge_factor
|
1573
|
+
*
|
1574
|
+
* Set the merge_factor parameter
|
1575
|
+
*/
|
1576
|
+
static VALUE
|
1577
|
+
frt_iw_set_merge_factor(VALUE self, VALUE rval)
|
1578
|
+
{
|
1579
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1580
|
+
iw->config.merge_factor = FIX2INT(rval);
|
1581
|
+
return rval;
|
1582
|
+
}
|
1583
|
+
|
1584
|
+
/*
|
1585
|
+
* call-seq:
|
1586
|
+
* iw.max_buffered_docs -> number
|
1587
|
+
*
|
1588
|
+
* Return the current value of max_buffered_docs
|
1589
|
+
*/
|
1590
|
+
static VALUE
|
1591
|
+
frt_iw_get_max_buffered_docs(VALUE self)
|
1592
|
+
{
|
1593
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1594
|
+
return INT2FIX(iw->config.max_buffered_docs);
|
1595
|
+
}
|
1596
|
+
|
1597
|
+
/*
|
1598
|
+
* call-seq:
|
1599
|
+
* iw.max_buffered_docs = max_buffered_docs -> max_buffered_docs
|
1600
|
+
*
|
1601
|
+
* Set the max_buffered_docs parameter
|
1602
|
+
*/
|
1603
|
+
static VALUE
|
1604
|
+
frt_iw_set_max_buffered_docs(VALUE self, VALUE rval)
|
1605
|
+
{
|
1606
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1607
|
+
iw->config.max_buffered_docs = FIX2INT(rval);
|
1608
|
+
return rval;
|
1609
|
+
}
|
1610
|
+
|
1611
|
+
/*
|
1612
|
+
* call-seq:
|
1613
|
+
* iw.max_merge_docs -> number
|
1614
|
+
*
|
1615
|
+
* Return the current value of max_merge_docs
|
1616
|
+
*/
|
1617
|
+
static VALUE
|
1618
|
+
frt_iw_get_max_merge_docs(VALUE self)
|
1619
|
+
{
|
1620
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1621
|
+
return INT2FIX(iw->config.max_merge_docs);
|
1622
|
+
}
|
1623
|
+
|
1624
|
+
/*
|
1625
|
+
* call-seq:
|
1626
|
+
* iw.max_merge_docs = max_merge_docs -> max_merge_docs
|
1627
|
+
*
|
1628
|
+
* Set the max_merge_docs parameter
|
1629
|
+
*/
|
1630
|
+
static VALUE
|
1631
|
+
frt_iw_set_max_merge_docs(VALUE self, VALUE rval)
|
1632
|
+
{
|
1633
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1634
|
+
iw->config.max_merge_docs = FIX2INT(rval);
|
1635
|
+
return rval;
|
1636
|
+
}
|
1637
|
+
|
1638
|
+
/*
|
1639
|
+
* call-seq:
|
1640
|
+
* iw.max_field_length -> number
|
1641
|
+
*
|
1642
|
+
* Return the current value of max_field_length
|
1643
|
+
*/
|
1644
|
+
static VALUE
|
1645
|
+
frt_iw_get_max_field_length(VALUE self)
|
1646
|
+
{
|
1647
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1648
|
+
return INT2FIX(iw->config.max_field_length);
|
1649
|
+
}
|
1650
|
+
|
1651
|
+
/*
|
1652
|
+
* call-seq:
|
1653
|
+
* iw.max_field_length = max_field_length -> max_field_length
|
1654
|
+
*
|
1655
|
+
* Set the max_field_length parameter
|
1656
|
+
*/
|
1657
|
+
static VALUE
|
1658
|
+
frt_iw_set_max_field_length(VALUE self, VALUE rval)
|
1659
|
+
{
|
1660
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1661
|
+
iw->config.max_field_length = FIX2INT(rval);
|
1662
|
+
return rval;
|
1663
|
+
}
|
1664
|
+
|
1665
|
+
/*
|
1666
|
+
* call-seq:
|
1667
|
+
* iw.use_compound_file -> bool
|
1668
|
+
*
|
1669
|
+
* Return the current value of use_compound_file
|
1670
|
+
*/
|
1671
|
+
static VALUE
|
1672
|
+
frt_iw_get_use_compound_file(VALUE self)
|
1673
|
+
{
|
1674
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1675
|
+
return iw->config.use_compound_file ? Qtrue : Qfalse;
|
1676
|
+
}
|
1677
|
+
|
1678
|
+
/*
|
1679
|
+
* call-seq:
|
1680
|
+
* iw.use_compound_file = use_compound_file -> use_compound_file
|
1681
|
+
*
|
1682
|
+
* Set the use_compound_file parameter
|
1683
|
+
*/
|
1684
|
+
static VALUE
|
1685
|
+
frt_iw_set_use_compound_file(VALUE self, VALUE rval)
|
1686
|
+
{
|
1687
|
+
IndexWriter *iw = (IndexWriter *)DATA_PTR(self);
|
1688
|
+
iw->config.use_compound_file = RTEST(rval);
|
1689
|
+
return rval;
|
1690
|
+
}
|
1691
|
+
|
1692
|
+
/****************************************************************************
|
1693
|
+
*
|
1694
|
+
* LazyDoc Methods
|
1695
|
+
*
|
1696
|
+
****************************************************************************/
|
1697
|
+
|
1698
|
+
static void
|
1699
|
+
frt_lzd_date_free(void *p)
|
1700
|
+
{
|
1701
|
+
lazy_doc_close((LazyDoc *)p);
|
1702
|
+
}
|
1703
|
+
|
1704
|
+
static VALUE
|
1705
|
+
frt_lazy_df_load(VALUE self, VALUE rkey, LazyDocField *lazy_df)
|
1706
|
+
{
|
1707
|
+
VALUE rdata = Qnil;
|
1708
|
+
if (lazy_df) {
|
1709
|
+
if (lazy_df->size == 1) {
|
1710
|
+
char *data = lazy_df_get_data(lazy_df, 0);
|
1711
|
+
rdata = rb_str_new(data, lazy_df->len);
|
1712
|
+
} else {
|
1713
|
+
int i;
|
1714
|
+
rdata = rb_ary_new2(lazy_df->size);
|
1715
|
+
for (i = 0; i < lazy_df->size; i++) {
|
1716
|
+
char *data = lazy_df_get_data(lazy_df, i);
|
1717
|
+
RARRAY(rdata)->ptr[i] =
|
1718
|
+
rb_str_new(data, lazy_df->data[i].length);
|
1719
|
+
RARRAY(rdata)->len++;
|
1720
|
+
}
|
1721
|
+
}
|
1722
|
+
rb_hash_aset(self, rkey, rdata);
|
1723
|
+
}
|
1724
|
+
return rdata;
|
1725
|
+
}
|
1726
|
+
|
1727
|
+
/*
|
1728
|
+
* call-seq:
|
1729
|
+
* lazy_doc.default(key) -> string
|
1730
|
+
*
|
1731
|
+
* This method is used internally to lazily load fields. You should never
|
1732
|
+
* really need to call it yourself.
|
1733
|
+
*/
|
1734
|
+
static VALUE
|
1735
|
+
frt_lzd_default(VALUE self, VALUE rkey)
|
1736
|
+
{
|
1737
|
+
LazyDoc *lazy_doc = (LazyDoc *)DATA_PTR(rb_ivar_get(self, id_data));
|
1738
|
+
char *field = NULL;
|
1739
|
+
switch (TYPE(rkey)) {
|
1740
|
+
case T_STRING:
|
1741
|
+
field = RSTRING(rkey)->ptr;
|
1742
|
+
rkey = ID2SYM(rb_intern(field));
|
1743
|
+
break;
|
1744
|
+
case T_SYMBOL:
|
1745
|
+
field = frt_field(rkey);
|
1746
|
+
break;
|
1747
|
+
default:
|
1748
|
+
rb_raise(rb_eArgError,
|
1749
|
+
"%s cannot be a key to a field. Field keys must "
|
1750
|
+
" be symbols.", RSTRING(rb_obj_as_string(rkey))->ptr);
|
1751
|
+
break;
|
1752
|
+
}
|
1753
|
+
return frt_lazy_df_load(self, rkey, h_get(lazy_doc->field_dict, field));
|
1754
|
+
}
|
1755
|
+
|
1756
|
+
/*
|
1757
|
+
* call-seq:
|
1758
|
+
* lazy_doc.fields -> array of available fields
|
1759
|
+
*
|
1760
|
+
* Returns the list of fields stored for this particular document. If you try
|
1761
|
+
* to access any of these fields in the document the field will be loaded.
|
1762
|
+
* Try to access any other field and nil will be returned.
|
1763
|
+
*/
|
1764
|
+
static VALUE
|
1765
|
+
frt_lzd_fields(VALUE self)
|
1766
|
+
{
|
1767
|
+
return rb_ivar_get(self, id_fields);
|
1768
|
+
}
|
1769
|
+
|
1770
|
+
/*
|
1771
|
+
* call-seq:
|
1772
|
+
* lazy_doc.load -> lazy_doc
|
1773
|
+
*
|
1774
|
+
* Load all unloaded fields in the document from the index.
|
1775
|
+
*/
|
1776
|
+
static VALUE
|
1777
|
+
frt_lzd_load(VALUE self)
|
1778
|
+
{
|
1779
|
+
LazyDoc *lazy_doc = (LazyDoc *)DATA_PTR(rb_ivar_get(self, id_data));
|
1780
|
+
int i;
|
1781
|
+
for (i = 0; i < lazy_doc->size; i++) {
|
1782
|
+
LazyDocField *lazy_df = lazy_doc->fields[i];
|
1783
|
+
frt_lazy_df_load(self, ID2SYM(rb_intern(lazy_df->name)), lazy_df);
|
1784
|
+
}
|
1785
|
+
return self;
|
1786
|
+
}
|
1787
|
+
|
1788
|
+
VALUE
|
1789
|
+
frt_get_lazy_doc(LazyDoc *lazy_doc)
|
1790
|
+
{
|
1791
|
+
int i;
|
1792
|
+
VALUE rfields = rb_ary_new2(lazy_doc->size);
|
1793
|
+
|
1794
|
+
VALUE self, rdata;
|
1795
|
+
self = rb_hash_new();
|
1796
|
+
OBJSETUP(self, cLazyDoc, T_HASH);
|
1797
|
+
|
1798
|
+
rdata = Data_Wrap_Struct(cLazyDocData, NULL, &frt_lzd_date_free, lazy_doc);
|
1799
|
+
rb_ivar_set(self, id_data, rdata);
|
1800
|
+
|
1801
|
+
for (i = 0; i < lazy_doc->size; i++) {
|
1802
|
+
RARRAY(rfields)->ptr[i] = rb_intern(lazy_doc->fields[i]->name);
|
1803
|
+
RARRAY(rfields)->len++;
|
1804
|
+
}
|
1805
|
+
rb_ivar_set(self, id_fields, rfields);
|
1806
|
+
|
1807
|
+
return self;
|
1808
|
+
}
|
1809
|
+
|
1810
|
+
/****************************************************************************
|
1811
|
+
*
|
1812
|
+
* IndexReader Methods
|
1813
|
+
*
|
1814
|
+
****************************************************************************/
|
1815
|
+
|
1816
|
+
void
|
1817
|
+
frt_ir_free(void *p)
|
1818
|
+
{
|
1819
|
+
object_del(p);
|
1820
|
+
ir_close((IndexReader *)p);
|
1821
|
+
}
|
1822
|
+
|
1823
|
+
void
|
1824
|
+
frt_ir_mark(void *p)
|
1825
|
+
{
|
1826
|
+
IndexReader *ir = (IndexReader *)p;
|
1827
|
+
frt_gc_mark(ir->store);
|
1828
|
+
}
|
1829
|
+
|
1830
|
+
/*
|
1831
|
+
* call-seq:
|
1832
|
+
* IndexReader.new(dir) -> index_reader
|
1833
|
+
*
|
1834
|
+
* Create a new IndexReader. You can either pass a string path to a
|
1835
|
+
* file-system directory or an actual Ferret::Store::Directory object. For
|
1836
|
+
* example;
|
1837
|
+
*
|
1838
|
+
* dir = RAMDirectory.new()
|
1839
|
+
* iw = IndexReader.new(dir)
|
1840
|
+
*
|
1841
|
+
* dir = FSDirectory.new("/path/to/index")
|
1842
|
+
* iw = IndexReader.new(dir)
|
1843
|
+
*
|
1844
|
+
* iw = IndexReader.new("/path/to/index")
|
1845
|
+
*/
|
1846
|
+
static VALUE
|
1847
|
+
frt_ir_init(VALUE self, VALUE rdir)
|
1848
|
+
{
|
1849
|
+
Store *store = NULL;
|
1850
|
+
IndexReader *ir;
|
1851
|
+
int i;
|
1852
|
+
FieldInfos *fis;
|
1853
|
+
VALUE rfield_num_map = rb_hash_new();
|
1854
|
+
|
1855
|
+
if (TYPE(rdir) == T_ARRAY) {
|
1856
|
+
const int reader_cnt = RARRAY(rdir)->len;
|
1857
|
+
IndexReader **sub_readers = ALLOC_N(IndexReader *, reader_cnt);
|
1858
|
+
int i;
|
1859
|
+
for (i = 0; i < reader_cnt; i++) {
|
1860
|
+
Data_Get_Struct(RARRAY(rdir)->ptr[i], IndexReader, sub_readers[i]);
|
1861
|
+
REF(sub_readers[i]);
|
1862
|
+
}
|
1863
|
+
ir = mr_open(sub_readers, reader_cnt);
|
1864
|
+
} else {
|
1865
|
+
switch (TYPE(rdir)) {
|
1866
|
+
case T_DATA:
|
1867
|
+
store = DATA_PTR(rdir);
|
1868
|
+
break;
|
1869
|
+
case T_STRING:
|
1870
|
+
frt_create_dir(rdir);
|
1871
|
+
store = open_fs_store(RSTRING(rdir)->ptr);
|
1872
|
+
DEREF(store);
|
1873
|
+
break;
|
1874
|
+
default:
|
1875
|
+
rb_raise(rb_eArgError, "%s isn't a valid directory argument. "
|
1876
|
+
"You should use either a String or a Directory",
|
1877
|
+
RSTRING(rb_obj_as_string(rdir))->ptr);
|
1878
|
+
break;
|
1879
|
+
}
|
1880
|
+
ir = ir_open(store);
|
1881
|
+
}
|
1882
|
+
Frt_Wrap_Struct(self, &frt_ir_mark, &frt_ir_free, ir);
|
1883
|
+
object_add(ir, self);
|
1884
|
+
|
1885
|
+
fis = ir->fis;
|
1886
|
+
for (i = 0; i < fis->size; i++) {
|
1887
|
+
FieldInfo *fi = fis->fields[i];
|
1888
|
+
rb_hash_aset(rfield_num_map,
|
1889
|
+
ID2SYM(rb_intern(fi->name)),
|
1890
|
+
INT2FIX(fi->number));
|
1891
|
+
}
|
1892
|
+
rb_ivar_set(self, id_fld_num_map, rfield_num_map);
|
1893
|
+
|
1894
|
+
return self;
|
1895
|
+
}
|
1896
|
+
|
1897
|
+
/*
|
1898
|
+
* call-seq:
|
1899
|
+
* index_reader.set_norm(doc_id, field, val)
|
1900
|
+
*
|
1901
|
+
* Expert: change the boost value for a +field+ in document at +doc_id+.
|
1902
|
+
* +val+ should be an integer in the range 0..255 which corresponds to an
|
1903
|
+
* encoded float value.
|
1904
|
+
*/
|
1905
|
+
static VALUE
|
1906
|
+
frt_ir_set_norm(VALUE self, VALUE rdoc_id, VALUE rfield, VALUE rval)
|
1907
|
+
{
|
1908
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
1909
|
+
ir_set_norm(ir, FIX2INT(rdoc_id), frt_field(rfield), NUM2CHR(rval));
|
1910
|
+
return self;
|
1911
|
+
}
|
1912
|
+
|
1913
|
+
/*
|
1914
|
+
* call-seq:
|
1915
|
+
* index_reader.norms(field) -> string
|
1916
|
+
*
|
1917
|
+
* Expert: Returns a string containing the norm values for a field. The
|
1918
|
+
* string length will be equal to the number of documents in the index and it
|
1919
|
+
* could have null bytes.
|
1920
|
+
*/
|
1921
|
+
static VALUE
|
1922
|
+
frt_ir_norms(VALUE self, VALUE rfield)
|
1923
|
+
{
|
1924
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
1925
|
+
uchar *norms;
|
1926
|
+
norms = ir_get_norms(ir, frt_field(rfield));
|
1927
|
+
if (norms) {
|
1928
|
+
return rb_str_new((char *)norms, ir->max_doc(ir));
|
1929
|
+
} else {
|
1930
|
+
return Qnil;
|
1931
|
+
}
|
1932
|
+
}
|
1933
|
+
|
1934
|
+
/*
|
1935
|
+
* call-seq:
|
1936
|
+
* index_reader.get_norms_into(field, buffer, offset) -> buffer
|
1937
|
+
*
|
1938
|
+
* Expert: Get the norm values into a string +buffer+ starting at +offset+.
|
1939
|
+
*/
|
1940
|
+
static VALUE
|
1941
|
+
frt_ir_get_norms_into(VALUE self, VALUE rfield, VALUE rnorms, VALUE roffset)
|
1942
|
+
{
|
1943
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
1944
|
+
int offset;
|
1945
|
+
offset = FIX2INT(roffset);
|
1946
|
+
Check_Type(rnorms, T_STRING);
|
1947
|
+
if (RSTRING(rnorms)->len < offset + ir->max_doc(ir)) {
|
1948
|
+
rb_raise(rb_eArgError, "supplied a string of length:%d to "
|
1949
|
+
"IndexReader#get_norms_into but needed a string of length "
|
1950
|
+
"offset:%d + maxdoc:%d",
|
1951
|
+
RSTRING(rnorms)->len, offset, ir->max_doc(ir));
|
1952
|
+
}
|
1953
|
+
|
1954
|
+
ir_get_norms_into(ir, frt_field(rfield),
|
1955
|
+
(uchar *)RSTRING(rnorms)->ptr + offset);
|
1956
|
+
return rnorms;
|
1957
|
+
}
|
1958
|
+
|
1959
|
+
/*
|
1960
|
+
* call-seq:
|
1961
|
+
* index_reader.commit -> index_reader
|
1962
|
+
*
|
1963
|
+
* Commit any deletes made by this particular IndexReader to the index. This
|
1964
|
+
* will open a Commit lock.
|
1965
|
+
*/
|
1966
|
+
static VALUE
|
1967
|
+
frt_ir_commit(VALUE self)
|
1968
|
+
{
|
1969
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
1970
|
+
ir_commit(ir);
|
1971
|
+
return self;
|
1972
|
+
}
|
1973
|
+
|
1974
|
+
/*
|
1975
|
+
* call-seq:
|
1976
|
+
* index_reader.close -> index_reader
|
1977
|
+
*
|
1978
|
+
* Close the IndexReader. This method also commits any deletions made by this
|
1979
|
+
* IndexReader. This method will be called automatically by the garbage
|
1980
|
+
* collector but you should call it explicitly to commit any changes as soon
|
1981
|
+
* as possible and to close any locks held by the object to prevent locking
|
1982
|
+
* errors.
|
1983
|
+
*/
|
1984
|
+
static VALUE
|
1985
|
+
frt_ir_close(VALUE self)
|
1986
|
+
{
|
1987
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
1988
|
+
object_del(ir);
|
1989
|
+
Frt_Unwrap_Struct(self);
|
1990
|
+
ir_close(ir);
|
1991
|
+
return self;
|
1992
|
+
}
|
1993
|
+
|
1994
|
+
/*
|
1995
|
+
* call-seq:
|
1996
|
+
* index_reader.has_deletions? -> bool
|
1997
|
+
*
|
1998
|
+
* Return true if the index has any deletions, either uncommitted by this
|
1999
|
+
* IndexReader or committed by any other IndexReader.
|
2000
|
+
*/
|
2001
|
+
static VALUE
|
2002
|
+
frt_ir_has_deletions(VALUE self)
|
2003
|
+
{
|
2004
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2005
|
+
return ir->has_deletions(ir) ? Qtrue : Qfalse;
|
2006
|
+
}
|
2007
|
+
|
2008
|
+
/*
|
2009
|
+
* call-seq:
|
2010
|
+
* index_reader.delete(doc_id) -> index_reader
|
2011
|
+
*
|
2012
|
+
* Delete document referenced internally by document id +doc_id+. The
|
2013
|
+
* document_id is the number used to reference documents in the index and is
|
2014
|
+
* returned by search methods.
|
2015
|
+
*/
|
2016
|
+
static VALUE
|
2017
|
+
frt_ir_delete(VALUE self, VALUE rdoc_id)
|
2018
|
+
{
|
2019
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2020
|
+
ir_delete_doc(ir, FIX2INT(rdoc_id));
|
2021
|
+
return self;
|
2022
|
+
}
|
2023
|
+
|
2024
|
+
/*
|
2025
|
+
* call-seq:
|
2026
|
+
* index_reader.deleted?(doc_id) -> bool
|
2027
|
+
*
|
2028
|
+
* Returns true if the document at +doc_id+ has been deleted.
|
2029
|
+
*/
|
2030
|
+
static VALUE
|
2031
|
+
frt_ir_is_deleted(VALUE self, VALUE rdoc_id)
|
2032
|
+
{
|
2033
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2034
|
+
return ir->is_deleted(ir, FIX2INT(rdoc_id)) ? Qtrue : Qfalse;
|
2035
|
+
}
|
2036
|
+
|
2037
|
+
/*
|
2038
|
+
* call-seq:
|
2039
|
+
* index_reader.max_doc -> number
|
2040
|
+
*
|
2041
|
+
* Returns 1 + the maximum document id in the index. It is the
|
2042
|
+
* document_id that will be used by the next document added to the index. If
|
2043
|
+
* there are no deletions, this number also refers to the number of documents
|
2044
|
+
* in the index.
|
2045
|
+
*/
|
2046
|
+
static VALUE
|
2047
|
+
frt_ir_max_doc(VALUE self)
|
2048
|
+
{
|
2049
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2050
|
+
return INT2FIX(ir->max_doc(ir));
|
2051
|
+
}
|
2052
|
+
|
2053
|
+
/*
|
2054
|
+
* call-seq:
|
2055
|
+
* index_reader.num_docs -> number
|
2056
|
+
*
|
2057
|
+
* Returns the number of accessible (not deleted) documents in the index.
|
2058
|
+
* This will be equal to IndexReader#max_doc if there have been no documents
|
2059
|
+
* deleted from the index.
|
2060
|
+
*/
|
2061
|
+
static VALUE
|
2062
|
+
frt_ir_num_docs(VALUE self)
|
2063
|
+
{
|
2064
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2065
|
+
return INT2FIX(ir->num_docs(ir));
|
2066
|
+
}
|
2067
|
+
|
2068
|
+
/*
|
2069
|
+
* call-seq:
|
2070
|
+
* index_reader.undelete_all -> index_reader
|
2071
|
+
*
|
2072
|
+
* Undelete all deleted documents in the index. This is kind of like a
|
2073
|
+
* rollback feature. Note that once an index is committed or a merge happens
|
2074
|
+
* during index, deletions will be committed and undelete_all will have no
|
2075
|
+
* effect on these documents.
|
2076
|
+
*/
|
2077
|
+
static VALUE
|
2078
|
+
frt_ir_undelete_all(VALUE self)
|
2079
|
+
{
|
2080
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2081
|
+
ir_undelete_all(ir);
|
2082
|
+
return self;
|
2083
|
+
}
|
2084
|
+
|
2085
|
+
/*
|
2086
|
+
* call-seq:
|
2087
|
+
* index_reader.get_document(doc_id) -> LazyDoc
|
2088
|
+
* index_reader[doc_id] -> LazyDoc
|
2089
|
+
*
|
2090
|
+
* Retrieve a document from the index. See LazyDoc for more details on the
|
2091
|
+
* document returned. Documents are referenced internally by document ids
|
2092
|
+
* which are returned by the Searchers search methods.
|
2093
|
+
*/
|
2094
|
+
static VALUE
|
2095
|
+
frt_ir_get_doc(VALUE self, VALUE rdoc_id)
|
2096
|
+
{
|
2097
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2098
|
+
return frt_get_lazy_doc(ir->get_lazy_doc(ir, FIX2INT(rdoc_id)));
|
2099
|
+
}
|
2100
|
+
|
2101
|
+
/*
|
2102
|
+
* call-seq:
|
2103
|
+
* index_reader.is_latest? -> bool
|
2104
|
+
*
|
2105
|
+
* Return true if the index version referenced by this IndexReader is the
|
2106
|
+
* latest version of the index. If it isn't you should close and reopen the
|
2107
|
+
* index to search the latest documents added to the index.
|
2108
|
+
*/
|
2109
|
+
static VALUE
|
2110
|
+
frt_ir_is_latest(VALUE self)
|
2111
|
+
{
|
2112
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2113
|
+
return ir_is_latest(ir) ? Qtrue : Qfalse;
|
2114
|
+
}
|
2115
|
+
|
2116
|
+
/*
|
2117
|
+
* call-seq:
|
2118
|
+
* index_reader.term_vector(doc_id, field) -> TermVector
|
2119
|
+
*
|
2120
|
+
* Return the TermVector for the field +field+ in the document at +doc_id+ in
|
2121
|
+
* the index. See TermVector.
|
2122
|
+
*/
|
2123
|
+
static VALUE
|
2124
|
+
frt_ir_term_vector(VALUE self, VALUE rdoc_id, VALUE rfield)
|
2125
|
+
{
|
2126
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2127
|
+
TermVector *tv;
|
2128
|
+
VALUE rtv;
|
2129
|
+
tv = ir->term_vector(ir, FIX2INT(rdoc_id), frt_field(rfield));
|
2130
|
+
rtv = frt_get_tv(tv);
|
2131
|
+
tv_destroy(tv);
|
2132
|
+
return rtv;
|
2133
|
+
}
|
2134
|
+
|
2135
|
+
static void
|
2136
|
+
frt_add_each_tv(void *key, void *value, void *rtvs)
|
2137
|
+
{
|
2138
|
+
rb_hash_aset((VALUE)rtvs, ID2SYM(rb_intern(key)), frt_get_tv(value));
|
2139
|
+
}
|
2140
|
+
|
2141
|
+
/*
|
2142
|
+
* call-seq:
|
2143
|
+
* index_reader.term_vectors(doc_id) -> hash of TermVector
|
2144
|
+
*
|
2145
|
+
* Return the TermVectors for the document at +doc_id+ in the index. The
|
2146
|
+
* value returned is a hash of the TermVectors for each field in the document
|
2147
|
+
* and they are referenced by field names (as symbols).
|
2148
|
+
*/
|
2149
|
+
static VALUE
|
2150
|
+
frt_ir_term_vectors(VALUE self, VALUE rdoc_id)
|
2151
|
+
{
|
2152
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2153
|
+
HashTable *tvs = ir->term_vectors(ir, FIX2INT(rdoc_id));
|
2154
|
+
VALUE rtvs = rb_hash_new();
|
2155
|
+
h_each(tvs, &frt_add_each_tv, (void *)rtvs);
|
2156
|
+
h_destroy(tvs);
|
2157
|
+
|
2158
|
+
return rtvs;
|
2159
|
+
}
|
2160
|
+
|
2161
|
+
/*
|
2162
|
+
* call-seq:
|
2163
|
+
* index_reader.term_docs -> TermDocEnum
|
2164
|
+
*
|
2165
|
+
* Builds a TermDocEnum (term-document enumerator) for the index. You can use
|
2166
|
+
* this object to iterate through the documents in which certain terms occur.
|
2167
|
+
* See TermDocEnum for more info.
|
2168
|
+
*/
|
2169
|
+
static VALUE
|
2170
|
+
frt_ir_term_docs(VALUE self)
|
2171
|
+
{
|
2172
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2173
|
+
return frt_get_tde(self, ir->term_docs(ir));
|
2174
|
+
}
|
2175
|
+
|
2176
|
+
/*
|
2177
|
+
* call-seq:
|
2178
|
+
* index_reader.term_docs_for(field, term) -> TermDocEnum
|
2179
|
+
*
|
2180
|
+
* Builds a TermDocEnum to iterate through the documents that contain the
|
2181
|
+
* term +term+ in the field +field+. See TermDocEnum for more info.
|
2182
|
+
*/
|
2183
|
+
static VALUE
|
2184
|
+
frt_ir_term_docs_for(VALUE self, VALUE rfield, VALUE rterm)
|
2185
|
+
{
|
2186
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2187
|
+
return frt_get_tde(self, ir_term_docs_for(ir,
|
2188
|
+
frt_field(rfield),
|
2189
|
+
StringValuePtr(rterm)));
|
2190
|
+
}
|
2191
|
+
|
2192
|
+
/*
|
2193
|
+
* call-seq:
|
2194
|
+
* index_reader.term_positions -> TermDocEnum
|
2195
|
+
*
|
2196
|
+
* Same as IndexReader#term_docs except the TermDocEnum will also allow you
|
2197
|
+
* to scan through the positions at which a term occurs. See TermDocEnum for
|
2198
|
+
* more info.
|
2199
|
+
*/
|
2200
|
+
static VALUE
|
2201
|
+
frt_ir_term_positions(VALUE self)
|
2202
|
+
{
|
2203
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2204
|
+
return frt_get_tde(self, ir->term_positions(ir));
|
2205
|
+
}
|
2206
|
+
|
2207
|
+
/*
|
2208
|
+
* call-seq:
|
2209
|
+
* index_reader.term_positions_for(field, term) -> TermDocEnum
|
2210
|
+
*
|
2211
|
+
* Same as IndexReader#term_docs_for(field, term) except the TermDocEnum will
|
2212
|
+
* also allow you to scan through the positions at which a term occurs. See
|
2213
|
+
* TermDocEnum for more info.
|
2214
|
+
*/
|
2215
|
+
static VALUE
|
2216
|
+
frt_ir_t_pos_for(VALUE self, VALUE rfield, VALUE rterm)
|
2217
|
+
{
|
2218
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2219
|
+
return frt_get_tde(self, ir_term_positions_for(ir,
|
2220
|
+
frt_field(rfield),
|
2221
|
+
StringValuePtr(rterm)));
|
2222
|
+
}
|
2223
|
+
|
2224
|
+
/*
|
2225
|
+
* call-seq:
|
2226
|
+
* index_reader.doc_freq(field, term) -> integer
|
2227
|
+
*
|
2228
|
+
* Return the number of documents in which the term +term+ appears in the
|
2229
|
+
* field +field+.
|
2230
|
+
*/
|
2231
|
+
static VALUE
|
2232
|
+
frt_ir_doc_freq(VALUE self, VALUE rfield, VALUE rterm)
|
2233
|
+
{
|
2234
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2235
|
+
return INT2FIX(ir_doc_freq(ir,
|
2236
|
+
frt_field(rfield),
|
2237
|
+
StringValuePtr(rterm)));
|
2238
|
+
}
|
2239
|
+
|
2240
|
+
/*
|
2241
|
+
* call-seq:
|
2242
|
+
* index_reader.terms(field) -> TermEnum
|
2243
|
+
*
|
2244
|
+
* Returns a term enumerator which allows you to iterate through all the
|
2245
|
+
* terms in the field +field+ in the index.
|
2246
|
+
*/
|
2247
|
+
static VALUE
|
2248
|
+
frt_ir_terms(VALUE self, VALUE rfield)
|
2249
|
+
{
|
2250
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2251
|
+
return frt_get_te(self, ir_terms(ir, frt_field(rfield)));
|
2252
|
+
}
|
2253
|
+
|
2254
|
+
/*
|
2255
|
+
* call-seq:
|
2256
|
+
* index_reader.terms_from(field, term) -> TermEnum
|
2257
|
+
*
|
2258
|
+
* Same as IndexReader#terms(fields) except that it starts the enumerator off
|
2259
|
+
* at term +term+.
|
2260
|
+
*/
|
2261
|
+
static VALUE
|
2262
|
+
frt_ir_terms_from(VALUE self, VALUE rfield, VALUE rterm)
|
2263
|
+
{
|
2264
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2265
|
+
return frt_get_te(self, ir_terms_from(ir,
|
2266
|
+
frt_field(rfield),
|
2267
|
+
StringValuePtr(rterm)));
|
2268
|
+
}
|
2269
|
+
|
2270
|
+
/*
|
2271
|
+
* call-seq:
|
2272
|
+
* index_reader.field_names -> array of field-names
|
2273
|
+
*
|
2274
|
+
* Returns an array of field names in the index. This can be used to pass to
|
2275
|
+
* the QueryParser so that the QueryParser knows how to expand the "*"
|
2276
|
+
* wild-card to all fields in the index. A list of field names can also be
|
2277
|
+
* gathered from the FieldInfos object.
|
2278
|
+
*/
|
2279
|
+
static VALUE
|
2280
|
+
frt_ir_field_names(VALUE self)
|
2281
|
+
{
|
2282
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2283
|
+
FieldInfos *fis = ir->fis;
|
2284
|
+
VALUE rfield_names = rb_ary_new();
|
2285
|
+
int i;
|
2286
|
+
for (i = 0; i < fis->size; i++) {
|
2287
|
+
rb_ary_push(rfield_names, ID2SYM(rb_intern(fis->fields[i]->name)));
|
2288
|
+
}
|
2289
|
+
return rfield_names;
|
2290
|
+
}
|
2291
|
+
|
2292
|
+
/*
|
2293
|
+
* call-seq:
|
2294
|
+
* index_reader.field_infos -> FieldInfos
|
2295
|
+
*
|
2296
|
+
* Get the FieldInfos object for this IndexReader.
|
2297
|
+
*/
|
2298
|
+
static VALUE
|
2299
|
+
frt_ir_field_infos(VALUE self)
|
2300
|
+
{
|
2301
|
+
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2302
|
+
return frt_get_field_infos(ir->fis);
|
2303
|
+
}
|
2304
|
+
|
2305
|
+
/****************************************************************************
|
2306
|
+
*
|
2307
|
+
* Init Functions
|
2308
|
+
*
|
2309
|
+
****************************************************************************/
|
2310
|
+
|
2311
|
+
|
2312
|
+
/*
|
2313
|
+
* Document-class: Ferret::Index::FieldInfo
|
2314
|
+
*
|
2315
|
+
* == Summary
|
2316
|
+
*
|
2317
|
+
* The FieldInfo class is the field descriptor for the index. It specifies
|
2318
|
+
* whether a field is compressed or not or whether it should be indexed and
|
2319
|
+
* tokenized. Every field has a name which must be a symbol. There are three
|
2320
|
+
* properties that you can set, +:store+, +:index+ and +:term_vector+. You
|
2321
|
+
* can also set the default +:boost+ for a field as well.
|
2322
|
+
*
|
2323
|
+
* == Properties
|
2324
|
+
*
|
2325
|
+
* === :store
|
2326
|
+
*
|
2327
|
+
* The +:store+ property allows you to specify how a field is stored. You can
|
2328
|
+
* leave a field unstored (+:no+), store it in its original format (+:yes+)
|
2329
|
+
* or store it in compressed format (+:compressed+). By default the document
|
2330
|
+
* is stored in its original format. If the field is large and it is stored
|
2331
|
+
* elsewhere where it is easily accessible you might want to leave it
|
2332
|
+
* unstored. This will keep the index size a lot smaller and make the
|
2333
|
+
* indexing process a lot faster. For example, you should probably leave the
|
2334
|
+
* +:content+ field unstored when indexing all the documents in your
|
2335
|
+
* file-system.
|
2336
|
+
*
|
2337
|
+
* === :index
|
2338
|
+
*
|
2339
|
+
* The +:index+ property allows you to specify how a field is indexed. A
|
2340
|
+
* field must be indexed to be searchable. However, a field doesn't need to
|
2341
|
+
* be indexed to be stored in the Ferret index. You may want to use the index
|
2342
|
+
* as a simple database and store things like images or MP3s in the index. By
|
2343
|
+
* default each field is indexed and tokenized (split into tokens) (+:yes+).
|
2344
|
+
* If you don't want to index the field use +:no+. If you want the field
|
2345
|
+
* indexed but not tokenized, use +:untokenized+. Do this for the fields you
|
2346
|
+
* wish to sort by. There are two other values for +:index+; +:omit_norms+
|
2347
|
+
* and +:untokenized_omit_norms+. These values correspond to +:yes+ and
|
2348
|
+
* +:untokenized+ respectively and are useful if you are not boosting any
|
2349
|
+
* fields and you'd like to speed up the index. The norms file is the file
|
2350
|
+
* which contains the boost values for each document for a particular field.
|
2351
|
+
*
|
2352
|
+
* === :term_vector
|
2353
|
+
*
|
2354
|
+
* See TermVector for a description of term-vectors. You can specify whether
|
2355
|
+
* or not you would like to store term-vectors. The available options are
|
2356
|
+
* +:no+, +:yes+, +:with_positions+, +:with_offsets+ and
|
2357
|
+
* +:with_positions_offsets+. Note that you need to store the positions to
|
2358
|
+
* associate offsets with individual terms in the term_vector.
|
2359
|
+
*
|
2360
|
+
* == Property Table
|
2361
|
+
*
|
2362
|
+
* Property Value Description
|
2363
|
+
* ------------------------------------------------------------------------
|
2364
|
+
* :store | :no | Don't store field
|
2365
|
+
* | |
|
2366
|
+
* | :yes (default) | Store field in its original
|
2367
|
+
* | | format. Use this value if you
|
2368
|
+
* | | want to highlight matches.
|
2369
|
+
* | | or print match excerpts a la
|
2370
|
+
* | | Google search.
|
2371
|
+
* | |
|
2372
|
+
* | :compressed | Store field in compressed
|
2373
|
+
* | | format.
|
2374
|
+
* -------------|-------------------------|------------------------------
|
2375
|
+
* :index | :no | Do not make this field
|
2376
|
+
* | | searchable.
|
2377
|
+
* | |
|
2378
|
+
* | :yes (default) | Make this field searchable and
|
2379
|
+
* | | tokenize its contents.
|
2380
|
+
* | |
|
2381
|
+
* | :untokenized | Make this field searchable but
|
2382
|
+
* | | do not tokenize its contents.
|
2383
|
+
* | | use this value for fields you
|
2384
|
+
* | | wish to sort by.
|
2385
|
+
* | |
|
2386
|
+
* | :omit_norms | Same as :yes except omit the
|
2387
|
+
* | | norms file. The norms file can
|
2388
|
+
* | | be omitted if you don't boost
|
2389
|
+
* | | any fields and you don't need
|
2390
|
+
* | | scoring based on field length.
|
2391
|
+
* | |
|
2392
|
+
* | :untokenized_omit_norms | Same as :untokenized except omit
|
2393
|
+
* | | the norms file. Norms files can
|
2394
|
+
* | | be omitted if you don't boost
|
2395
|
+
* | | any fields and you don't need
|
2396
|
+
* | | scoring based on field length.
|
2397
|
+
* | |
|
2398
|
+
* -------------|-------------------------|------------------------------
|
2399
|
+
* :term_vector | :no | Don't store term-vectors
|
2400
|
+
* | |
|
2401
|
+
* | :yes | Store term-vectors without
|
2402
|
+
* | | storing positions or offsets.
|
2403
|
+
* | |
|
2404
|
+
* | :with_positions | Store term-vectors with
|
2405
|
+
* | | positions.
|
2406
|
+
* | |
|
2407
|
+
* | :with_offsets | Store term-vectors with
|
2408
|
+
* | | offsets.
|
2409
|
+
* | |
|
2410
|
+
* | :with_positions_offsets | Store term-vectors with
|
2411
|
+
* | (default) | positions and offsets.
|
2412
|
+
*
|
2413
|
+
* == Examples
|
2414
|
+
*
|
2415
|
+
* fi = FieldInfo.new(:title, :index => :untokenized, :term_vector => :no,
|
2416
|
+
* :boost => 10.0)
|
2417
|
+
*
|
2418
|
+
* fi = FieldInfo.new(:content)
|
2419
|
+
*
|
2420
|
+
* fi = FieldInfo.new(:created_on, :index => :untokenized_omit_norms,
|
2421
|
+
* :term_vector => :no)
|
2422
|
+
*
|
2423
|
+
* fi = FieldInfo.new(:image, :store => :compressed, :index => :no,
|
2424
|
+
* :term_vector => :no)
|
2425
|
+
*/
|
2426
|
+
static void
|
2427
|
+
Init_FieldInfo(void)
|
2428
|
+
{
|
2429
|
+
sym_store = ID2SYM(rb_intern("store"));
|
2430
|
+
sym_index = ID2SYM(rb_intern("index"));
|
2431
|
+
sym_term_vector = ID2SYM(rb_intern("term_vector"));
|
2432
|
+
|
2433
|
+
sym_compress = ID2SYM(rb_intern("compress"));
|
2434
|
+
sym_compressed = ID2SYM(rb_intern("compressed"));
|
2435
|
+
|
2436
|
+
sym_untokenized = ID2SYM(rb_intern("untokenized"));
|
2437
|
+
sym_omit_norms = ID2SYM(rb_intern("omit_norms"));
|
2438
|
+
sym_untokenized_omit_norms = ID2SYM(rb_intern("untokenized_omit_norms"));
|
2439
|
+
|
2440
|
+
sym_with_positions = ID2SYM(rb_intern("with_positions"));
|
2441
|
+
sym_with_offsets = ID2SYM(rb_intern("with_offsets"));
|
2442
|
+
sym_with_positions_offsets = ID2SYM(rb_intern("with_positions_offsets"));
|
2443
|
+
|
2444
|
+
cFieldInfo = rb_define_class_under(mIndex, "FieldInfo", rb_cObject);
|
2445
|
+
rb_define_alloc_func(cFieldInfo, frt_data_alloc);
|
2446
|
+
|
2447
|
+
rb_define_method(cFieldInfo, "initialize", frt_fi_init, -1);
|
2448
|
+
rb_define_method(cFieldInfo, "stored?", frt_fi_is_stored, 0);
|
2449
|
+
rb_define_method(cFieldInfo, "compressed?", frt_fi_is_compressed, 0);
|
2450
|
+
rb_define_method(cFieldInfo, "indexed?", frt_fi_is_indexed, 0);
|
2451
|
+
rb_define_method(cFieldInfo, "tokenized?", frt_fi_is_tokenized, 0);
|
2452
|
+
rb_define_method(cFieldInfo, "omit_norms?", frt_fi_omit_norms, 0);
|
2453
|
+
rb_define_method(cFieldInfo, "store_term_vector?",
|
2454
|
+
frt_fi_store_term_vector, 0);
|
2455
|
+
rb_define_method(cFieldInfo, "store_positions?",
|
2456
|
+
frt_fi_store_positions, 0);
|
2457
|
+
rb_define_method(cFieldInfo, "store_offsets?",
|
2458
|
+
frt_fi_store_offsets, 0);
|
2459
|
+
rb_define_method(cFieldInfo, "has_norms?", frt_fi_has_norms, 0);
|
2460
|
+
rb_define_method(cFieldInfo, "boost", frt_fi_boost, 0);
|
2461
|
+
rb_define_method(cFieldInfo, "to_s", frt_fi_to_s, 0);
|
2462
|
+
}
|
2463
|
+
|
2464
|
+
/*
|
2465
|
+
* Document-class: Ferret::Index::FieldInfos
|
2466
|
+
*
|
2467
|
+
* == Summary
|
2468
|
+
*
|
2469
|
+
* The FieldInfos class holds all the field descriptors for an index. It is
|
2470
|
+
* this class that is used to create a new index using the
|
2471
|
+
* FieldInfos#create_index method. If you are happy with the default
|
2472
|
+
* properties for FieldInfo then you don't need to worry about this class.
|
2473
|
+
* IndexWriter can create the index for you. Otherwise you should set up the
|
2474
|
+
* index like in the example;
|
2475
|
+
*
|
2476
|
+
* == Example
|
2477
|
+
*
|
2478
|
+
* field_infos = FieldInfos.new(:term_vector => :no)
|
2479
|
+
*
|
2480
|
+
* field_infos.add_field(:title, :index => :untokenized, :term_vector => :no,
|
2481
|
+
* :boost => 10.0)
|
2482
|
+
*
|
2483
|
+
* field_infos.add_field(:content)
|
2484
|
+
*
|
2485
|
+
* field_infos.add_field(:created_on, :index => :untokenized_omit_norms,
|
2486
|
+
* :term_vector => :no)
|
2487
|
+
*
|
2488
|
+
* field_infos.add_field(:image, :store => :compressed, :index => :no,
|
2489
|
+
* :term_vector => :no)
|
2490
|
+
*
|
2491
|
+
* field_infos.create_index("/path/to/index")
|
2492
|
+
*
|
2493
|
+
* == Default Properties
|
2494
|
+
*
|
2495
|
+
* See FieldInfo for the available field property values.
|
2496
|
+
*
|
2497
|
+
* When you create the FieldInfos object you specify the default properties
|
2498
|
+
* for the fields. Often you'll specify all of the fields in the index before
|
2499
|
+
* you create the index so the default values won't come into play. However,
|
2500
|
+
* it is possible to continue to dynamically add fields as indexing goes
|
2501
|
+
* along. If you add a document to the index which has fields that the index
|
2502
|
+
* doesn't know about then the default properties are used for the new field.
|
2503
|
+
*/
|
2504
|
+
static void
|
2505
|
+
Init_FieldInfos(void)
|
2506
|
+
{
|
2507
|
+
Init_FieldInfo();
|
2508
|
+
|
2509
|
+
cFieldInfos = rb_define_class_under(mIndex, "FieldInfos", rb_cObject);
|
2510
|
+
rb_define_alloc_func(cFieldInfos, frt_data_alloc);
|
2511
|
+
|
2512
|
+
rb_define_method(cFieldInfos, "initialize", frt_fis_init, -1);
|
2513
|
+
rb_define_method(cFieldInfos, "to_a", frt_fis_to_a, 0);
|
2514
|
+
rb_define_method(cFieldInfos, "[]", frt_fis_get, 1);
|
2515
|
+
rb_define_method(cFieldInfos, "add", frt_fis_add, 1);
|
2516
|
+
rb_define_method(cFieldInfos, "<<", frt_fis_add, 1);
|
2517
|
+
rb_define_method(cFieldInfos, "add_field", frt_fis_add_field, -1);
|
2518
|
+
rb_define_method(cFieldInfos, "each", frt_fis_each, 0);
|
2519
|
+
rb_define_method(cFieldInfos, "to_s", frt_fis_to_s, 0);
|
2520
|
+
rb_define_method(cFieldInfos, "create_index",
|
2521
|
+
frt_fis_create_index, 1);
|
2522
|
+
rb_define_method(cFieldInfos, "fields", frt_fis_get_fields, -1);
|
2523
|
+
}
|
2524
|
+
|
2525
|
+
/*
|
2526
|
+
* Document-class: Ferret::Index::TermEnum
|
2527
|
+
*
|
2528
|
+
* == Summary
|
2529
|
+
*
|
2530
|
+
* The TermEnum object is used to iterate through the terms in a field. To
|
2531
|
+
* get a TermEnum you need to use the IndexReader#terms(field) method.
|
2532
|
+
*
|
2533
|
+
* == Example
|
2534
|
+
*
|
2535
|
+
* te = index_reader.terms(:content)
|
2536
|
+
*
|
2537
|
+
* te.each {|term, doc_freq| puts "#{term} occurred #{doc_freq} times" }
|
2538
|
+
*
|
2539
|
+
* # or you could do it like this;
|
2540
|
+
* te = index_reader.terms(:content)
|
2541
|
+
*
|
2542
|
+
* while te.next?
|
2543
|
+
* puts "#{te.term} occurred in #{te.doc_freq} documents in the index"
|
2544
|
+
* end
|
2545
|
+
*/
|
2546
|
+
static void
|
2547
|
+
Init_TermEnum(void)
|
2548
|
+
{
|
2549
|
+
id_term = rb_intern("@term");
|
2550
|
+
|
2551
|
+
cTermEnum = rb_define_class_under(mIndex, "TermEnum", rb_cObject);
|
2552
|
+
rb_define_alloc_func(cTermEnum, frt_data_alloc);
|
2553
|
+
|
2554
|
+
rb_define_method(cTermEnum, "next?", frt_te_next, 0);
|
2555
|
+
rb_define_method(cTermEnum, "term", frt_te_term, 0);
|
2556
|
+
rb_define_method(cTermEnum, "doc_freq", frt_te_doc_freq, 0);
|
2557
|
+
rb_define_method(cTermEnum, "skip_to", frt_te_skip_to, 1);
|
2558
|
+
rb_define_method(cTermEnum, "each", frt_te_each, 0);
|
2559
|
+
rb_define_method(cTermEnum, "field=", frt_te_set_field, 1);
|
2560
|
+
rb_define_method(cTermEnum, "set_field",frt_te_set_field, 1);
|
2561
|
+
}
|
2562
|
+
|
2563
|
+
/*
|
2564
|
+
* Document-class: Ferret::Index::TermDocEnum
|
2565
|
+
*
|
2566
|
+
* == Summary
|
2567
|
+
*
|
2568
|
+
* Use a TermDocEnum to iterate through the documents that contain a
|
2569
|
+
* particular term. You can also iterate through the positions which the term
|
2570
|
+
* occurs in a document.
|
2571
|
+
*
|
2572
|
+
*
|
2573
|
+
* == Example
|
2574
|
+
*
|
2575
|
+
* tde = index_reader.term_docs_for(:content, "fox")
|
2576
|
+
*
|
2577
|
+
* tde.each do |doc_id, freq|
|
2578
|
+
* puts "fox appeared #{freq} times in document #{doc_id}:"
|
2579
|
+
* positions = []
|
2580
|
+
* tde.each_position {|pos| positions << pos}
|
2581
|
+
* puts " #{positions.join(', ')}"
|
2582
|
+
* end
|
2583
|
+
*
|
2584
|
+
* # or you can do it like this;
|
2585
|
+
* tde.seek(:title, "red")
|
2586
|
+
* while tde.next?
|
2587
|
+
* puts "red appeared #{tde.freq} times in document #{tde.doc}:"
|
2588
|
+
* positions = []
|
2589
|
+
* while pos = tde.next_position
|
2590
|
+
* positions << pos
|
2591
|
+
* end
|
2592
|
+
* puts " #{positions.join(', ')}"
|
2593
|
+
* end
|
2594
|
+
*/
|
2595
|
+
static void
|
2596
|
+
Init_TermDocEnum(void)
|
2597
|
+
{
|
2598
|
+
id_fld_num_map = rb_intern("@field_num_map");
|
2599
|
+
id_field_num = rb_intern("@field_num");
|
2600
|
+
|
2601
|
+
cTermDocEnum = rb_define_class_under(mIndex, "TermDocEnum", rb_cObject);
|
2602
|
+
rb_define_alloc_func(cTermDocEnum, frt_data_alloc);
|
2603
|
+
rb_define_method(cTermDocEnum, "seek", frt_tde_seek, 2);
|
2604
|
+
rb_define_method(cTermDocEnum, "seek_term_enum", frt_tde_seek_te, 1);
|
2605
|
+
rb_define_method(cTermDocEnum, "doc", frt_tde_doc, 0);
|
2606
|
+
rb_define_method(cTermDocEnum, "freq", frt_tde_freq, 0);
|
2607
|
+
rb_define_method(cTermDocEnum, "next?", frt_tde_next, 0);
|
2608
|
+
rb_define_method(cTermDocEnum, "next_position", frt_tde_next_position, 0);
|
2609
|
+
rb_define_method(cTermDocEnum, "each", frt_tde_each, 0);
|
2610
|
+
rb_define_method(cTermDocEnum, "each_position", frt_tde_each_position, 0);
|
2611
|
+
rb_define_method(cTermDocEnum, "skip_to", frt_tde_skip_to, 1);
|
2612
|
+
}
|
2613
|
+
|
2614
|
+
/*
|
2615
|
+
* Document-class: Ferret::Index::TermVector::TVOffsets
|
2616
|
+
*
|
2617
|
+
* == Summary
|
2618
|
+
*
|
2619
|
+
* Holds the start and end byte-offsets of a term in a field. For example, if
|
2620
|
+
* the field was "the quick brown fox" then the start and end offsets of
|
2621
|
+
* ["the", "quick", "brown", "fox"] would be [(0,3), (4,9), (10,15), (16,19)]
|
2622
|
+
* respectively. See the Analysis module for more information on setting the
|
2623
|
+
* offsets.
|
2624
|
+
*/
|
2625
|
+
static void
|
2626
|
+
Init_TVOffsets(void)
|
2627
|
+
{
|
2628
|
+
const char *tv_offsets_class = "TVOffsets";
|
2629
|
+
cTVOffsets = rb_struct_define(tv_offsets_class, "start", "end", NULL);
|
2630
|
+
rb_set_class_path(cTVOffsets, cTermVector, tv_offsets_class);
|
2631
|
+
rb_const_set(mIndex, rb_intern(tv_offsets_class), cTVOffsets);
|
2632
|
+
}
|
2633
|
+
|
2634
|
+
/*
|
2635
|
+
* Document-class: Ferret::Index::TermVector::TVTerm
|
2636
|
+
*
|
2637
|
+
* == Summary
|
2638
|
+
*
|
2639
|
+
* The TVTerm class holds the term information for each term in a TermVector.
|
2640
|
+
* That is it holds the term's text and its positions in the document. You
|
2641
|
+
* can use those positions to reference the offsets for the term.
|
2642
|
+
*
|
2643
|
+
* == Example
|
2644
|
+
*
|
2645
|
+
* tv = index_reader.term_vector(:content)
|
2646
|
+
* tv_term = tv.find {|tvt| tvt.text == "fox"}
|
2647
|
+
* offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
|
2648
|
+
*/
|
2649
|
+
static void
|
2650
|
+
Init_TVTerm(void)
|
2651
|
+
{
|
2652
|
+
const char *tv_term_class = "TVTerm";
|
2653
|
+
cTVTerm = rb_struct_define(tv_term_class, "text", "positions", NULL);
|
2654
|
+
rb_set_class_path(cTVTerm, cTermVector, tv_term_class);
|
2655
|
+
rb_const_set(mIndex, rb_intern(tv_term_class), cTVTerm);
|
2656
|
+
}
|
2657
|
+
|
2658
|
+
/*
|
2659
|
+
* Document-class: Ferret::Index::TermVector
|
2660
|
+
*
|
2661
|
+
* == Summary
|
2662
|
+
*
|
2663
|
+
* TermVectors are most commonly used for creating search result excerpts and
|
2664
|
+
* highlight search matches in results. This is all done internally so you
|
2665
|
+
* won't need to worry about the TermVector object. There are some other
|
2666
|
+
* reasons you may want to use the TermVectors object however. For example,
|
2667
|
+
* you may wish to see which terms are the most commonly occurring terms in a
|
2668
|
+
* document to implement a MoreLikeThis search.
|
2669
|
+
*
|
2670
|
+
* == Example
|
2671
|
+
*
|
2672
|
+
* tv = index_reader.term_vector(:content)
|
2673
|
+
* tv_term = tv.find {|tvt| tvt.text == "fox"}
|
2674
|
+
*
|
2675
|
+
* # get the term frequency
|
2676
|
+
* term_freq = tv_term.positions.size
|
2677
|
+
*
|
2678
|
+
* # get the offsets for a term
|
2679
|
+
* offsets = tv_term.positions.collect {|pos| tv.offsets[pos]}
|
2680
|
+
*
|
2681
|
+
* == Note
|
2682
|
+
*
|
2683
|
+
* +positions+ and +offsets+ can be +nil+ depending on what you set the
|
2684
|
+
* +:term_vector+ to when you set the FieldInfo object for the field. Note in
|
2685
|
+
* particular that you need to store both positions and offsets if you want
|
2686
|
+
* to associate offsets with particular terms.
|
2687
|
+
*/
|
2688
|
+
static void
|
2689
|
+
Init_TermVector(void)
|
2690
|
+
{
|
2691
|
+
const char *tv_class = "TermVector";
|
2692
|
+
cTermVector = rb_struct_define(tv_class,
|
2693
|
+
"field", "terms", "offsets", NULL);
|
2694
|
+
rb_set_class_path(cTermVector, mIndex, tv_class);
|
2695
|
+
rb_const_set(mIndex, rb_intern(tv_class), cTermVector);
|
2696
|
+
|
2697
|
+
Init_TVOffsets();
|
2698
|
+
Init_TVTerm();
|
2699
|
+
}
|
2700
|
+
|
2701
|
+
/*
|
2702
|
+
* Document-class: Ferret::Index::IndexWriter
|
2703
|
+
*
|
2704
|
+
* == Summary
|
2705
|
+
*
|
2706
|
+
* The IndexWriter is the class used to add documents to an index. You can
|
2707
|
+
* also delete documents from the index using this class. The indexing
|
2708
|
+
* process is highly customizable and the IndexWriter has the following
|
2709
|
+
* parameters;
|
2710
|
+
*
|
2711
|
+
* dir:: This is a Ferret::Store::Directory object. You
|
2712
|
+
* should either pass a +:dir+ or a +:path+ when
|
2713
|
+
* creating an index.
|
2714
|
+
* path:: A string representing the path to the index
|
2715
|
+
* directory. If you are creating the index for the
|
2716
|
+
* first time the directory will be created if it's
|
2717
|
+
* missing. You should not choose a directory which
|
2718
|
+
* contains other files as they could be over-written.
|
2719
|
+
* To protect against this set +:create_if_missing+ to
|
2720
|
+
* false.
|
2721
|
+
* create_if_missing:: Default: true. Create the index if no index is
|
2722
|
+
* found in the specified directory. Otherwise, use
|
2723
|
+
* the existing index.
|
2724
|
+
* create:: Default: false. Creates the index, even if one
|
2725
|
+
* already exists. That means any existing index will
|
2726
|
+
* be deleted. It is probably better to use the
|
2727
|
+
* create_if_missing option so that the index is only
|
2728
|
+
* created the first time when it doesn't exist.
|
2729
|
+
* field_infos:: Default FieldInfos.new. The FieldInfos object to use
|
2730
|
+
* when creating a new index if +:create_if_missing+ or
|
2731
|
+
* +:create+ is set to true. If an existing index is
|
2732
|
+
* opened then this parameter is ignored.
|
2733
|
+
* analyzer:: Default: Ferret::Analysis::StandardAnalyzer.
|
2734
|
+
* Sets the default analyzer for the index. This is
|
2735
|
+
* used by both the IndexWriter and the QueryParser
|
2736
|
+
* to tokenize the input. The default is the
|
2737
|
+
* StandardAnalyzer.
|
2738
|
+
* chunk_size:: Default: 0x100000 or 1Mb. Memory performance tuning
|
2739
|
+
* parameter. Sets the default size of chunks of memory
|
2740
|
+
* malloced for use during indexing. You can usually
|
2741
|
+
* leave this parameter as is.
|
2742
|
+
* max_buffer_memory:: Default: 0x1000000 or 16Mb. Memory performance
|
2743
|
+
* tuning parameter. Sets the amount of memory to be
|
2744
|
+
* used by the indexing process. Set to a larger value
|
2745
|
+
* to increase indexing speed. Note that this only
|
2746
|
+
* includes memory used by the indexing process, not
|
2747
|
+
* the rest of your ruby application.
|
2748
|
+
* term_index_interval:: Default: 128. The skip interval between terms in the
|
2749
|
+
* term dictionary. A smaller value will possibly
|
2750
|
+
* increase search performance while also increasing
|
2751
|
+
* memory usage and negatively impacting
|
2752
|
+
* indexing performance.
|
2753
|
+
* doc_skip_interval:: Default: 16. The skip interval for document numbers
|
2754
|
+
* in the index. As with +:term_index_interval+ you
|
2755
|
+
* have a trade-off. A smaller number may increase
|
2756
|
+
* search performance while also increasing memory
|
2757
|
+
* usage and negatively impacting indexing
|
2758
|
+
* performance.
|
2759
|
+
* merge_factor:: Default: 10. This must never be less than 2.
|
2760
|
+
* Specifies the number of segments of a certain size
|
2761
|
+
* that must exist before they are merged. A larger
|
2762
|
+
* value will improve indexing performance while
|
2763
|
+
* slowing search performance.
|
2764
|
+
* max_buffered_docs:: Default: 10000. The maximum number of documents that
|
2765
|
+
* may be stored in memory before being written to the
|
2766
|
+
* index. If you have a lot of memory and are indexing
|
2767
|
+
* a large number of small documents (like products in
|
2768
|
+
* a product database for example) you may want to set
|
2769
|
+
* this to a much higher number (like
|
2770
|
+
* Ferret::FIX_INT_MAX). If you are worried about your
|
2771
|
+
* application crashing during the middle of indexing you
|
2772
|
+
* might set this to a smaller number so that the index
|
2773
|
+
* is committed more often. This is like having an
|
2774
|
+
* auto-save in a word processor application.
|
2775
|
+
* max_merge_docs:: Set this value to limit the number of documents that
|
2776
|
+
* go into a single segment. Use this to avoid
|
2777
|
+
* extremely long merge times during indexing which can
|
2778
|
+
* make your application seem unresponsive. This is
|
2779
|
+
* only necessary for very large indexes (millions of
|
2780
|
+
* documents).
|
2781
|
+
* max_field_length:: Default: 10000. The maximum number of terms added to
|
2782
|
+
* a single field. This can be useful to protect the
|
2783
|
+
* indexer when indexing documents from the web for
|
2784
|
+
* example. Usually the most important terms will occur
|
2785
|
+
* early on in a document so you can often safely
|
2786
|
+
* ignore the terms in a field after a certain number
|
2787
|
+
* of them. If you wanted to speed up indexing and save
|
2788
|
+
* space in your index you may only want to index the
|
2789
|
+
* first 1000 terms in a field. On the other hand, if
|
2790
|
+
* you want to be more thorough and you are indexing
|
2791
|
+
* documents from your file-system you may set this
|
2792
|
+
* parameter to Ferret::FIX_INT_MAX.
|
2793
|
+
* use_compound_file:: Default: true. Uses a compound file to store the
|
2794
|
+
* index. This prevents an error being raised for
|
2795
|
+
* having too many files open at the same time. The
|
2796
|
+
* default is true but performance is better if this is
|
2797
|
+
* set to false.
|
2798
|
+
*
|
2799
|
+
*
|
2800
|
+
* === Deleting Documents
|
2801
|
+
*
|
2802
|
+
* Both IndexReader and IndexWriter allow you to delete documents. You should
|
2803
|
+
* use the IndexReader to delete documents by document id and IndexWriter to
|
2804
|
+
* delete documents by term which we'll explain now. It is preferable to
|
2805
|
+
* delete documents from an index using IndexWriter for performance reasons.
|
2806
|
+
* To delete documents using the IndexWriter you should give each document in
|
2807
|
+
* the index a unique ID. If you are indexing documents from the file-system
|
2808
|
+
* this unique ID will be the full file path. If indexing documents from the
|
2809
|
+
* database you should use the primary key as the ID field. You can then
|
2810
|
+
* use the delete method to delete a file referenced by the ID. For example;
|
2811
|
+
*
|
2812
|
+
* index_writer.delete(:id, "/path/to/indexed/file")
|
2813
|
+
*/
|
2814
|
+
void
|
2815
|
+
Init_IndexWriter(void)
|
2816
|
+
{
|
2817
|
+
id_boost = rb_intern("boost");
|
2818
|
+
|
2819
|
+
sym_create = ID2SYM(rb_intern("create"));
|
2820
|
+
sym_create_if_missing = ID2SYM(rb_intern("create_if_missing"));
|
2821
|
+
sym_field_infos = ID2SYM(rb_intern("field_infos"));
|
2822
|
+
|
2823
|
+
sym_chunk_size = ID2SYM(rb_intern("chunk_size"));
|
2824
|
+
sym_max_buffer_memory = ID2SYM(rb_intern("max_buffer_memory"));
|
2825
|
+
sym_index_interval = ID2SYM(rb_intern("term_index_interval"));
|
2826
|
+
sym_skip_interval = ID2SYM(rb_intern("doc_skip_interval"));
|
2827
|
+
sym_merge_factor = ID2SYM(rb_intern("merge_factor"));
|
2828
|
+
sym_max_buffered_docs = ID2SYM(rb_intern("max_buffered_docs"));
|
2829
|
+
sym_max_merge_docs = ID2SYM(rb_intern("max_merge_docs"));
|
2830
|
+
sym_max_field_length = ID2SYM(rb_intern("max_field_length"));
|
2831
|
+
sym_use_compound_file = ID2SYM(rb_intern("use_compound_file"));
|
2832
|
+
|
2833
|
+
cIndexWriter = rb_define_class_under(mIndex, "IndexWriter", rb_cObject);
|
2834
|
+
rb_define_alloc_func(cIndexWriter, frt_data_alloc);
|
2835
|
+
|
2836
|
+
rb_define_const(cIndexWriter, "WRITE_LOCK_TIMEOUT", INT2FIX(1));
|
2837
|
+
rb_define_const(cIndexWriter, "COMMIT_LOCK_TIMEOUT", INT2FIX(10));
|
2838
|
+
rb_define_const(cIndexWriter, "WRITE_LOCK_NAME",
|
2839
|
+
rb_str_new2(WRITE_LOCK_NAME));
|
2840
|
+
rb_define_const(cIndexWriter, "COMMIT_LOCK_NAME",
|
2841
|
+
rb_str_new2(COMMIT_LOCK_NAME));
|
2842
|
+
rb_define_const(cIndexWriter, "DEFAULT_CHUNK_SIZE",
|
2843
|
+
INT2FIX(default_config.chunk_size));
|
2844
|
+
rb_define_const(cIndexWriter, "DEFAULT_MAX_BUFFER_MEMORY",
|
2845
|
+
INT2FIX(default_config.max_buffer_memory));
|
2846
|
+
rb_define_const(cIndexWriter, "DEFAULT_TERM_INDEX_INTERVAL",
|
2847
|
+
INT2FIX(default_config.index_interval));
|
2848
|
+
rb_define_const(cIndexWriter, "DEFAULT_DOC_SKIP_INTERVAL",
|
2849
|
+
INT2FIX(default_config.skip_interval));
|
2850
|
+
rb_define_const(cIndexWriter, "DEFAULT_MERGE_FACTOR",
|
2851
|
+
INT2FIX(default_config.merge_factor));
|
2852
|
+
rb_define_const(cIndexWriter, "DEFAULT_MAX_BUFFERED_DOCS",
|
2853
|
+
INT2FIX(default_config.max_buffered_docs));
|
2854
|
+
rb_define_const(cIndexWriter, "DEFAULT_MAX_MERGE_DOCS",
|
2855
|
+
INT2FIX(default_config.max_merge_docs));
|
2856
|
+
rb_define_const(cIndexWriter, "DEFAULT_MAX_FIELD_LENGTH",
|
2857
|
+
INT2FIX(default_config.max_field_length));
|
2858
|
+
rb_define_const(cIndexWriter, "DEFAULT_USE_COMPOUND_FILE",
|
2859
|
+
default_config.use_compound_file ? Qtrue : Qfalse);
|
2860
|
+
|
2861
|
+
rb_define_method(cIndexWriter, "initialize", frt_iw_init, -1);
|
2862
|
+
rb_define_method(cIndexWriter, "doc_count", frt_iw_get_doc_count, 0);
|
2863
|
+
rb_define_method(cIndexWriter, "close", frt_iw_close, 0);
|
2864
|
+
rb_define_method(cIndexWriter, "add_document", frt_iw_add_doc, 1);
|
2865
|
+
rb_define_method(cIndexWriter, "<<", frt_iw_add_doc, 1);
|
2866
|
+
rb_define_method(cIndexWriter, "optimize", frt_iw_optimize, 0);
|
2867
|
+
rb_define_method(cIndexWriter, "commit", frt_iw_commit, 0);
|
2868
|
+
rb_define_method(cIndexWriter, "add_readers", frt_iw_add_readers, 1);
|
2869
|
+
rb_define_method(cIndexWriter, "delete", frt_iw_delete, 2);
|
2870
|
+
rb_define_method(cIndexWriter, "field_infos", frt_iw_field_infos, 0);
|
2871
|
+
rb_define_method(cIndexWriter, "analyzer", frt_iw_get_analyzer, 0);
|
2872
|
+
rb_define_method(cIndexWriter, "analyzer=", frt_iw_set_analyzer, 1);
|
2873
|
+
|
2874
|
+
rb_define_method(cIndexWriter, "chunk_size",
|
2875
|
+
frt_iw_get_chunk_size, 0);
|
2876
|
+
rb_define_method(cIndexWriter, "chunk_size=",
|
2877
|
+
frt_iw_set_chunk_size, 1);
|
2878
|
+
|
2879
|
+
rb_define_method(cIndexWriter, "max_buffer_memory",
|
2880
|
+
frt_iw_get_max_buffer_memory, 0);
|
2881
|
+
rb_define_method(cIndexWriter, "max_buffer_memory=",
|
2882
|
+
frt_iw_set_max_buffer_memory, 1);
|
2883
|
+
|
2884
|
+
rb_define_method(cIndexWriter, "term_index_interval",
|
2885
|
+
frt_iw_get_index_interval, 0);
|
2886
|
+
rb_define_method(cIndexWriter, "term_index_interval=",
|
2887
|
+
frt_iw_set_index_interval, 1);
|
2888
|
+
|
2889
|
+
rb_define_method(cIndexWriter, "doc_skip_interval",
|
2890
|
+
frt_iw_get_skip_interval, 0);
|
2891
|
+
rb_define_method(cIndexWriter, "doc_skip_interval=",
|
2892
|
+
frt_iw_set_skip_interval, 1);
|
2893
|
+
|
2894
|
+
rb_define_method(cIndexWriter, "merge_factor",
|
2895
|
+
frt_iw_get_merge_factor, 0);
|
2896
|
+
rb_define_method(cIndexWriter, "merge_factor=",
|
2897
|
+
frt_iw_set_merge_factor, 1);
|
2898
|
+
|
2899
|
+
rb_define_method(cIndexWriter, "max_buffered_docs",
|
2900
|
+
frt_iw_get_max_buffered_docs, 0);
|
2901
|
+
rb_define_method(cIndexWriter, "max_buffered_docs=",
|
2902
|
+
frt_iw_set_max_buffered_docs, 1);
|
2903
|
+
|
2904
|
+
rb_define_method(cIndexWriter, "max_merge_docs",
|
2905
|
+
frt_iw_get_max_merge_docs, 0);
|
2906
|
+
rb_define_method(cIndexWriter, "max_merge_docs=",
|
2907
|
+
frt_iw_set_max_merge_docs, 1);
|
2908
|
+
|
2909
|
+
rb_define_method(cIndexWriter, "max_field_length",
|
2910
|
+
frt_iw_get_max_field_length, 0);
|
2911
|
+
rb_define_method(cIndexWriter, "max_field_length=",
|
2912
|
+
frt_iw_set_max_field_length, 1);
|
2913
|
+
|
2914
|
+
rb_define_method(cIndexWriter, "use_compound_file",
|
2915
|
+
frt_iw_get_use_compound_file, 0);
|
2916
|
+
rb_define_method(cIndexWriter, "use_compound_file=",
|
2917
|
+
frt_iw_set_use_compound_file, 1);
|
2918
|
+
|
2919
|
+
}
|
2920
|
+
|
2921
|
+
/*
|
2922
|
+
* Document-class: Ferret::Index::LazyDoc
|
2923
|
+
*
|
2924
|
+
* == Summary
|
2925
|
+
*
|
2926
|
+
* When a document is retrieved from the index a LazyDoc is returned.
|
2927
|
+
* Actually, LazyDoc is just a modified Hash object which lazily adds fields
|
2928
|
+
* to itself when they are accessed. You should note that the keys method
|
2929
|
+
* will return nothing until you actually access one of the fields. To see
|
2930
|
+
* what fields are available use LazyDoc#fields rather than LazyDoc#keys. To
|
2931
|
+
* load all fields use the LazyDoc#load method.
|
2932
|
+
*
|
2933
|
+
* == Example
|
2934
|
+
*
|
2935
|
+
* doc = index_reader[0]
|
2936
|
+
*
|
2937
|
+
* doc.keys #=> []
|
2938
|
+
* doc.values #=> []
|
2939
|
+
* doc.fields #=> [:title, :content]
|
2940
|
+
*
|
2941
|
+
* title = doc[:title] #=> "the title"
|
2942
|
+
* doc.keys #=> [:title]
|
2943
|
+
* doc.values #=> ["the title"]
|
2944
|
+
* doc.fields #=> [:title, :content]
|
2945
|
+
*
|
2946
|
+
* doc.load
|
2947
|
+
* doc.keys #=> [:title, :content]
|
2948
|
+
* doc.values #=> ["the title", "the content"]
|
2949
|
+
* doc.fields #=> [:title, :content]
|
2950
|
+
*/
|
2951
|
+
void
|
2952
|
+
Init_LazyDoc(void)
|
2953
|
+
{
|
2954
|
+
id_fields = rb_intern("@fields");
|
2955
|
+
|
2956
|
+
|
2957
|
+
cLazyDoc = rb_define_class_under(mIndex, "LazyDoc", rb_cHash);
|
2958
|
+
rb_define_method(cLazyDoc, "default", frt_lzd_default, 1);
|
2959
|
+
rb_define_method(cLazyDoc, "load", frt_lzd_load, 0);
|
2960
|
+
rb_define_method(cLazyDoc, "fields", frt_lzd_fields, 0);
|
2961
|
+
|
2962
|
+
cLazyDocData = rb_define_class_under(cLazyDoc, "LazyDocData", rb_cObject);
|
2963
|
+
rb_define_alloc_func(cLazyDocData, frt_data_alloc);
|
2964
|
+
}
|
2965
|
+
|
2966
|
+
/*
|
2967
|
+
* Document-class: Ferret::Index::IndexReader
|
2968
|
+
*
|
2969
|
+
* == Summary
|
2970
|
+
*
|
2971
|
+
* IndexReader is used for reading data from the index. This class is usually
|
2972
|
+
* used directly for more advanced tasks like iterating through terms in an
|
2973
|
+
* index, accessing term-vectors or deleting documents by document id. It is
|
2974
|
+
* also used internally by IndexSearcher.
|
2975
|
+
*/
|
2976
|
+
void
|
2977
|
+
Init_IndexReader(void)
|
2978
|
+
{
|
2979
|
+
cIndexReader = rb_define_class_under(mIndex, "IndexReader", rb_cObject);
|
2980
|
+
rb_define_alloc_func(cIndexReader, frt_data_alloc);
|
2981
|
+
rb_define_method(cIndexReader, "initialize", frt_ir_init, 1);
|
2982
|
+
rb_define_method(cIndexReader, "set_norm", frt_ir_set_norm, 3);
|
2983
|
+
rb_define_method(cIndexReader, "norms", frt_ir_norms, 1);
|
2984
|
+
rb_define_method(cIndexReader, "get_norms_into",frt_ir_get_norms_into, 3);
|
2985
|
+
rb_define_method(cIndexReader, "commit", frt_ir_commit, 0);
|
2986
|
+
rb_define_method(cIndexReader, "close", frt_ir_close, 0);
|
2987
|
+
rb_define_method(cIndexReader, "has_deletions?",frt_ir_has_deletions, 0);
|
2988
|
+
rb_define_method(cIndexReader, "delete", frt_ir_delete, 1);
|
2989
|
+
rb_define_method(cIndexReader, "deleted?", frt_ir_is_deleted, 1);
|
2990
|
+
rb_define_method(cIndexReader, "max_doc", frt_ir_max_doc, 0);
|
2991
|
+
rb_define_method(cIndexReader, "num_docs", frt_ir_num_docs, 0);
|
2992
|
+
rb_define_method(cIndexReader, "undelete_all", frt_ir_undelete_all, 0);
|
2993
|
+
rb_define_method(cIndexReader, "latest?", frt_ir_is_latest, 0);
|
2994
|
+
rb_define_method(cIndexReader, "get_document", frt_ir_get_doc, 1);
|
2995
|
+
rb_define_method(cIndexReader, "[]", frt_ir_get_doc, 1);
|
2996
|
+
rb_define_method(cIndexReader, "term_vector", frt_ir_term_vector, 2);
|
2997
|
+
rb_define_method(cIndexReader, "term_vectors", frt_ir_term_vectors, 1);
|
2998
|
+
rb_define_method(cIndexReader, "term_docs", frt_ir_term_docs, 0);
|
2999
|
+
rb_define_method(cIndexReader, "term_positions",frt_ir_term_positions, 0);
|
3000
|
+
rb_define_method(cIndexReader, "term_docs_for", frt_ir_term_docs_for, 2);
|
3001
|
+
rb_define_method(cIndexReader, "term_positions_for", frt_ir_t_pos_for, 2);
|
3002
|
+
rb_define_method(cIndexReader, "doc_freq", frt_ir_doc_freq, 2);
|
3003
|
+
rb_define_method(cIndexReader, "terms", frt_ir_terms, 1);
|
3004
|
+
rb_define_method(cIndexReader, "terms_from", frt_ir_terms_from, 2);
|
3005
|
+
rb_define_method(cIndexReader, "field_names", frt_ir_field_names, 0);
|
3006
|
+
rb_define_method(cIndexReader, "field_infos", frt_ir_field_infos, 0);
|
3007
|
+
}
|
3008
|
+
|
3009
|
+
/* rdoc hack
|
3010
|
+
extern VALUE mFerret = rb_define_module("Ferret");
|
3011
|
+
*/
|
3012
|
+
|
3013
|
+
/*
|
3014
|
+
* Document-module: Ferret::Index
|
3015
|
+
*
|
3016
|
+
* == Summary
|
3017
|
+
*
|
3018
|
+
* The Index module contains all the classes used for adding to and
|
3019
|
+
* retrieving from the index. The important classes to know about are;
|
3020
|
+
*
|
3021
|
+
* * FieldInfo
|
3022
|
+
* * FieldInfos
|
3023
|
+
* * IndexWriter
|
3024
|
+
* * IndexReader
|
3025
|
+
* * LazyDoc
|
3026
|
+
*
|
3027
|
+
* The other classes in this module are useful for more advanced uses like
|
3028
|
+
* building tag clouds, creating more-like-this queries, custom highlighting
|
3029
|
+
* etc. They are also useful for index browsers.
|
3030
|
+
*/
|
3031
|
+
void
|
3032
|
+
Init_Index(void)
|
3033
|
+
{
|
3034
|
+
mIndex = rb_define_module_under(mFerret, "Index");
|
3035
|
+
|
3036
|
+
sym_boost = ID2SYM(rb_intern("boost"));
|
3037
|
+
sym_analyzer = ID2SYM(rb_intern("analyzer"));
|
3038
|
+
sym_close_dir = ID2SYM(rb_intern("close_dir"));
|
3039
|
+
|
3040
|
+
Init_TermVector();
|
3041
|
+
Init_TermEnum();
|
3042
|
+
Init_TermDocEnum();
|
3043
|
+
|
3044
|
+
Init_FieldInfos();
|
3045
|
+
|
3046
|
+
Init_LazyDoc();
|
3047
|
+
Init_IndexWriter();
|
3048
|
+
Init_IndexReader();
|
3049
|
+
}
|