ferret 0.3.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +9 -0
- data/Rakefile +51 -25
- data/ext/analysis.c +553 -0
- data/ext/analysis.h +76 -0
- data/ext/array.c +83 -0
- data/ext/array.h +19 -0
- data/ext/bitvector.c +164 -0
- data/ext/bitvector.h +29 -0
- data/ext/compound_io.c +335 -0
- data/ext/document.c +336 -0
- data/ext/document.h +87 -0
- data/ext/ferret.c +88 -47
- data/ext/ferret.h +43 -109
- data/ext/field.c +395 -0
- data/ext/filter.c +103 -0
- data/ext/fs_store.c +352 -0
- data/ext/global.c +219 -0
- data/ext/global.h +73 -0
- data/ext/hash.c +446 -0
- data/ext/hash.h +80 -0
- data/ext/hashset.c +141 -0
- data/ext/hashset.h +37 -0
- data/ext/helper.c +11 -0
- data/ext/helper.h +5 -0
- data/ext/inc/lang.h +41 -0
- data/ext/ind.c +389 -0
- data/ext/index.h +884 -0
- data/ext/index_io.c +269 -415
- data/ext/index_rw.c +2543 -0
- data/ext/lang.c +31 -0
- data/ext/lang.h +41 -0
- data/ext/priorityqueue.c +228 -0
- data/ext/priorityqueue.h +44 -0
- data/ext/q_boolean.c +1331 -0
- data/ext/q_const_score.c +154 -0
- data/ext/q_fuzzy.c +287 -0
- data/ext/q_match_all.c +142 -0
- data/ext/q_multi_phrase.c +343 -0
- data/ext/q_parser.c +2180 -0
- data/ext/q_phrase.c +657 -0
- data/ext/q_prefix.c +75 -0
- data/ext/q_range.c +247 -0
- data/ext/q_span.c +1566 -0
- data/ext/q_term.c +308 -0
- data/ext/q_wildcard.c +146 -0
- data/ext/r_analysis.c +255 -0
- data/ext/r_doc.c +578 -0
- data/ext/r_index_io.c +996 -0
- data/ext/r_qparser.c +158 -0
- data/ext/r_search.c +2321 -0
- data/ext/r_store.c +263 -0
- data/ext/r_term.c +219 -0
- data/ext/ram_store.c +447 -0
- data/ext/search.c +524 -0
- data/ext/search.h +1065 -0
- data/ext/similarity.c +143 -39
- data/ext/sort.c +661 -0
- data/ext/store.c +35 -0
- data/ext/store.h +152 -0
- data/ext/term.c +704 -143
- data/ext/termdocs.c +599 -0
- data/ext/vector.c +594 -0
- data/lib/ferret.rb +9 -10
- data/lib/ferret/analysis/analyzers.rb +2 -2
- data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
- data/lib/ferret/analysis/token.rb +14 -14
- data/lib/ferret/analysis/token_filters.rb +3 -3
- data/lib/ferret/document/field.rb +16 -17
- data/lib/ferret/index/document_writer.rb +4 -4
- data/lib/ferret/index/index.rb +39 -23
- data/lib/ferret/index/index_writer.rb +2 -2
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
- data/lib/ferret/index/segment_term_vector.rb +4 -4
- data/lib/ferret/index/term.rb +5 -1
- data/lib/ferret/index/term_vector_offset_info.rb +6 -6
- data/lib/ferret/index/term_vectors_io.rb +5 -5
- data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
- data/lib/ferret/search.rb +1 -1
- data/lib/ferret/search/boolean_query.rb +2 -1
- data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
- data/lib/ferret/search/fuzzy_query.rb +2 -1
- data/lib/ferret/search/index_searcher.rb +3 -0
- data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
- data/lib/ferret/search/multi_phrase_query.rb +6 -5
- data/lib/ferret/search/phrase_query.rb +3 -6
- data/lib/ferret/search/prefix_query.rb +4 -4
- data/lib/ferret/search/sort.rb +3 -1
- data/lib/ferret/search/sort_field.rb +9 -9
- data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
- data/lib/ferret/search/spans/span_near_query.rb +1 -1
- data/lib/ferret/search/spans/span_weight.rb +1 -1
- data/lib/ferret/search/spans/spans_enum.rb +7 -7
- data/lib/ferret/store/fs_store.rb +10 -6
- data/lib/ferret/store/ram_store.rb +3 -3
- data/lib/rferret.rb +36 -0
- data/test/functional/thread_safety_index_test.rb +2 -2
- data/test/test_helper.rb +16 -2
- data/test/unit/analysis/c_token.rb +25 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
- data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
- data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
- data/test/unit/document/c_field.rb +98 -0
- data/test/unit/document/tc_field.rb +0 -66
- data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
- data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
- data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
- data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
- data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
- data/test/unit/index/tc_segment_term_vector.rb +2 -2
- data/test/unit/index/tc_term_vectors_io.rb +4 -4
- data/test/unit/query_parser/c_query_parser.rb +138 -0
- data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
- data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
- data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
- data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
- data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
- data/test/unit/search/c_sort_field.rb +27 -0
- data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +7 -20
- data/test/unit/store/c_fs_store.rb +76 -0
- data/test/unit/store/c_ram_store.rb +35 -0
- data/test/unit/store/m_store.rb +34 -0
- data/test/unit/store/m_store_lock.rb +68 -0
- data/test/unit/store/tc_fs_store.rb +0 -53
- data/test/unit/store/tc_ram_store.rb +0 -20
- data/test/unit/store/tm_store.rb +0 -30
- data/test/unit/store/tm_store_lock.rb +0 -66
- metadata +84 -31
- data/ext/Makefile +0 -140
- data/ext/ferret_ext.so +0 -0
- data/ext/priority_queue.c +0 -232
- data/ext/ram_directory.c +0 -321
- data/ext/segment_merge_queue.c +0 -37
- data/ext/segment_term_enum.c +0 -326
- data/ext/string_helper.c +0 -42
- data/ext/tags +0 -344
- data/ext/term_buffer.c +0 -230
- data/ext/term_infos_reader.c +0 -54
- data/ext/terminfo.c +0 -160
- data/ext/token.c +0 -93
- data/ext/util.c +0 -12
data/ext/r_analysis.c
ADDED
@@ -0,0 +1,255 @@
|
|
1
|
+
#include "ferret.h"
|
2
|
+
#include "analysis.h"
|
3
|
+
|
4
|
+
/* Ruby class handles for the token/tokenizer bindings; assigned in
 * Init_analysis(). */
static VALUE cToken, cLetterTokenizer;

/* Ruby class handles for the analyzer bindings; assigned in
 * Init_analysis(). */
static VALUE cAnalyzer, cLetterAnalyzer, cWhiteSpaceAnalyzer, cStandardAnalyzer;
|
11
|
+
|
12
|
+
/****************************************************************************
|
13
|
+
*
|
14
|
+
* Token Methods
|
15
|
+
*
|
16
|
+
****************************************************************************/
|
17
|
+
|
18
|
+
typedef struct RToken {
|
19
|
+
VALUE text;
|
20
|
+
int start;
|
21
|
+
int end;
|
22
|
+
int pos_inc;
|
23
|
+
} RToken;
|
24
|
+
|
25
|
+
static void
|
26
|
+
frt_token_free(void *p)
|
27
|
+
{
|
28
|
+
free(p);
|
29
|
+
}
|
30
|
+
|
31
|
+
static void
|
32
|
+
frt_token_mark(void *p)
|
33
|
+
{
|
34
|
+
RToken *token = (RToken *)p;
|
35
|
+
rb_gc_mark(token->text);
|
36
|
+
}
|
37
|
+
|
38
|
+
static VALUE
|
39
|
+
frt_token_alloc(VALUE klass)
|
40
|
+
{
|
41
|
+
return Data_Wrap_Struct(klass, &frt_token_mark, &frt_token_free, ALLOC(RToken));
|
42
|
+
}
|
43
|
+
|
44
|
+
/* Declares a local `RToken *token` and loads it from `self` with
 * Data_Get_Struct.  The expansion already ends in ';', so writing `GET_TK;`
 * leaves one extra empty statement behind. */
#define GET_TK RToken *token; Data_Get_Struct(self, RToken, token);
|
45
|
+
static VALUE
|
46
|
+
frt_token_init(int argc, VALUE *argv, VALUE self)
|
47
|
+
{
|
48
|
+
GET_TK;
|
49
|
+
VALUE rtext, rstart, rend, rpos_inc, rtype;
|
50
|
+
token->pos_inc = 1;
|
51
|
+
switch (rb_scan_args(argc, argv, "32", &rtext, &rstart, &rend, &rpos_inc, &rtype)) {
|
52
|
+
case 5: /* type gets ignored at this stage */
|
53
|
+
case 4: token->pos_inc = FIX2INT(rpos_inc);
|
54
|
+
}
|
55
|
+
token->text = rb_obj_as_string(rtext);
|
56
|
+
token->start = FIX2INT(rstart);
|
57
|
+
token->end = FIX2INT(rend);
|
58
|
+
return self;
|
59
|
+
}
|
60
|
+
|
61
|
+
static VALUE
|
62
|
+
frt_token_cmp(VALUE self, VALUE rother)
|
63
|
+
{
|
64
|
+
RToken *other;
|
65
|
+
int cmp;
|
66
|
+
GET_TK;
|
67
|
+
Data_Get_Struct(rother, RToken, other);
|
68
|
+
if (token->start > other->start) {
|
69
|
+
cmp = 1;
|
70
|
+
} else if (token->start < other->start) {
|
71
|
+
cmp = -1;
|
72
|
+
} else {
|
73
|
+
if (token->end > other->end) {
|
74
|
+
cmp = 1;
|
75
|
+
} else if (token->end < other->end) {
|
76
|
+
cmp = -1;
|
77
|
+
} else {
|
78
|
+
cmp = strcmp(RSTRING(token->text)->ptr, RSTRING(other->text)->ptr);
|
79
|
+
}
|
80
|
+
}
|
81
|
+
return INT2FIX(cmp);
|
82
|
+
}
|
83
|
+
|
84
|
+
static VALUE
|
85
|
+
frt_token_get_text(VALUE self)
|
86
|
+
{
|
87
|
+
GET_TK;
|
88
|
+
return token->text;
|
89
|
+
}
|
90
|
+
|
91
|
+
static VALUE
|
92
|
+
frt_token_set_text(VALUE self, VALUE rtext)
|
93
|
+
{
|
94
|
+
GET_TK;
|
95
|
+
token->text = rtext;
|
96
|
+
return rtext;
|
97
|
+
}
|
98
|
+
|
99
|
+
static VALUE
|
100
|
+
frt_token_get_start_offset(VALUE self)
|
101
|
+
{
|
102
|
+
GET_TK;
|
103
|
+
return INT2FIX(token->start);
|
104
|
+
}
|
105
|
+
|
106
|
+
static VALUE
|
107
|
+
frt_token_get_end_offset(VALUE self)
|
108
|
+
{
|
109
|
+
GET_TK;
|
110
|
+
return INT2FIX(token->end);
|
111
|
+
}
|
112
|
+
|
113
|
+
static VALUE
|
114
|
+
frt_token_get_pos_inc(VALUE self)
|
115
|
+
{
|
116
|
+
GET_TK;
|
117
|
+
return INT2FIX(token->pos_inc);
|
118
|
+
}
|
119
|
+
|
120
|
+
static VALUE
|
121
|
+
frt_token_to_s(VALUE self)
|
122
|
+
{
|
123
|
+
GET_TK;
|
124
|
+
char *buf = alloca(RSTRING(token->text)->len + 80);
|
125
|
+
sprintf(buf, "token[\"%s\":%d:%d:%d]", RSTRING(token->text)->ptr, token->start,
|
126
|
+
token->end, token->pos_inc);
|
127
|
+
return rb_str_new2(buf);
|
128
|
+
}
|
129
|
+
|
130
|
+
/****************************************************************************
|
131
|
+
*
|
132
|
+
* Tokenizer Methods
|
133
|
+
*
|
134
|
+
****************************************************************************/
|
135
|
+
|
136
|
+
static void
|
137
|
+
frt_tokenizer_free(void *p)
|
138
|
+
{
|
139
|
+
TokenStream *ts = (TokenStream *)p;
|
140
|
+
object_del(p);
|
141
|
+
ts->destroy(ts);
|
142
|
+
}
|
143
|
+
|
144
|
+
static VALUE
|
145
|
+
frt_letter_tokenizer_init(VALUE self, VALUE rstr)
|
146
|
+
{
|
147
|
+
TokenStream *ts = letter_tokenizer_create();
|
148
|
+
Frt_Wrap_Struct(self, NULL, &frt_tokenizer_free, ts);
|
149
|
+
return self;
|
150
|
+
}
|
151
|
+
|
152
|
+
/****************************************************************************
|
153
|
+
*
|
154
|
+
* Analyzer Methods
|
155
|
+
*
|
156
|
+
****************************************************************************/
|
157
|
+
|
158
|
+
static void
|
159
|
+
frt_analyzer_free(void *p)
|
160
|
+
{
|
161
|
+
Analyzer *a = (Analyzer *)p;
|
162
|
+
object_del(a);
|
163
|
+
a->destroy(a);
|
164
|
+
}
|
165
|
+
|
166
|
+
VALUE
|
167
|
+
frt_get_analyzer(Analyzer *a)
|
168
|
+
{
|
169
|
+
VALUE self = Data_Wrap_Struct(cAnalyzer, NULL, &frt_analyzer_free, a);
|
170
|
+
object_add(a, self);
|
171
|
+
return self;
|
172
|
+
}
|
173
|
+
|
174
|
+
/*** WhiteSpaceAnalyzer ***/
|
175
|
+
static VALUE
|
176
|
+
frt_white_space_analyzer_init(VALUE self)
|
177
|
+
{
|
178
|
+
Analyzer *a = whitespace_analyzer_create();
|
179
|
+
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
180
|
+
object_add(a, self);
|
181
|
+
return self;
|
182
|
+
}
|
183
|
+
|
184
|
+
/*** LetterAnalyzer ***/
|
185
|
+
static VALUE
|
186
|
+
frt_letter_analyzer_init(VALUE self)
|
187
|
+
{
|
188
|
+
Analyzer *a = letter_analyzer_create();
|
189
|
+
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
190
|
+
object_add(a, self);
|
191
|
+
return self;
|
192
|
+
}
|
193
|
+
|
194
|
+
/*** StandardAnalyzer ***/
|
195
|
+
static VALUE
|
196
|
+
frt_standard_analyzer_init(VALUE self)
|
197
|
+
{
|
198
|
+
Analyzer *a = standard_analyzer_create();
|
199
|
+
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
200
|
+
object_add(a, self);
|
201
|
+
return self;
|
202
|
+
}
|
203
|
+
|
204
|
+
/****************************************************************************
|
205
|
+
*
|
206
|
+
* Init Function
|
207
|
+
*
|
208
|
+
****************************************************************************/
|
209
|
+
|
210
|
+
void
|
211
|
+
Init_analysis(void)
|
212
|
+
{
|
213
|
+
cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
|
214
|
+
rb_define_alloc_func(cToken, frt_token_alloc);
|
215
|
+
rb_include_module(cToken, rb_mComparable);
|
216
|
+
|
217
|
+
rb_define_method(cToken, "initialize", frt_token_init, -1);
|
218
|
+
rb_define_method(cToken, "<=>", frt_token_cmp, 1);
|
219
|
+
rb_define_method(cToken, "text", frt_token_get_text, 0);
|
220
|
+
rb_define_method(cToken, "text=", frt_token_set_text, 1);
|
221
|
+
rb_define_method(cToken, "start_offset", frt_token_get_start_offset, 0);
|
222
|
+
rb_define_method(cToken, "end_offset", frt_token_get_end_offset, 0);
|
223
|
+
rb_define_method(cToken, "pos_inc", frt_token_get_pos_inc, 0);
|
224
|
+
rb_define_method(cToken, "to_s", frt_token_to_s, 0);
|
225
|
+
|
226
|
+
cLetterTokenizer =
|
227
|
+
rb_define_class_under(mAnalysis, "LetterTokenizer", rb_cObject);
|
228
|
+
rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
|
229
|
+
rb_define_method(cLetterTokenizer, "initialize",
|
230
|
+
frt_letter_tokenizer_init, 1);
|
231
|
+
|
232
|
+
cAnalyzer =
|
233
|
+
rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
|
234
|
+
rb_define_alloc_func(cAnalyzer, frt_data_alloc);
|
235
|
+
rb_define_method(cAnalyzer, "initialize",
|
236
|
+
frt_letter_analyzer_init, 0);
|
237
|
+
|
238
|
+
cLetterAnalyzer =
|
239
|
+
rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
|
240
|
+
rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
|
241
|
+
rb_define_method(cAnalyzer, "initialize",
|
242
|
+
frt_letter_analyzer_init, 0);
|
243
|
+
|
244
|
+
cWhiteSpaceAnalyzer =
|
245
|
+
rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
|
246
|
+
rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
|
247
|
+
rb_define_method(cWhiteSpaceAnalyzer, "initialize",
|
248
|
+
frt_white_space_analyzer_init, 0);
|
249
|
+
|
250
|
+
cStandardAnalyzer =
|
251
|
+
rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
|
252
|
+
rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
|
253
|
+
rb_define_method(cStandardAnalyzer, "initialize",
|
254
|
+
frt_standard_analyzer_init, 0);
|
255
|
+
}
|
data/ext/r_doc.c
ADDED
@@ -0,0 +1,578 @@
|
|
1
|
+
#include "ferret.h"
|
2
|
+
#include "document.h"
|
3
|
+
|
4
|
+
/* Ruby class handles for Document and Field; assigned in Init_doc(). */
VALUE cDocument, cField;

/* Ruby class handles for the Field::Store, Field::Index and
 * Field::TermVector constant holders; assigned in Init_doc(). */
VALUE cFieldStore, cFieldIndex, cFieldTermVector;
|
9
|
+
|
10
|
+
/****************************************************************************
|
11
|
+
*
|
12
|
+
* Field Methods
|
13
|
+
*
|
14
|
+
****************************************************************************/
|
15
|
+
|
16
|
+
void
|
17
|
+
frt_field_free(void *p)
|
18
|
+
{
|
19
|
+
object_del(p);
|
20
|
+
df_destroy_data(p);
|
21
|
+
}
|
22
|
+
|
23
|
+
static VALUE
|
24
|
+
frt_field_alloc(VALUE klass)
|
25
|
+
{
|
26
|
+
VALUE self;
|
27
|
+
DocField *df = ALLOC(DocField);
|
28
|
+
df->name = NULL;
|
29
|
+
df->data = NULL;
|
30
|
+
self = Data_Wrap_Struct(klass, NULL, &frt_field_free, df);
|
31
|
+
object_add(df, self);
|
32
|
+
return self;
|
33
|
+
}
|
34
|
+
|
35
|
+
/* Declares a local `DocField *df` and loads it from `self` with
 * Data_Get_Struct. */
#define GET_DF DocField *df; Data_Get_Struct(self, DocField, df)
|
36
|
+
static VALUE
|
37
|
+
frt_field_init(int argc, VALUE *argv, VALUE self)
|
38
|
+
{
|
39
|
+
GET_DF;
|
40
|
+
VALUE rname, rdata, rstored, rindexed, rstore_tv, rbinary, rboost;
|
41
|
+
float boost = 1.0;
|
42
|
+
int stored = 0, indexed = 0, store_tv = 0;
|
43
|
+
bool binary = false;
|
44
|
+
switch (rb_scan_args(argc, argv, "25", &rname, &rdata, &rstored,
|
45
|
+
&rindexed, &rstore_tv, &rbinary, &rboost)) {
|
46
|
+
case 7: boost = (float)rb_num2dbl(rboost);
|
47
|
+
case 6: binary = RTEST(rbinary);
|
48
|
+
case 5: store_tv = FIX2INT(rstore_tv);
|
49
|
+
case 4: indexed = FIX2INT(rindexed);
|
50
|
+
case 3: stored = FIX2INT(rstored);
|
51
|
+
case 2:
|
52
|
+
rname = rb_obj_as_string(rname);
|
53
|
+
rdata = rb_obj_as_string(rdata);
|
54
|
+
break;
|
55
|
+
}
|
56
|
+
char *name = RSTRING(rname)->ptr;
|
57
|
+
int len = RSTRING(rdata)->len;
|
58
|
+
char *data = ALLOC_N(char, len + 1);
|
59
|
+
MEMCPY(data, RSTRING(rdata)->ptr, char, len);
|
60
|
+
data[len] = 0;
|
61
|
+
df_set(df, name, data, stored, indexed, store_tv);
|
62
|
+
df->blen = len;
|
63
|
+
df->is_binary = binary;
|
64
|
+
df->boost = boost;
|
65
|
+
return Qnil;
|
66
|
+
}
|
67
|
+
|
68
|
+
static VALUE
|
69
|
+
frt_field_get_name(VALUE self)
|
70
|
+
{
|
71
|
+
GET_DF;
|
72
|
+
return rb_str_new2(df->name);
|
73
|
+
}
|
74
|
+
|
75
|
+
static VALUE
|
76
|
+
frt_field_set_name(VALUE self, VALUE rname)
|
77
|
+
{
|
78
|
+
int len;
|
79
|
+
GET_DF;
|
80
|
+
rname = rb_obj_as_string(rname);
|
81
|
+
len = RSTRING(rname)->len;
|
82
|
+
REALLOC_N(df->name, char, len);
|
83
|
+
MEMCPY(df->name, RSTRING(rname)->ptr, char, len);
|
84
|
+
return Qnil;
|
85
|
+
}
|
86
|
+
|
87
|
+
static VALUE
|
88
|
+
frt_field_get_data(VALUE self)
|
89
|
+
{
|
90
|
+
GET_DF;
|
91
|
+
return rb_str_new(df->data, df->blen);
|
92
|
+
}
|
93
|
+
|
94
|
+
static VALUE
|
95
|
+
frt_field_set_data(VALUE self, VALUE rdata)
|
96
|
+
{
|
97
|
+
int len;
|
98
|
+
GET_DF;
|
99
|
+
rdata = rb_obj_as_string(rdata);
|
100
|
+
len = RSTRING(rdata)->len;
|
101
|
+
REALLOC_N(df->data, char, len);
|
102
|
+
MEMCPY(df->data, RSTRING(rdata)->ptr, char, len);
|
103
|
+
df->blen = len;
|
104
|
+
return Qnil;
|
105
|
+
}
|
106
|
+
|
107
|
+
static VALUE
|
108
|
+
frt_field_get_boost(VALUE self)
|
109
|
+
{
|
110
|
+
GET_DF;
|
111
|
+
return rb_float_new((double)df->boost);
|
112
|
+
}
|
113
|
+
|
114
|
+
static VALUE
|
115
|
+
frt_field_set_boost(VALUE self, VALUE rboost)
|
116
|
+
{
|
117
|
+
GET_DF;
|
118
|
+
df->boost = (float)rb_num2dbl(rboost);
|
119
|
+
return Qnil;
|
120
|
+
}
|
121
|
+
|
122
|
+
static VALUE
|
123
|
+
frt_field_is_stored(VALUE self)
|
124
|
+
{
|
125
|
+
GET_DF;
|
126
|
+
return df->is_stored ? Qtrue : Qfalse;
|
127
|
+
}
|
128
|
+
|
129
|
+
static VALUE
|
130
|
+
frt_field_is_indexed(VALUE self)
|
131
|
+
{
|
132
|
+
GET_DF;
|
133
|
+
return df->is_indexed ? Qtrue : Qfalse;
|
134
|
+
}
|
135
|
+
|
136
|
+
static VALUE
|
137
|
+
frt_field_is_tokenized(VALUE self)
|
138
|
+
{
|
139
|
+
GET_DF;
|
140
|
+
return df->is_tokenized ? Qtrue : Qfalse;
|
141
|
+
}
|
142
|
+
|
143
|
+
static VALUE
|
144
|
+
frt_field_is_binary(VALUE self)
|
145
|
+
{
|
146
|
+
GET_DF;
|
147
|
+
return df->is_binary ? Qtrue : Qfalse;
|
148
|
+
}
|
149
|
+
|
150
|
+
static VALUE
|
151
|
+
frt_field_is_compressed(VALUE self)
|
152
|
+
{
|
153
|
+
GET_DF;
|
154
|
+
return df->is_compressed ? Qtrue : Qfalse;
|
155
|
+
}
|
156
|
+
|
157
|
+
static VALUE
|
158
|
+
frt_field_store_tv(VALUE self)
|
159
|
+
{
|
160
|
+
GET_DF;
|
161
|
+
return df->store_tv ? Qtrue : Qfalse;
|
162
|
+
}
|
163
|
+
|
164
|
+
static VALUE
|
165
|
+
frt_field_store_pos(VALUE self)
|
166
|
+
{
|
167
|
+
GET_DF;
|
168
|
+
return df->store_pos ? Qtrue : Qfalse;
|
169
|
+
}
|
170
|
+
|
171
|
+
static VALUE
|
172
|
+
frt_field_store_offset(VALUE self)
|
173
|
+
{
|
174
|
+
GET_DF;
|
175
|
+
return df->store_offset ? Qtrue : Qfalse;
|
176
|
+
}
|
177
|
+
|
178
|
+
static VALUE
|
179
|
+
frt_field_omit_norms(VALUE self)
|
180
|
+
{
|
181
|
+
GET_DF;
|
182
|
+
return df->omit_norms ? Qtrue : Qfalse;
|
183
|
+
}
|
184
|
+
|
185
|
+
static VALUE
|
186
|
+
frt_field_to_s(VALUE self)
|
187
|
+
{
|
188
|
+
VALUE rstr;
|
189
|
+
char *str;
|
190
|
+
GET_DF;
|
191
|
+
|
192
|
+
str = df_to_s(df);
|
193
|
+
rstr = rb_str_new2(str);
|
194
|
+
free(str);
|
195
|
+
return rstr;
|
196
|
+
}
|
197
|
+
|
198
|
+
static VALUE
|
199
|
+
frt_field_new_binary(VALUE klass, VALUE rname, VALUE rdata, VALUE rstore)
|
200
|
+
{
|
201
|
+
char *data;
|
202
|
+
int len;
|
203
|
+
DocField *df;
|
204
|
+
int store = FIX2INT(rstore);
|
205
|
+
rname = rb_obj_as_string(rname);
|
206
|
+
rdata = rb_obj_as_string(rdata);
|
207
|
+
len = RSTRING(rdata)->len;
|
208
|
+
data = ALLOC_N(char, len);
|
209
|
+
MEMCPY(data, RSTRING(rdata)->ptr, char, len);
|
210
|
+
|
211
|
+
df = df_create_binary(RSTRING(rname)->ptr, data, len, store);
|
212
|
+
return Data_Wrap_Struct(klass, NULL, &df_destroy_data, df);
|
213
|
+
}
|
214
|
+
|
215
|
+
static VALUE
|
216
|
+
frt_field_set_store(VALUE self, VALUE rstore)
|
217
|
+
{
|
218
|
+
GET_DF;
|
219
|
+
int store = FIX2INT(rstore);
|
220
|
+
df_set_store(df, store);
|
221
|
+
return Qnil;
|
222
|
+
}
|
223
|
+
|
224
|
+
static VALUE
|
225
|
+
frt_field_set_term_vector(VALUE self, VALUE rterm_vector)
|
226
|
+
{
|
227
|
+
GET_DF;
|
228
|
+
int term_vector = FIX2INT(rterm_vector);
|
229
|
+
df_set_term_vector(df, term_vector);
|
230
|
+
return Qnil;
|
231
|
+
}
|
232
|
+
|
233
|
+
static VALUE
|
234
|
+
frt_field_set_index(VALUE self, VALUE rindex)
|
235
|
+
{
|
236
|
+
GET_DF;
|
237
|
+
int index = FIX2INT(rindex);
|
238
|
+
df_set_index(df, index);
|
239
|
+
return Qnil;
|
240
|
+
}
|
241
|
+
|
242
|
+
/****************************************************************************
|
243
|
+
*
|
244
|
+
* Document Methods
|
245
|
+
*
|
246
|
+
****************************************************************************/
|
247
|
+
|
248
|
+
void
|
249
|
+
frt_doc_free(void *p)
|
250
|
+
{
|
251
|
+
object_del(p);
|
252
|
+
doc_destroy(p);
|
253
|
+
}
|
254
|
+
|
255
|
+
void
|
256
|
+
frt_doc_mark(void *p)
|
257
|
+
{
|
258
|
+
int i;
|
259
|
+
DocField *df;
|
260
|
+
Document *doc = (Document *)p;
|
261
|
+
for (i = 0; i < doc->dfcnt; i++) {
|
262
|
+
df = doc->df_arr[i];
|
263
|
+
frt_gc_mark(df);
|
264
|
+
}
|
265
|
+
}
|
266
|
+
|
267
|
+
static VALUE
|
268
|
+
frt_doc_alloc(VALUE klass)
|
269
|
+
{
|
270
|
+
Document *doc = doc_create();
|
271
|
+
doc->free_data = NULL;
|
272
|
+
VALUE self = Data_Wrap_Struct(klass, &frt_doc_mark, &frt_doc_free, doc);
|
273
|
+
object_add(doc, self);
|
274
|
+
return self;
|
275
|
+
}
|
276
|
+
|
277
|
+
VALUE
|
278
|
+
frt_get_doc(Document *doc)
|
279
|
+
{
|
280
|
+
VALUE rfield, self;
|
281
|
+
DocField *df;
|
282
|
+
int i;
|
283
|
+
HshEntry *he;
|
284
|
+
if (!doc || (self = object_get(doc)) != Qnil) return Qnil;
|
285
|
+
|
286
|
+
doc->free_data = NULL;
|
287
|
+
/* Set all fields to not free their data */
|
288
|
+
for (i = 0; i <= doc->fields->mask; i++) {
|
289
|
+
he = &doc->fields->table[i];
|
290
|
+
if (he->key != NULL && he->key != dummy_key) {
|
291
|
+
((Array *)he->value)->free_elem = NULL;
|
292
|
+
}
|
293
|
+
}
|
294
|
+
for (i = 0; i < doc->dfcnt; i++) {
|
295
|
+
df = doc->df_arr[i];
|
296
|
+
rfield = Data_Wrap_Struct(cField, NULL, &frt_field_free, df);
|
297
|
+
object_add(df, rfield);
|
298
|
+
}
|
299
|
+
self = Data_Wrap_Struct(cDocument, &frt_doc_mark, &frt_doc_free, doc);
|
300
|
+
object_add(doc, self);
|
301
|
+
return self;
|
302
|
+
}
|
303
|
+
|
304
|
+
/* Declares a local `Document *doc` and loads it from `self` with
 * Data_Get_Struct. */
#define GET_DOC Document *doc; Data_Get_Struct(self, Document, doc)
|
305
|
+
static VALUE
|
306
|
+
frt_doc_init(VALUE self)
|
307
|
+
{
|
308
|
+
return self;
|
309
|
+
}
|
310
|
+
|
311
|
+
static VALUE
|
312
|
+
frt_doc_all_fields(VALUE self)
|
313
|
+
{
|
314
|
+
int i;
|
315
|
+
GET_DOC;
|
316
|
+
VALUE values = rb_ary_new2(doc->dfcnt);
|
317
|
+
for (i = 0; i < doc->dfcnt; i++) {
|
318
|
+
rb_ary_push(values, object_get(doc->df_arr[i]));
|
319
|
+
}
|
320
|
+
return values;
|
321
|
+
}
|
322
|
+
|
323
|
+
static VALUE
|
324
|
+
frt_doc_field_count(VALUE self)
|
325
|
+
{
|
326
|
+
GET_DOC;
|
327
|
+
return INT2FIX(doc->fcnt);
|
328
|
+
}
|
329
|
+
|
330
|
+
static VALUE
|
331
|
+
frt_doc_entry_count(VALUE self)
|
332
|
+
{
|
333
|
+
GET_DOC;
|
334
|
+
return INT2FIX(doc->dfcnt);
|
335
|
+
}
|
336
|
+
|
337
|
+
static VALUE
|
338
|
+
frt_doc_add_field(VALUE self, VALUE rfield)
|
339
|
+
{
|
340
|
+
DocField *df;
|
341
|
+
GET_DOC;
|
342
|
+
Data_Get_Struct(rfield, DocField, df);
|
343
|
+
doc_add_field(doc, df);
|
344
|
+
return Qnil;
|
345
|
+
}
|
346
|
+
|
347
|
+
/* TODO: return the removed fields as an array */
|
348
|
+
static VALUE
|
349
|
+
frt_doc_remove_fields(VALUE self, VALUE rname)
|
350
|
+
{
|
351
|
+
Array *fields;
|
352
|
+
GET_DOC;
|
353
|
+
rname = rb_obj_as_string(rname);
|
354
|
+
fields = doc_remove_fields(doc, RSTRING(rname)->ptr);
|
355
|
+
ary_destroy(fields);
|
356
|
+
return Qnil;
|
357
|
+
}
|
358
|
+
|
359
|
+
static VALUE
|
360
|
+
frt_doc_remove_field(VALUE self, VALUE rname)
|
361
|
+
{
|
362
|
+
DocField *df;
|
363
|
+
GET_DOC;
|
364
|
+
rname = rb_obj_as_string(rname);
|
365
|
+
df = doc_remove_field(doc, RSTRING(rname)->ptr);
|
366
|
+
return object_get(df);
|
367
|
+
}
|
368
|
+
|
369
|
+
static VALUE
|
370
|
+
frt_doc_field(VALUE self, VALUE rname)
|
371
|
+
{
|
372
|
+
GET_DOC;
|
373
|
+
DocField *df;
|
374
|
+
rname = rb_obj_as_string(rname);
|
375
|
+
df = doc_get_field(doc, RSTRING(rname)->ptr);
|
376
|
+
return object_get(df);
|
377
|
+
}
|
378
|
+
|
379
|
+
static VALUE
|
380
|
+
frt_doc_fields(VALUE self, VALUE rname)
|
381
|
+
{
|
382
|
+
int i;
|
383
|
+
VALUE fields;
|
384
|
+
GET_DOC;
|
385
|
+
Array *dfs;
|
386
|
+
rname = rb_obj_as_string(rname);
|
387
|
+
dfs = doc_get_fields(doc, RSTRING(rname)->ptr);
|
388
|
+
if (!dfs) return Qnil;
|
389
|
+
fields = rb_ary_new2(dfs->size);
|
390
|
+
for (i = 0; i < dfs->size; i++) {
|
391
|
+
rb_ary_push(fields, object_get(dfs->elems[i]));
|
392
|
+
}
|
393
|
+
|
394
|
+
return fields;
|
395
|
+
}
|
396
|
+
|
397
|
+
static VALUE
|
398
|
+
frt_doc_values(VALUE self, VALUE rname)
|
399
|
+
{
|
400
|
+
int i, len = 0, vindex = 0;
|
401
|
+
VALUE rvalues;
|
402
|
+
char *values = NULL;
|
403
|
+
GET_DOC;
|
404
|
+
Array *dfs;
|
405
|
+
DocField *df;
|
406
|
+
rname = rb_obj_as_string(rname);
|
407
|
+
dfs = doc_get_fields(doc, RSTRING(rname)->ptr);
|
408
|
+
if (!dfs) return Qnil;
|
409
|
+
|
410
|
+
for (i = 0; i < dfs->size; i++) {
|
411
|
+
df = (DocField *)dfs->elems[i];
|
412
|
+
if (df->is_binary) continue;
|
413
|
+
len += df->blen + 1;
|
414
|
+
REALLOC_N(values, char, len);
|
415
|
+
MEMCPY(values + vindex, df->data, char, df->blen);
|
416
|
+
vindex = len;
|
417
|
+
values[vindex-1] = ' ';
|
418
|
+
}
|
419
|
+
if (len) {
|
420
|
+
values[len-1] = '\0';
|
421
|
+
rvalues = rb_str_new(values, len-1);
|
422
|
+
free(values);
|
423
|
+
} else {
|
424
|
+
rvalues = Qnil;
|
425
|
+
}
|
426
|
+
|
427
|
+
return rvalues;
|
428
|
+
}
|
429
|
+
|
430
|
+
static VALUE
|
431
|
+
frt_doc_binaries(VALUE self, VALUE rname)
|
432
|
+
{
|
433
|
+
int i;
|
434
|
+
VALUE rvalues;
|
435
|
+
GET_DOC;
|
436
|
+
Array *dfs;
|
437
|
+
DocField *df;
|
438
|
+
rname = rb_obj_as_string(rname);
|
439
|
+
dfs = doc_get_fields(doc, RSTRING(rname)->ptr);
|
440
|
+
if (!dfs) return Qnil;
|
441
|
+
|
442
|
+
rvalues = rb_ary_new2(dfs->size);
|
443
|
+
for (i = 0; i < dfs->size; i++) {
|
444
|
+
df = (DocField *)dfs->elems[i];
|
445
|
+
if (!df->is_binary) continue;
|
446
|
+
rb_ary_push(rvalues, rb_str_new(df->data, df->blen));
|
447
|
+
}
|
448
|
+
return rvalues;
|
449
|
+
}
|
450
|
+
|
451
|
+
static VALUE
|
452
|
+
frt_doc_set(VALUE self, VALUE rname, VALUE rdata)
|
453
|
+
{
|
454
|
+
DocField *df;
|
455
|
+
GET_DOC;
|
456
|
+
VALUE rfield;
|
457
|
+
rname = rb_obj_as_string(rname);
|
458
|
+
rdata = rb_obj_as_string(rdata);
|
459
|
+
|
460
|
+
df = doc_get_field(doc, RSTRING(rname)->ptr);
|
461
|
+
if (df) {
|
462
|
+
free(df->data);
|
463
|
+
df->data = estrdup(RSTRING(rdata)->ptr);
|
464
|
+
rfield = object_get(df);
|
465
|
+
} else {
|
466
|
+
rfield = rb_funcall(cField, id_new, 2, rname, rdata);
|
467
|
+
Data_Get_Struct(rfield, DocField, df);
|
468
|
+
doc_add_field(doc, df);
|
469
|
+
}
|
470
|
+
return rfield;
|
471
|
+
}
|
472
|
+
static VALUE
|
473
|
+
frt_doc_to_s(VALUE self)
|
474
|
+
{
|
475
|
+
char *str;
|
476
|
+
VALUE rstr;
|
477
|
+
GET_DOC;
|
478
|
+
str = doc_to_s(doc);
|
479
|
+
rstr = rb_str_new2(str);
|
480
|
+
free(str);
|
481
|
+
return rstr;
|
482
|
+
}
|
483
|
+
|
484
|
+
static VALUE
|
485
|
+
frt_doc_get_boost(VALUE self)
|
486
|
+
{
|
487
|
+
GET_DOC;
|
488
|
+
return rb_float_new((double)doc->boost);
|
489
|
+
}
|
490
|
+
|
491
|
+
static VALUE
|
492
|
+
frt_doc_set_boost(VALUE self, VALUE rboost)
|
493
|
+
{
|
494
|
+
GET_DOC;
|
495
|
+
doc->boost = (float)rb_num2dbl(rboost);
|
496
|
+
return Qnil;
|
497
|
+
}
|
498
|
+
|
499
|
+
|
500
|
+
/****************************************************************************
|
501
|
+
*
|
502
|
+
* Init Function
|
503
|
+
*
|
504
|
+
****************************************************************************/
|
505
|
+
|
506
|
+
void
|
507
|
+
Init_doc(void)
|
508
|
+
{
|
509
|
+
/* Field */
|
510
|
+
cField = rb_define_class_under(mDocument, "Field", rb_cObject);
|
511
|
+
rb_define_alloc_func(cField, frt_field_alloc);
|
512
|
+
|
513
|
+
rb_define_method(cField, "initialize", frt_field_init, -1);
|
514
|
+
rb_define_singleton_method(cField, "new_binary_field",
|
515
|
+
frt_field_new_binary, 3);
|
516
|
+
rb_define_method(cField, "name", frt_field_get_name, 0);
|
517
|
+
rb_define_method(cField, "name=", frt_field_set_name, 1);
|
518
|
+
rb_define_method(cField, "data", frt_field_get_data, 0);
|
519
|
+
rb_define_method(cField, "data=", frt_field_set_data, 1);
|
520
|
+
rb_define_method(cField, "boost", frt_field_get_boost, 0);
|
521
|
+
rb_define_method(cField, "boost=", frt_field_set_boost, 1);
|
522
|
+
rb_define_method(cField, "stored?", frt_field_is_stored, 0);
|
523
|
+
rb_define_method(cField, "indexed?", frt_field_is_indexed, 0);
|
524
|
+
rb_define_method(cField, "tokenized?", frt_field_is_tokenized, 0);
|
525
|
+
rb_define_method(cField, "binary?", frt_field_is_binary, 0);
|
526
|
+
rb_define_method(cField, "compressed?", frt_field_is_compressed, 0);
|
527
|
+
rb_define_method(cField, "store_term_vector?", frt_field_store_tv, 0);
|
528
|
+
rb_define_method(cField, "store_positions?", frt_field_store_pos, 0);
|
529
|
+
rb_define_method(cField, "store_offsets?", frt_field_store_offset, 0);
|
530
|
+
rb_define_method(cField, "omit_norms?", frt_field_omit_norms, 0);
|
531
|
+
rb_define_method(cField, "to_s", frt_field_to_s, 0);
|
532
|
+
rb_define_method(cField, "store=", frt_field_set_store, 1);
|
533
|
+
rb_define_method(cField, "index=", frt_field_set_index, 1);
|
534
|
+
rb_define_method(cField, "term_vector=", frt_field_set_term_vector, 1);
|
535
|
+
|
536
|
+
/* Field Constants */
|
537
|
+
cFieldStore = rb_define_class_under(cField, "Store", rb_cObject);
|
538
|
+
rb_define_const(cFieldStore, "YES", INT2FIX(DF_STORE_YES));
|
539
|
+
rb_define_const(cFieldStore, "NO", INT2FIX(DF_STORE_NO));
|
540
|
+
rb_define_const(cFieldStore, "COMPRESS", INT2FIX(DF_STORE_COMPRESS));
|
541
|
+
cFieldIndex = rb_define_class_under(cField, "Index", rb_cObject);
|
542
|
+
rb_define_const(cFieldIndex, "UNTOKENIZED", INT2FIX(DF_INDEX_UNTOKENIZED));
|
543
|
+
rb_define_const(cFieldIndex, "TOKENIZED", INT2FIX(DF_INDEX_TOKENIZED));
|
544
|
+
rb_define_const(cFieldIndex, "NO", INT2FIX(DF_INDEX_NO));
|
545
|
+
rb_define_const(cFieldIndex, "NO_NORMS", INT2FIX(DF_INDEX_NO_NORMS));
|
546
|
+
cFieldTermVector = rb_define_class_under(cField, "TermVector", rb_cObject);
|
547
|
+
rb_define_const(cFieldTermVector, "NO", INT2FIX(DF_TERM_VECTOR_NO));
|
548
|
+
rb_define_const(cFieldTermVector, "YES", INT2FIX(DF_TERM_VECTOR_YES));
|
549
|
+
rb_define_const(cFieldTermVector, "WITH_POSITIONS",
|
550
|
+
INT2FIX(DF_TERM_VECTOR_WITH_POSITIONS));
|
551
|
+
rb_define_const(cFieldTermVector, "WITH_OFFSETS",
|
552
|
+
INT2FIX(DF_TERM_VECTOR_WITH_OFFSETS));
|
553
|
+
rb_define_const(cFieldTermVector, "WITH_POSITIONS_OFFSETS",
|
554
|
+
INT2FIX(DF_TERM_VECTOR_WITH_POSITIONS_OFFSETS));
|
555
|
+
|
556
|
+
/* Document */
|
557
|
+
cDocument = rb_define_class_under(mDocument, "Document", rb_cObject);
|
558
|
+
rb_define_alloc_func(cDocument, frt_doc_alloc);
|
559
|
+
|
560
|
+
rb_define_method(cDocument, "initialize", frt_doc_init, 0);
|
561
|
+
rb_define_method(cDocument, "all_fields", frt_doc_all_fields, 0);
|
562
|
+
rb_define_method(cDocument, "field_count", frt_doc_field_count, 0);
|
563
|
+
rb_define_method(cDocument, "entry_count", frt_doc_entry_count, 0);
|
564
|
+
rb_define_method(cDocument, "add_field", frt_doc_add_field, 1);
|
565
|
+
rb_define_method(cDocument, "<<", frt_doc_add_field, 1);
|
566
|
+
rb_define_method(cDocument, "remove_fields", frt_doc_remove_fields, 1);
|
567
|
+
rb_define_method(cDocument, "remove_field", frt_doc_remove_field, 1);
|
568
|
+
rb_define_method(cDocument, "field", frt_doc_field, 1);
|
569
|
+
rb_define_method(cDocument, "fields", frt_doc_fields, 1);
|
570
|
+
rb_define_method(cDocument, "values", frt_doc_values, 1);
|
571
|
+
rb_define_method(cDocument, "binaries", frt_doc_binaries, 1);
|
572
|
+
rb_define_method(cDocument, "[]", frt_doc_values, 1);
|
573
|
+
rb_define_method(cDocument, "set", frt_doc_set, 2);
|
574
|
+
rb_define_method(cDocument, "[]=", frt_doc_set, 2);
|
575
|
+
rb_define_method(cDocument, "to_s", frt_doc_to_s, 0);
|
576
|
+
rb_define_method(cDocument, "boost", frt_doc_get_boost, 0);
|
577
|
+
rb_define_method(cDocument, "boost=", frt_doc_set_boost, 1);
|
578
|
+
}
|