ferret 0.3.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +9 -0
- data/Rakefile +51 -25
- data/ext/analysis.c +553 -0
- data/ext/analysis.h +76 -0
- data/ext/array.c +83 -0
- data/ext/array.h +19 -0
- data/ext/bitvector.c +164 -0
- data/ext/bitvector.h +29 -0
- data/ext/compound_io.c +335 -0
- data/ext/document.c +336 -0
- data/ext/document.h +87 -0
- data/ext/ferret.c +88 -47
- data/ext/ferret.h +43 -109
- data/ext/field.c +395 -0
- data/ext/filter.c +103 -0
- data/ext/fs_store.c +352 -0
- data/ext/global.c +219 -0
- data/ext/global.h +73 -0
- data/ext/hash.c +446 -0
- data/ext/hash.h +80 -0
- data/ext/hashset.c +141 -0
- data/ext/hashset.h +37 -0
- data/ext/helper.c +11 -0
- data/ext/helper.h +5 -0
- data/ext/inc/lang.h +41 -0
- data/ext/ind.c +389 -0
- data/ext/index.h +884 -0
- data/ext/index_io.c +269 -415
- data/ext/index_rw.c +2543 -0
- data/ext/lang.c +31 -0
- data/ext/lang.h +41 -0
- data/ext/priorityqueue.c +228 -0
- data/ext/priorityqueue.h +44 -0
- data/ext/q_boolean.c +1331 -0
- data/ext/q_const_score.c +154 -0
- data/ext/q_fuzzy.c +287 -0
- data/ext/q_match_all.c +142 -0
- data/ext/q_multi_phrase.c +343 -0
- data/ext/q_parser.c +2180 -0
- data/ext/q_phrase.c +657 -0
- data/ext/q_prefix.c +75 -0
- data/ext/q_range.c +247 -0
- data/ext/q_span.c +1566 -0
- data/ext/q_term.c +308 -0
- data/ext/q_wildcard.c +146 -0
- data/ext/r_analysis.c +255 -0
- data/ext/r_doc.c +578 -0
- data/ext/r_index_io.c +996 -0
- data/ext/r_qparser.c +158 -0
- data/ext/r_search.c +2321 -0
- data/ext/r_store.c +263 -0
- data/ext/r_term.c +219 -0
- data/ext/ram_store.c +447 -0
- data/ext/search.c +524 -0
- data/ext/search.h +1065 -0
- data/ext/similarity.c +143 -39
- data/ext/sort.c +661 -0
- data/ext/store.c +35 -0
- data/ext/store.h +152 -0
- data/ext/term.c +704 -143
- data/ext/termdocs.c +599 -0
- data/ext/vector.c +594 -0
- data/lib/ferret.rb +9 -10
- data/lib/ferret/analysis/analyzers.rb +2 -2
- data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
- data/lib/ferret/analysis/token.rb +14 -14
- data/lib/ferret/analysis/token_filters.rb +3 -3
- data/lib/ferret/document/field.rb +16 -17
- data/lib/ferret/index/document_writer.rb +4 -4
- data/lib/ferret/index/index.rb +39 -23
- data/lib/ferret/index/index_writer.rb +2 -2
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
- data/lib/ferret/index/segment_term_vector.rb +4 -4
- data/lib/ferret/index/term.rb +5 -1
- data/lib/ferret/index/term_vector_offset_info.rb +6 -6
- data/lib/ferret/index/term_vectors_io.rb +5 -5
- data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
- data/lib/ferret/search.rb +1 -1
- data/lib/ferret/search/boolean_query.rb +2 -1
- data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
- data/lib/ferret/search/fuzzy_query.rb +2 -1
- data/lib/ferret/search/index_searcher.rb +3 -0
- data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
- data/lib/ferret/search/multi_phrase_query.rb +6 -5
- data/lib/ferret/search/phrase_query.rb +3 -6
- data/lib/ferret/search/prefix_query.rb +4 -4
- data/lib/ferret/search/sort.rb +3 -1
- data/lib/ferret/search/sort_field.rb +9 -9
- data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
- data/lib/ferret/search/spans/span_near_query.rb +1 -1
- data/lib/ferret/search/spans/span_weight.rb +1 -1
- data/lib/ferret/search/spans/spans_enum.rb +7 -7
- data/lib/ferret/store/fs_store.rb +10 -6
- data/lib/ferret/store/ram_store.rb +3 -3
- data/lib/rferret.rb +36 -0
- data/test/functional/thread_safety_index_test.rb +2 -2
- data/test/test_helper.rb +16 -2
- data/test/unit/analysis/c_token.rb +25 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
- data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
- data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
- data/test/unit/document/c_field.rb +98 -0
- data/test/unit/document/tc_field.rb +0 -66
- data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
- data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
- data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
- data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
- data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
- data/test/unit/index/tc_segment_term_vector.rb +2 -2
- data/test/unit/index/tc_term_vectors_io.rb +4 -4
- data/test/unit/query_parser/c_query_parser.rb +138 -0
- data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
- data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
- data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
- data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
- data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
- data/test/unit/search/c_sort_field.rb +27 -0
- data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +7 -20
- data/test/unit/store/c_fs_store.rb +76 -0
- data/test/unit/store/c_ram_store.rb +35 -0
- data/test/unit/store/m_store.rb +34 -0
- data/test/unit/store/m_store_lock.rb +68 -0
- data/test/unit/store/tc_fs_store.rb +0 -53
- data/test/unit/store/tc_ram_store.rb +0 -20
- data/test/unit/store/tm_store.rb +0 -30
- data/test/unit/store/tm_store_lock.rb +0 -66
- metadata +84 -31
- data/ext/Makefile +0 -140
- data/ext/ferret_ext.so +0 -0
- data/ext/priority_queue.c +0 -232
- data/ext/ram_directory.c +0 -321
- data/ext/segment_merge_queue.c +0 -37
- data/ext/segment_term_enum.c +0 -326
- data/ext/string_helper.c +0 -42
- data/ext/tags +0 -344
- data/ext/term_buffer.c +0 -230
- data/ext/term_infos_reader.c +0 -54
- data/ext/terminfo.c +0 -160
- data/ext/token.c +0 -93
- data/ext/util.c +0 -12
data/ext/document.c
ADDED
@@ -0,0 +1,336 @@
|
|
1
|
+
#include <document.h>
|
2
|
+
#include <string.h>
|
3
|
+
|
4
|
+
/****************************************************************************
|
5
|
+
*
|
6
|
+
* DocField
|
7
|
+
*
|
8
|
+
****************************************************************************/
|
9
|
+
|
10
|
+
/**
 * Initialise an existing DocField in place.
 *
 * The field name is duplicated with estrdup(); the data pointer is stored
 * as-is (not copied) and blen is set to strlen(data), so data must be a
 * NUL-terminated string here. The store/index/term-vector flags are decoded
 * by df_set_store(), df_set_index() and df_set_term_vector(). The field is
 * marked non-binary and its boost is set to 1.0.
 *
 * An ARG_ERROR is reported through eprintf() when the field would be neither
 * indexed nor stored, or when term vectors are requested for a field that is
 * not indexed.
 */
inline void df_set(DocField *df, const char *name,
                   char *data, int store, int index, int tv)
{
    const bool not_indexed = (index == DF_INDEX_NO);

    if (not_indexed && (store == DF_STORE_NO)) {
        eprintf(ARG_ERROR,
                "it doesn't make sense to have a field that is neither indexed nor stored");
    }
    if (not_indexed && (tv != DF_TERM_VECTOR_NO)) {
        eprintf(ARG_ERROR,
                "cannot store term vector information for a field that is not indexed");
    }

    /* name is owned by the field; data is only borrowed at this point */
    df->name = estrdup(name);
    df->data = data;
    df->blen = strlen(data);

    df_set_store(df, store);
    df_set_index(df, index);
    df_set_term_vector(df, tv);

    df->is_binary = false;
    df->boost = 1.0;
}
|
28
|
+
|
29
|
+
DocField *df_create(const char *name, char *data, int store, int index, int tv)
|
30
|
+
{
|
31
|
+
DocField *df = ALLOC(DocField);
|
32
|
+
df_set(df, name, data, store, index, tv);
|
33
|
+
return df;
|
34
|
+
}
|
35
|
+
|
36
|
+
/**
 * Create a deep copy of a DocField.
 *
 * All scalar members and flags are copied with memcpy(); the name and the
 * data are then duplicated so the clone does not share storage with the
 * original.
 *
 * Binary fields carry their length in blen and may contain embedded NUL
 * bytes (or no terminating NUL at all), so their data is copied by length
 * rather than with estrdup(). One extra byte is allocated and set to '\0'
 * so the copy is always terminated.
 *
 * Returns the newly allocated clone.
 */
DocField *df_clone(DocField *self)
{
    DocField *clone = ALLOC(DocField);

    memcpy(clone, self, sizeof(DocField));
    clone->name = estrdup(self->name);

    if (self->is_binary) {
        /* a negative length is treated as empty */
        const size_t nbytes = (self->blen > 0) ? (size_t)self->blen : 0;

        clone->data = ALLOC_N(char, nbytes + 1);
        if (nbytes > 0) {
            memcpy(clone->data, self->data, nbytes);
        }
        clone->data[nbytes] = '\0';
    } else {
        clone->data = estrdup(self->data);
    }
    return clone;
}
|
44
|
+
|
45
|
+
void df_destroy(void *p)
|
46
|
+
{
|
47
|
+
DocField *df = (DocField *)p;
|
48
|
+
free(df->name);
|
49
|
+
free(p);
|
50
|
+
}
|
51
|
+
|
52
|
+
void df_destroy_data(void *p)
|
53
|
+
{
|
54
|
+
DocField *df = (DocField *)p;
|
55
|
+
free(df->data);
|
56
|
+
free(df->name);
|
57
|
+
free(p);
|
58
|
+
}
|
59
|
+
|
60
|
+
/**
 * Decode a DF_STORE_* value into the is_stored / is_compressed flags.
 *
 *   DF_STORE_YES      -> stored, not compressed
 *   DF_STORE_NO       -> not stored, not compressed
 *   DF_STORE_COMPRESS -> stored and compressed
 *
 * Any other value is reported as an ARG_ERROR through eprintf() and the
 * flags are left untouched.
 */
void df_set_store(DocField *df, int store)
{
    const bool known = (store == DF_STORE_YES)
                    || (store == DF_STORE_NO)
                    || (store == DF_STORE_COMPRESS);

    if (!known) {
        eprintf(ARG_ERROR, "Invalid value %d for store in document field", store);
        return;
    }

    df->is_stored = (store != DF_STORE_NO);
    df->is_compressed = (store == DF_STORE_COMPRESS);
}
|
79
|
+
|
80
|
+
/**
 * Decode a DF_INDEX_* value into the is_indexed / is_tokenized /
 * omit_norms flags.
 *
 *   DF_INDEX_NO          -> not indexed, not tokenized
 *   DF_INDEX_TOKENIZED   -> indexed and tokenized
 *   DF_INDEX_UNTOKENIZED -> indexed, not tokenized
 *   DF_INDEX_NO_NORMS    -> indexed, not tokenized, norms omitted
 *
 * omit_norms is always reset to false first, even when the value turns out
 * to be invalid. An invalid value is reported as an ARG_ERROR through
 * eprintf() and the remaining flags are left untouched.
 */
void df_set_index(DocField *df, int index)
{
    const bool known = (index == DF_INDEX_NO)
                    || (index == DF_INDEX_TOKENIZED)
                    || (index == DF_INDEX_UNTOKENIZED)
                    || (index == DF_INDEX_NO_NORMS);

    df->omit_norms = false;

    if (!known) {
        eprintf(ARG_ERROR, "Invalid value %d for index in document field", index);
        return;
    }

    df->is_indexed = (index != DF_INDEX_NO);
    df->is_tokenized = (index == DF_INDEX_TOKENIZED);
    df->omit_norms = (index == DF_INDEX_NO_NORMS);
}
|
105
|
+
|
106
|
+
/**
 * Decode a DF_TERM_VECTOR_* value into the store_tv / store_offset /
 * store_pos flags.
 *
 *   DF_TERM_VECTOR_NO                     -> nothing stored
 *   DF_TERM_VECTOR_YES                    -> term vector only
 *   DF_TERM_VECTOR_WITH_OFFSETS           -> term vector + offsets
 *   DF_TERM_VECTOR_WITH_POSITIONS         -> term vector + positions
 *   DF_TERM_VECTOR_WITH_POSITIONS_OFFSETS -> term vector + both
 *
 * Any other value is reported as an ARG_ERROR through eprintf() and the
 * flags are left untouched.
 */
void df_set_term_vector(DocField *df, int tv)
{
    /* validate first so that an invalid value modifies nothing */
    switch (tv) {
        case DF_TERM_VECTOR_NO:
        case DF_TERM_VECTOR_YES:
        case DF_TERM_VECTOR_WITH_OFFSETS:
        case DF_TERM_VECTOR_WITH_POSITIONS:
        case DF_TERM_VECTOR_WITH_POSITIONS_OFFSETS:
            break;
        default:
            eprintf(ARG_ERROR,
                    "Invalid value %d for term_vector in document field", tv);
            return;
    }

    df->store_tv = (tv != DF_TERM_VECTOR_NO);
    df->store_offset = (tv == DF_TERM_VECTOR_WITH_OFFSETS)
                    || (tv == DF_TERM_VECTOR_WITH_POSITIONS_OFFSETS);
    df->store_pos = (tv == DF_TERM_VECTOR_WITH_POSITIONS)
                 || (tv == DF_TERM_VECTOR_WITH_POSITIONS_OFFSETS);
}
|
139
|
+
|
140
|
+
DocField *df_create_binary(char *name, char *data, int blen, int store)
|
141
|
+
{
|
142
|
+
if (store == DF_STORE_NO) {
|
143
|
+
eprintf(ARG_ERROR, "It doesn't make sense not to store binary data\n");
|
144
|
+
}
|
145
|
+
DocField *df = df_create(name, data, store, DF_INDEX_NO, DF_TERM_VECTOR_NO);
|
146
|
+
df->is_binary = true;
|
147
|
+
df->blen = blen;
|
148
|
+
return df;
|
149
|
+
}
|
150
|
+
|
151
|
+
/**
 * Build a human readable description of a DocField.
 *
 * The result has the form
 *   [stored/(un)compressed,][flag,...]<name:data>
 * where binary fields print the placeholder "=bin_data=" instead of their
 * data.
 *
 * The buffer is sized from the text that is actually printed. For binary
 * fields this avoids calling strlen() on the raw data, which need not be
 * NUL-terminated. The fixed parts of the output need at most 108 bytes
 * including the terminating NUL; 129 is kept as the original safety margin.
 *
 * Returns a string allocated with ALLOC_N(); the caller must free() it.
 */
char *df_to_s(DocField *self)
{
    const char *payload = self->is_binary ? "=bin_data=" : self->data;
    char *str = ALLOC_N(char, strlen(self->name) + strlen(payload) + 129);
    char *str_ptr = str;

    if (self->is_stored) {
        /* sprintf returns the number of characters written (without NUL) */
        str_ptr += sprintf(str_ptr, "stored/%s,",
                           self->is_compressed ? "compressed" : "uncompressed");
    }
    sprintf(str_ptr, "%s%s%s%s%s%s%s<%s:%s>",
            self->is_indexed ? "indexed," : "",
            self->is_tokenized ? "tokenized," : "",
            self->store_tv ? "store_term_vector," : "",
            self->store_offset ? "store_offsets," : "",
            self->store_pos ? "store_positions," : "",
            self->omit_norms ? "omit_norms," : "",
            self->is_binary ? "binary," : "",
            self->name,
            payload);

    return str;
}
|
174
|
+
|
175
|
+
/****************************************************************************
|
176
|
+
*
|
177
|
+
* Document
|
178
|
+
*
|
179
|
+
****************************************************************************/
|
180
|
+
|
181
|
+
Document *doc_create()
|
182
|
+
{
|
183
|
+
Document *doc = ALLOC(Document);
|
184
|
+
doc->fields = h_new_str(&free, &ary_destroy);
|
185
|
+
doc->fcnt = 0;
|
186
|
+
doc->dfcnt = 0;
|
187
|
+
doc->field_arr = NULL;
|
188
|
+
doc->df_arr = NULL;
|
189
|
+
doc->boost = 1.0;
|
190
|
+
doc->free_data = &df_destroy_data;
|
191
|
+
return doc;
|
192
|
+
}
|
193
|
+
|
194
|
+
Document *doc_create_keep_data()
|
195
|
+
{
|
196
|
+
Document *doc = doc_create();
|
197
|
+
doc->free_data = df_destroy;
|
198
|
+
return doc;
|
199
|
+
}
|
200
|
+
|
201
|
+
void doc_destroy(void *p)
|
202
|
+
{
|
203
|
+
Document *doc = (Document *)p;
|
204
|
+
free(doc->field_arr);
|
205
|
+
free(doc->df_arr);
|
206
|
+
h_destroy(doc->fields);
|
207
|
+
free(doc);
|
208
|
+
}
|
209
|
+
|
210
|
+
/**
 * Append a field to a document.
 *
 * Fields are grouped by name: the hash table maps each name to an Array of
 * the fields sharing it. When the name is new, an Array is created (its
 * elements are released with doc->free_data), registered under a copy of
 * the name and appended to field_arr. In every case the field is appended
 * to its name's Array and to the flat df_arr list.
 */
void doc_add_field(Document *doc, DocField *df)
{
    Array *same_name = (Array *)h_get(doc->fields, df->name);

    if (!same_name) {
        /* first field with this name: create and register its group */
        same_name = ary_create(1, doc->free_data);
        h_set(doc->fields, estrdup(df->name), same_name);

        doc->fcnt++;
        REALLOC_N(doc->field_arr, Array *, doc->fcnt);
        doc->field_arr[doc->fcnt - 1] = same_name;
    }

    ary_append(same_name, df);

    doc->dfcnt++;
    REALLOC_N(doc->df_arr, DocField *, doc->dfcnt);
    doc->df_arr[doc->dfcnt - 1] = df;
}
|
225
|
+
|
226
|
+
/**
 * Look up the first field stored under the given name.
 *
 * Returns element 0 of the name's Array, or NULL when the document has no
 * entry for that name.
 */
DocField *doc_get_field(Document *doc, const char *fname)
{
    Array *matches = (Array *)h_get(doc->fields, fname);

    if (!matches) {
        return NULL;
    }
    return matches->elems[0];
}
|
235
|
+
|
236
|
+
/**
 * Look up all fields stored under the given name.
 *
 * Returns whatever h_get() yields for the name in the document's fields
 * table, cast to an Array pointer.
 */
Array *doc_get_fields(Document *doc, const char *fname)
{
    Array *matches = (Array *)h_get(doc->fields, fname);

    return matches;
}
|
240
|
+
|
241
|
+
/**
|
242
|
+
* TODO:
|
243
|
+
* This is not exactly elegant or efficient but it works and is not going to
|
244
|
+
* be a performance problem. Still, it would be nice to make the code a little
|
245
|
+
* clearer.
|
246
|
+
*/
|
247
|
+
/**
 * Detach every field stored under the given name from the document.
 *
 * The name's Array is removed from the hash table with h_rem(), taken out
 * of field_arr, and each of its elements is taken out of df_arr. The Array
 * itself is not destroyed; it is returned to the caller. Returns NULL when
 * the document has no entry for the name.
 */
Array *doc_remove_fields(Document *doc, const char *fname)
{
    Array *removed = (Array *)h_rem(doc->fields, fname, true);
    int slot, taken;

    if (removed == NULL) {
        return NULL;
    }

    /*
     * Close the gap in field_arr. fcnt is decremented first, so the loop
     * never visits the old last slot: if the Array sat there, nothing
     * needs to be moved.
     */
    doc->fcnt--;
    for (slot = 0; slot < doc->fcnt; slot++) {
        if (doc->field_arr[slot] != removed) {
            continue;
        }
        memmove(&doc->field_arr[slot],
                &doc->field_arr[slot + 1],
                sizeof(void *) * (doc->fcnt - slot));
        break;
    }

    /*
     * Walk df_arr and the removed Array in step, closing the gap in df_arr
     * for every match. After a match the same slot is inspected again
     * because the following element has been moved into it.
     */
    slot = 0;
    taken = 0;
    while (slot < doc->dfcnt && taken < removed->size) {
        if (removed->elems[taken] != doc->df_arr[slot]) {
            slot++;
            continue;
        }
        memmove(&doc->df_arr[slot],
                &doc->df_arr[slot + 1],
                sizeof(void *) * (doc->dfcnt - slot - 1));
        taken++;
        doc->dfcnt--;
    }

    return removed;
}
|
277
|
+
|
278
|
+
/**
 * Detach the first field stored under the given name from the document.
 *
 * The field is removed from its name's Array and from the flat df_arr
 * list. If that leaves no field under the name, the now empty Array is
 * also removed from the document and destroyed.
 *
 * The field is always removed from df_arr here, before the empty-Array
 * case is handled. doc_remove_fields() only drops the elements still
 * contained in the Array, so relying on it for the last field would leave
 * a stale pointer in df_arr and dfcnt one too high.
 *
 * Returns the detached field (ownership passes to the caller), or NULL
 * when the document has no entry for the name.
 */
DocField *doc_remove_field(Document *doc, const char *fname)
{
    DocField *df = NULL;
    Array *dfs = (Array *)h_get(doc->fields, fname);

    if (dfs) {
        int i;

        df = ary_remove(dfs, 0);

        /* drop the field from the flat list, closing the gap */
        for (i = 0; i < doc->dfcnt; i++) {
            if (df == doc->df_arr[i]) {
                memmove(&doc->df_arr[i],
                        &doc->df_arr[i + 1],
                        sizeof(void *) * (doc->dfcnt - i - 1));
                doc->dfcnt--;
                break;
            }
        }

        /* no field left under this name: retire the (empty) Array */
        if (dfs->size == 0) {
            Array *fields = doc_remove_fields(doc, fname);
            ary_destroy(fields);
        }
    }
    return df;
}
|
302
|
+
|
303
|
+
/**
 * Remove and destroy every field stored under the given name.
 *
 * The fields are detached with doc_remove_fields() and the returned Array
 * is destroyed with ary_destroy().
 *
 * Returns true when the name was present, false otherwise.
 */
bool doc_delete_fields(Document *doc, const char *fname)
{
    Array *field_ga = doc_remove_fields(doc, fname);

    if (field_ga == NULL) {
        return false;
    }
    ary_destroy(field_ga);
    return true;
}
|
314
|
+
|
315
|
+
/**
 * Build a human readable description of a Document.
 *
 * Every field in df_arr is rendered with df_to_s() and the results are
 * joined, one per line, between "Document {" and "}".
 *
 * Returns a string allocated with ALLOC_N(); the caller must free() it.
 */
char *doc_to_s(Document *doc)
{
    int idx, total = 20;
    char *out, *cursor;
    char **parts = ALLOC_N(char *, doc->dfcnt);

    /* render the fields first so the final buffer can be sized exactly */
    for (idx = 0; idx < doc->dfcnt; idx++) {
        parts[idx] = df_to_s(doc->df_arr[idx]);
        total += strlen(parts[idx]) + 3;
    }

    out = ALLOC_N(char, total);
    cursor = out;

    /* sprintf returns the number of characters written (without NUL) */
    cursor += sprintf(cursor, "Document {\n");
    for (idx = 0; idx < doc->dfcnt; idx++) {
        cursor += sprintf(cursor, " %s\n", parts[idx]);
        free(parts[idx]);
    }
    sprintf(cursor, "}");
    free(parts);

    return out;
}
|
data/ext/document.h
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
#ifndef FRT_DOCUMENT_H
|
2
|
+
#define FRT_DOCUMENT_H
|
3
|
+
|
4
|
+
#include <global.h>
|
5
|
+
#include <hash.h>
|
6
|
+
#include <array.h>
|
7
|
+
|
8
|
+
/****************************************************************************
|
9
|
+
*
|
10
|
+
* DocField
|
11
|
+
*
|
12
|
+
****************************************************************************/
|
13
|
+
enum {
|
14
|
+
DF_STORE_YES = 0,
|
15
|
+
DF_STORE_NO = 1,
|
16
|
+
DF_STORE_COMPRESS = 2
|
17
|
+
};
|
18
|
+
|
19
|
+
enum {
|
20
|
+
DF_INDEX_UNTOKENIZED = 0,
|
21
|
+
DF_INDEX_TOKENIZED = 1,
|
22
|
+
DF_INDEX_NO = 2,
|
23
|
+
DF_INDEX_NO_NORMS = 3
|
24
|
+
};
|
25
|
+
|
26
|
+
enum {
|
27
|
+
DF_TERM_VECTOR_NO = 0,
|
28
|
+
DF_TERM_VECTOR_YES = 1,
|
29
|
+
DF_TERM_VECTOR_WITH_POSITIONS = 2,
|
30
|
+
DF_TERM_VECTOR_WITH_OFFSETS = 3,
|
31
|
+
DF_TERM_VECTOR_WITH_POSITIONS_OFFSETS = 4
|
32
|
+
};
|
33
|
+
|
34
|
+
typedef struct DocField {
|
35
|
+
char *name;
|
36
|
+
char *data;
|
37
|
+
int blen; // This is used for binary fields only to store the data length
|
38
|
+
float boost;
|
39
|
+
bool is_stored : 1;
|
40
|
+
bool is_compressed : 1;
|
41
|
+
bool is_indexed : 1;
|
42
|
+
bool is_tokenized : 1;
|
43
|
+
bool store_tv : 1;
|
44
|
+
bool store_pos : 1;
|
45
|
+
bool store_offset : 1;
|
46
|
+
bool omit_norms : 1;
|
47
|
+
bool is_binary : 1;
|
48
|
+
} DocField;
|
49
|
+
|
50
|
+
DocField *df_create(const char *name, char *data, int store, int index, int tv);
|
51
|
+
DocField *df_clone(DocField *self);
|
52
|
+
void df_set(DocField *df, const char *name, char *data, int store, int index, int tv);
|
53
|
+
void df_destroy(void *p);
|
54
|
+
void df_destroy_data(void *p);
|
55
|
+
void df_set_store(DocField *df, int store);
|
56
|
+
void df_set_index(DocField *df, int index);
|
57
|
+
void df_set_term_vector(DocField *df, int tv);
|
58
|
+
char *df_to_s(DocField *df);
|
59
|
+
DocField *df_create_binary(char *name, char *data, int blen, int store);
|
60
|
+
|
61
|
+
/****************************************************************************
|
62
|
+
*
|
63
|
+
* Document
|
64
|
+
*
|
65
|
+
****************************************************************************/
|
66
|
+
|
67
|
+
typedef struct Document {
|
68
|
+
HshTable *fields;
|
69
|
+
Array **field_arr;
|
70
|
+
int fcnt;
|
71
|
+
DocField **df_arr;
|
72
|
+
int dfcnt;
|
73
|
+
float boost;
|
74
|
+
void (*free_data)(void *p);
|
75
|
+
} Document;
|
76
|
+
|
77
|
+
Document *doc_create();
|
78
|
+
Document *doc_create_keep_data();
|
79
|
+
void doc_destroy(void *p);
|
80
|
+
void doc_add_field(Document *doc, DocField *df);
|
81
|
+
DocField *doc_get_field(Document *doc, const char *fname);
|
82
|
+
Array *doc_get_fields(Document *doc, const char *fname);
|
83
|
+
Array *doc_remove_fields(Document *doc, const char *fname);
|
84
|
+
DocField *doc_remove_field(Document *doc, const char *fname);
|
85
|
+
bool doc_delete_fields(Document *doc, const char *fname);
|
86
|
+
char *doc_to_s(Document *doc);
|
87
|
+
#endif
|
data/ext/ferret.c
CHANGED
@@ -1,73 +1,114 @@
|
|
1
1
|
#include "ferret.h"
|
2
|
+
#include "hash.h"
|
3
|
+
|
4
|
+
/* Object Map */
|
5
|
+
static HshTable *object_map;
|
2
6
|
|
3
7
|
/* IDs */
|
4
8
|
ID id_new;
|
5
|
-
ID id_close;
|
6
|
-
ID id_size;
|
7
|
-
ID id_iv_size;
|
8
9
|
|
9
10
|
/* Modules */
|
10
11
|
VALUE mFerret;
|
11
|
-
VALUE mStore;
|
12
|
-
VALUE mIndex;
|
13
|
-
VALUE mUtils;
|
14
12
|
VALUE mAnalysis;
|
13
|
+
VALUE mDocument;
|
14
|
+
VALUE mIndex;
|
15
15
|
VALUE mSearch;
|
16
|
+
VALUE mStore;
|
16
17
|
VALUE mStringHelper;
|
18
|
+
VALUE mUtils;
|
19
|
+
VALUE mSpans;
|
17
20
|
|
18
21
|
/* Classes */
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
VALUE
|
36
|
-
|
37
|
-
|
38
|
-
VALUE
|
22
|
+
/*
|
23
|
+
*/
|
24
|
+
|
25
|
+
|
26
|
+
/*
 * Hash function for the object map: the key's pointer value, converted to
 * unsigned int, is the hash.
 *
 * NOTE(review): where pointers are wider than unsigned int this cast drops
 * the upper bits and compilers warn about it; consider casting through
 * uintptr_t.
 */
unsigned int
object_hash(const void *key)
{
    const unsigned int address_bits = (unsigned int)key;

    return address_bits;
}
|
31
|
+
|
32
|
+
/*
 * Equality function for the object map: two keys are equal only when they
 * are the same pointer. Returns 1 when equal, 0 otherwise.
 */
int
object_eq(const void *key1, const void *key2)
{
    if (key1 == key2) {
        return 1;
    }
    return 0;
}
|
37
|
+
|
38
|
+
VALUE
|
39
|
+
object_get(void *key)
|
40
|
+
{
|
41
|
+
VALUE val = (VALUE)h_get(object_map, key);
|
42
|
+
if (!val) val = Qnil;
|
43
|
+
return val;
|
44
|
+
}
|
45
|
+
|
46
|
+
//static int hash_cnt = 0;
|
47
|
+
void
|
48
|
+
//object_add(void *key, VALUE obj)
|
49
|
+
object_add2(void *key, VALUE obj, const char *file, int line, const char *func)
|
50
|
+
{
|
51
|
+
if (h_get(object_map, key))
|
52
|
+
printf("failed adding %d. %s:%d:%s\n", (int)key, file, line, func);
|
53
|
+
//printf("adding %d. now contains %d %s:%d:%s\n", (int)key, ++hash_cnt, file, line, func);
|
54
|
+
h_set(object_map, key, (void *)obj);
|
55
|
+
}
|
56
|
+
|
57
|
+
void
|
58
|
+
//object_del(void *key)
|
59
|
+
object_del2(void *key, const char *file, int line, const char *func)
|
60
|
+
{
|
61
|
+
if (object_get(key) == Qnil)
|
62
|
+
printf("failed deleting %d. %s:%d:%s\n", (int)key, file, line, func);
|
63
|
+
//printf("deleting %d. now contains %d, %s:%d:%s\n", (int)key, --hash_cnt, file, line, func);
|
64
|
+
h_del(object_map, key);
|
65
|
+
}
|
66
|
+
|
67
|
+
void
|
68
|
+
frt_gc_mark(void *key)
|
69
|
+
{
|
70
|
+
VALUE val = (VALUE)h_get(object_map, key);
|
71
|
+
if (val)
|
72
|
+
rb_gc_mark(val);
|
73
|
+
}
|
74
|
+
|
75
|
+
VALUE
|
76
|
+
frt_data_alloc(VALUE klass)
|
77
|
+
{
|
78
|
+
return Frt_Make_Struct(klass);
|
79
|
+
}
|
80
|
+
|
81
|
+
/*
 * Free callback that only unregisters the pointer from the object map via
 * object_del(); the memory p points to is not released here.
 */
void
frt_deref_free(void *p)
{
    void *key = p;

    object_del(key);
}
|
39
86
|
|
40
87
|
void
|
41
88
|
Init_ferret_ext(void)
|
42
89
|
{
|
90
|
+
/* initialize object map */
|
91
|
+
object_map = h_new(&object_hash, &object_eq, NULL, NULL);
|
92
|
+
|
43
93
|
/* IDs */
|
44
|
-
|
45
|
-
id_close = rb_intern("close");
|
46
|
-
id_size = rb_intern("size");
|
47
|
-
id_iv_size = rb_intern("@size");
|
94
|
+
id_new = rb_intern("new");
|
48
95
|
|
49
96
|
/* Modules */
|
50
97
|
mFerret = rb_define_module("Ferret");
|
51
|
-
mStore = rb_define_module_under(mFerret, "Store");
|
52
|
-
mIndex = rb_define_module_under(mFerret, "Index");
|
53
|
-
mUtils = rb_define_module_under(mFerret, "Utils");
|
54
98
|
mAnalysis = rb_define_module_under(mFerret, "Analysis");
|
99
|
+
mDocument = rb_define_module_under(mFerret, "Document");
|
100
|
+
mIndex = rb_define_module_under(mFerret, "Index");
|
55
101
|
mSearch = rb_define_module_under(mFerret, "Search");
|
56
|
-
|
57
|
-
|
58
|
-
|
102
|
+
mStore = rb_define_module_under(mFerret, "Store");
|
103
|
+
mUtils = rb_define_module_under(mFerret, "Utils");
|
104
|
+
mSpans = rb_define_module_under(mSearch, "Spans");
|
59
105
|
|
60
106
|
/* Inits */
|
61
|
-
Init_indexio();
|
62
107
|
Init_term();
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
Init_segment_term_enum();
|
70
|
-
Init_ram_directory();
|
71
|
-
Init_string_helper();
|
72
|
-
Init_similarity();
|
108
|
+
Init_analysis();
|
109
|
+
Init_doc();
|
110
|
+
Init_dir();
|
111
|
+
Init_index_io();
|
112
|
+
Init_search();
|
113
|
+
Init_qparser();
|
73
114
|
}
|