ferret 0.3.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +9 -0
- data/Rakefile +51 -25
- data/ext/analysis.c +553 -0
- data/ext/analysis.h +76 -0
- data/ext/array.c +83 -0
- data/ext/array.h +19 -0
- data/ext/bitvector.c +164 -0
- data/ext/bitvector.h +29 -0
- data/ext/compound_io.c +335 -0
- data/ext/document.c +336 -0
- data/ext/document.h +87 -0
- data/ext/ferret.c +88 -47
- data/ext/ferret.h +43 -109
- data/ext/field.c +395 -0
- data/ext/filter.c +103 -0
- data/ext/fs_store.c +352 -0
- data/ext/global.c +219 -0
- data/ext/global.h +73 -0
- data/ext/hash.c +446 -0
- data/ext/hash.h +80 -0
- data/ext/hashset.c +141 -0
- data/ext/hashset.h +37 -0
- data/ext/helper.c +11 -0
- data/ext/helper.h +5 -0
- data/ext/inc/lang.h +41 -0
- data/ext/ind.c +389 -0
- data/ext/index.h +884 -0
- data/ext/index_io.c +269 -415
- data/ext/index_rw.c +2543 -0
- data/ext/lang.c +31 -0
- data/ext/lang.h +41 -0
- data/ext/priorityqueue.c +228 -0
- data/ext/priorityqueue.h +44 -0
- data/ext/q_boolean.c +1331 -0
- data/ext/q_const_score.c +154 -0
- data/ext/q_fuzzy.c +287 -0
- data/ext/q_match_all.c +142 -0
- data/ext/q_multi_phrase.c +343 -0
- data/ext/q_parser.c +2180 -0
- data/ext/q_phrase.c +657 -0
- data/ext/q_prefix.c +75 -0
- data/ext/q_range.c +247 -0
- data/ext/q_span.c +1566 -0
- data/ext/q_term.c +308 -0
- data/ext/q_wildcard.c +146 -0
- data/ext/r_analysis.c +255 -0
- data/ext/r_doc.c +578 -0
- data/ext/r_index_io.c +996 -0
- data/ext/r_qparser.c +158 -0
- data/ext/r_search.c +2321 -0
- data/ext/r_store.c +263 -0
- data/ext/r_term.c +219 -0
- data/ext/ram_store.c +447 -0
- data/ext/search.c +524 -0
- data/ext/search.h +1065 -0
- data/ext/similarity.c +143 -39
- data/ext/sort.c +661 -0
- data/ext/store.c +35 -0
- data/ext/store.h +152 -0
- data/ext/term.c +704 -143
- data/ext/termdocs.c +599 -0
- data/ext/vector.c +594 -0
- data/lib/ferret.rb +9 -10
- data/lib/ferret/analysis/analyzers.rb +2 -2
- data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
- data/lib/ferret/analysis/token.rb +14 -14
- data/lib/ferret/analysis/token_filters.rb +3 -3
- data/lib/ferret/document/field.rb +16 -17
- data/lib/ferret/index/document_writer.rb +4 -4
- data/lib/ferret/index/index.rb +39 -23
- data/lib/ferret/index/index_writer.rb +2 -2
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
- data/lib/ferret/index/segment_term_vector.rb +4 -4
- data/lib/ferret/index/term.rb +5 -1
- data/lib/ferret/index/term_vector_offset_info.rb +6 -6
- data/lib/ferret/index/term_vectors_io.rb +5 -5
- data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
- data/lib/ferret/search.rb +1 -1
- data/lib/ferret/search/boolean_query.rb +2 -1
- data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
- data/lib/ferret/search/fuzzy_query.rb +2 -1
- data/lib/ferret/search/index_searcher.rb +3 -0
- data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
- data/lib/ferret/search/multi_phrase_query.rb +6 -5
- data/lib/ferret/search/phrase_query.rb +3 -6
- data/lib/ferret/search/prefix_query.rb +4 -4
- data/lib/ferret/search/sort.rb +3 -1
- data/lib/ferret/search/sort_field.rb +9 -9
- data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
- data/lib/ferret/search/spans/span_near_query.rb +1 -1
- data/lib/ferret/search/spans/span_weight.rb +1 -1
- data/lib/ferret/search/spans/spans_enum.rb +7 -7
- data/lib/ferret/store/fs_store.rb +10 -6
- data/lib/ferret/store/ram_store.rb +3 -3
- data/lib/rferret.rb +36 -0
- data/test/functional/thread_safety_index_test.rb +2 -2
- data/test/test_helper.rb +16 -2
- data/test/unit/analysis/c_token.rb +25 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
- data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
- data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
- data/test/unit/document/c_field.rb +98 -0
- data/test/unit/document/tc_field.rb +0 -66
- data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
- data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
- data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
- data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
- data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
- data/test/unit/index/tc_segment_term_vector.rb +2 -2
- data/test/unit/index/tc_term_vectors_io.rb +4 -4
- data/test/unit/query_parser/c_query_parser.rb +138 -0
- data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
- data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
- data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
- data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
- data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
- data/test/unit/search/c_sort_field.rb +27 -0
- data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +7 -20
- data/test/unit/store/c_fs_store.rb +76 -0
- data/test/unit/store/c_ram_store.rb +35 -0
- data/test/unit/store/m_store.rb +34 -0
- data/test/unit/store/m_store_lock.rb +68 -0
- data/test/unit/store/tc_fs_store.rb +0 -53
- data/test/unit/store/tc_ram_store.rb +0 -20
- data/test/unit/store/tm_store.rb +0 -30
- data/test/unit/store/tm_store_lock.rb +0 -66
- metadata +84 -31
- data/ext/Makefile +0 -140
- data/ext/ferret_ext.so +0 -0
- data/ext/priority_queue.c +0 -232
- data/ext/ram_directory.c +0 -321
- data/ext/segment_merge_queue.c +0 -37
- data/ext/segment_term_enum.c +0 -326
- data/ext/string_helper.c +0 -42
- data/ext/tags +0 -344
- data/ext/term_buffer.c +0 -230
- data/ext/term_infos_reader.c +0 -54
- data/ext/terminfo.c +0 -160
- data/ext/token.c +0 -93
- data/ext/util.c +0 -12
data/ext/vector.c
ADDED
@@ -0,0 +1,594 @@
|
|
1
|
+
#include <index.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <helper.h>
|
4
|
+
|
5
|
+
#define TERM_ARR_START_SIZE 16
|
6
|
+
#define FIELD_ARR_START_SIZE 8
|
7
|
+
|
8
|
+
TVOffsetInfo *tvoi_create(int start, int end)
|
9
|
+
{
|
10
|
+
TVOffsetInfo *tvoi = ALLOC(TVOffsetInfo);
|
11
|
+
tvoi->start = start;
|
12
|
+
tvoi->end = end;
|
13
|
+
return tvoi;
|
14
|
+
}
|
15
|
+
|
16
|
+
/*
 * Release a TVOffsetInfo previously returned by tvoi_create().
 *
 * Only the struct itself is freed. The parameter is an untyped pointer;
 * presumably so the function can be handed around as a generic destructor
 * callback (verify against callers).
 */
void tvoi_destroy(void *p)
{
  free(p);
}
|
20
|
+
|
21
|
+
TVField *tvf_create(int number, int store_positions, int store_offsets)
|
22
|
+
{
|
23
|
+
TVField *tvf = ALLOC(TVField);
|
24
|
+
tvf->tvf_pointer = 0;
|
25
|
+
tvf->number = number;
|
26
|
+
tvf->store_positions = store_positions;
|
27
|
+
tvf->store_offsets = store_offsets;
|
28
|
+
return tvf;
|
29
|
+
}
|
30
|
+
|
31
|
+
/*
 * Release a TVField previously returned by tvf_create().
 *
 * Only the struct itself is freed.
 */
void tvf_destroy(void *p)
{
  free(p);
}
|
35
|
+
|
36
|
+
/*
 * Build a new TVTerm describing one term of a term vector.
 *
 * text      - the term text
 * freq      - number of occurrences of the term
 * positions - array of positions, or NULL when positions are not stored
 * offsets   - array of offset records, or NULL when offsets are not stored
 *
 * The pointers are stored as given; nothing is copied, so the arguments must
 * stay valid for as long as the TVTerm is in use.
 */
TVTerm *tvt_create(char *text, int freq, int *positions, TVOffsetInfo **offsets)
{
  TVTerm *term = ALLOC(TVTerm);

  term->offsets = offsets;
  term->positions = positions;
  term->freq = freq;
  term->text = text;

  return term;
}
|
45
|
+
|
46
|
+
/*
 * Release a TVTerm previously returned by tvt_create().
 *
 * Only the TVTerm struct itself is freed. The text, positions and offsets it
 * points to are deliberately left alone (tvt_create() stores the caller's
 * pointers without copying them), so whoever supplied that data remains
 * responsible for releasing it.
 */
void tvt_destroy(void *p)
{
  free(p);
}
|
60
|
+
|
61
|
+
|
62
|
+
/*
 * Open a TermVectorsWriter for the given segment.
 *
 * Three output files are created in +store+, named by appending
 * TVX_EXTENSION, TVD_EXTENSION and TVF_EXTENSION to +segment+, and
 * FORMAT_VERSION is written at the start of each of them.
 *
 * store   - store in which the output files are created
 * segment - segment name used as the file-name prefix
 * fis     - field infos used later to resolve field names to numbers
 *
 * Returns a newly allocated writer with no open document or field; release
 * it with tvw_close().
 *
 * The file name is assembled in a fixed-size buffer, so the segment name plus
 * the longest extension must fit in SEGMENT_NAME_MAX_LENGTH bytes including
 * the terminator. Names that do not fit are reported through eprintf()
 * instead of overflowing the buffer.
 * NOTE(review): this relies on eprintf() not returning, the same assumption
 * the NULL checks in tvw_write_field() make - confirm.
 */
TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis)
{
  char fname[SEGMENT_NAME_MAX_LENGTH];
  const size_t capacity = sizeof(fname);
  const size_t segment_len = strlen(segment);

  // Validate before allocating anything so that nothing leaks on failure.
  if (segment_len >= capacity
      || strlen(TVX_EXTENSION) >= capacity - segment_len
      || strlen(TVD_EXTENSION) >= capacity - segment_len
      || strlen(TVF_EXTENSION) >= capacity - segment_len) {
    eprintf(ERROR, "Segment name too long (%d characters)", (int)segment_len);
  }

  TermVectorsWriter *tvw = ALLOC(TermVectorsWriter);
  tvw->curr_field = NULL;
  tvw->curr_doc_pointer = -1;

  // Open files for TermVector storage; each starts with the format version.
  strcpy(fname, segment);

  strcpy(fname + segment_len, TVX_EXTENSION);
  tvw->tvx = store->create_output(store, fname);
  os_write_int(tvw->tvx, FORMAT_VERSION);

  strcpy(fname + segment_len, TVD_EXTENSION);
  tvw->tvd = store->create_output(store, fname);
  os_write_int(tvw->tvd, FORMAT_VERSION);

  strcpy(fname + segment_len, TVF_EXTENSION);
  tvw->tvf = store->create_output(store, fname);
  os_write_int(tvw->tvf, FORMAT_VERSION);

  tvw->fis = fis;

  // The field and term arrays are grown on demand.
  tvw->fields = NULL;
  tvw->fcnt = 0;
  tvw->fsize = 0;
  tvw->terms = NULL;
  tvw->tcnt = 0;
  tvw->tsize = 0;

  return tvw;
}
|
96
|
+
|
97
|
+
/*
 * Write the currently open field, with all terms added to it, to the tvf
 * stream.
 *
 * Layout written, in order:
 *   - vint: number of terms
 *   - byte: flags (STORE_POSITIONS_WITH_TERMVECTOR and/or
 *           STORE_OFFSET_WITH_TERMVECTOR)
 *   - per term: vint shared-prefix length, vint remaining length, the
 *     remaining characters, vint frequency, then (if stored) the positions
 *     and the offsets, both delta-encoded.
 *
 * The stream position at which the field starts is recorded in the field's
 * tvf_pointer before anything is written. tvw->curr_field must not be NULL.
 */
void tvw_write_field(TermVectorsWriter *tvw)
{
  int t, k;
  OutStream *out = tvw->tvf;

  // remember where this field is written
  tvw->curr_field->tvf_pointer = os_pos(out);

  // write the number of terms
  os_write_vint(out, tvw->tcnt);

  const int with_positions = tvw->curr_field->store_positions;
  const int with_offsets = tvw->curr_field->store_offsets;

  int flags = 0x0;
  if (with_positions) {
    flags |= STORE_POSITIONS_WITH_TERMVECTOR;
  }
  if (with_offsets) {
    flags |= STORE_OFFSET_WITH_TERMVECTOR;
  }
  os_write_byte(out, flags);

  // Each term is stored relative to the one before it; the first term is
  // compared against the empty string.
  char *prev_text = (char *)EMPTY_STRING;
  TVTerm **term_list = tvw->terms;

  for (t = 0; t < tvw->tcnt; t++) {
    TVTerm *cur = term_list[t];
    int shared = hlp_string_diff(prev_text, cur->text);
    int remaining = strlen(cur->text) - shared;

    os_write_vint(out, shared);                          // shared prefix length
    os_write_vint(out, remaining);                       // delta length
    os_write_chars(out, cur->text, shared, remaining);   // delta chars
    os_write_vint(out, cur->freq);
    prev_text = cur->text;

    if (with_positions) {
      if (cur->positions == NULL)
        eprintf(IO_ERROR, "Trying to write positions that are null!");

      // positions are written as the difference from the previous position
      int prev_pos = 0;
      for (k = 0; k < cur->freq; k++) {
        os_write_vint(out, cur->positions[k] - prev_pos);
        prev_pos = cur->positions[k];
      }
    }

    if (with_offsets) {
      if (cur->offsets == NULL)
        eprintf(IO_ERROR, "Trying to write offsets that are null!");

      // each start is written relative to the previous end, and each end
      // is written as the length of the range
      int prev_end = 0;
      for (k = 0; k < cur->freq; k++) {
        TVOffsetInfo *range = cur->offsets[k];
        os_write_vint(out, range->start - prev_end);
        os_write_vint(out, range->end - range->start);
        prev_end = range->end;
      }
    }
  }
}
|
162
|
+
|
163
|
+
/*
 * Finish the currently open field, if there is one.
 *
 * The field and its terms are written out with tvw_write_field(), the field
 * record is appended to tvw->fields (growing the array when needed), the
 * pending TVTerm records are destroyed and the writer is left with no open
 * field. Does nothing when no field is open.
 */
void tvw_close_field(TermVectorsWriter *tvw)
{
  int t;

  if (tvw->curr_field == NULL) {
    return;
  }

  // save field and terms
  tvw_write_field(tvw);

  // make room for one more field record: double the capacity, but never
  // go below the starting size
  if (tvw->fcnt >= tvw->fsize) {
    tvw->fsize *= 2;
    if (tvw->fsize < FIELD_ARR_START_SIZE) {
      tvw->fsize = FIELD_ARR_START_SIZE;
    }
    REALLOC_N(tvw->fields, TVField *, tvw->fsize);
  }
  tvw->fields[tvw->fcnt] = tvw->curr_field;
  tvw->fcnt++;

  // the terms have been written; drop their records and reset the count
  for (t = 0; t < tvw->tcnt; t++) {
    tvt_destroy(tvw->terms[t]);
  }
  tvw->tcnt = 0;

  tvw->curr_field = NULL;
}
|
187
|
+
|
188
|
+
/*
 * Start a new field identified by its number.
 *
 * Any field that is still open is closed (and therefore written) first; the
 * new field then becomes the writer's current field.
 *
 * field_number   - number of the field being started
 * store_position - non-zero if positions will be stored for its terms
 * store_offset   - non-zero if offsets will be stored for its terms
 */
void tvw_create_field(TermVectorsWriter *tvw,
                      int field_number, int store_position, int store_offset)
{
  TVField *next_field;

  tvw_close_field(tvw);

  next_field = tvf_create(field_number, store_position, store_offset);
  tvw->curr_field = next_field;
}
|
194
|
+
|
195
|
+
/*
 * Start a new field identified by name.
 *
 * The field's number and its position/offset storage flags are taken from the
 * writer's field infos, then the work is delegated to tvw_create_field().
 * NOTE(review): the result of fis_get_fi() is used without a NULL check, so
 * the field is presumably expected to exist in the field infos - confirm.
 */
void tvw_open_field(TermVectorsWriter *tvw, char *field)
{
  FieldInfo *info = fis_get_fi(tvw->fis, field);

  tvw_create_field(tvw,
                   info->number,
                   info->store_pos,
                   info->store_offset);
}
|
200
|
+
|
201
|
+
/*
 * Write the index records for the current document.
 *
 * The document's tvd pointer is appended to the tvx stream, and the tvd
 * stream receives the number of fields, then every field number, then every
 * field's tvf pointer encoded as the difference from the previous field's
 * pointer.
 *
 * Reports a STATE_ERROR through eprintf() if a field is still open.
 */
void tvw_write_doc(TermVectorsWriter *tvw)
{
  if (tvw->curr_field != NULL)
    eprintf(STATE_ERROR, "Field is still open while writing document");

  // write document index record
  os_write_long(tvw->tvx, tvw->curr_doc_pointer);

  OutStream *doc_out = tvw->tvd;

  // write the number of fields
  os_write_vint(doc_out, tvw->fcnt);

  int f;
  TVField **field_list = tvw->fields;

  // write field numbers
  for (f = 0; f < tvw->fcnt; f++) {
    os_write_vint(doc_out, field_list[f]->number);
  }

  // write field pointers, delta-encoded
  int prev_pointer = 0;
  for (f = 0; f < tvw->fcnt; f++) {
    TVField *entry = field_list[f];
    os_write_vint(doc_out, entry->tvf_pointer - prev_pointer);
    prev_pointer = entry->tvf_pointer;
  }
}
|
228
|
+
|
229
|
+
/*
 * Finish the currently open document, if there is one.
 *
 * Any open field is closed, the document's index records are written with
 * tvw_write_doc(), the collected field records are destroyed and the writer
 * is marked as having no open document (curr_doc_pointer == -1). Does
 * nothing when no document is open.
 */
void tvw_close_doc(TermVectorsWriter *tvw)
{
  int f;

  // a negative pointer means no document is open
  if (tvw->curr_doc_pointer < 0) {
    return;
  }

  tvw_close_field(tvw);
  tvw_write_doc(tvw);

  for (f = 0; f < tvw->fcnt; f++) {
    tvf_destroy(tvw->fields[f]);
  }
  tvw->fcnt = 0;
  tvw->curr_doc_pointer = -1;
}
|
243
|
+
|
244
|
+
/*
 * Start a new document.
 *
 * Any document that is still open is closed first. The current position of
 * the tvd stream is then remembered as this document's pointer; it is what
 * tvw_write_doc() later records in the tvx stream.
 */
void tvw_open_doc(TermVectorsWriter *tvw)
{
  tvw_close_doc(tvw);

  OutStream *doc_out = tvw->tvd;
  tvw->curr_doc_pointer = os_pos(doc_out);
}
|
249
|
+
|
250
|
+
/*
 * Queue a term for the currently open field.
 *
 * A TVTerm record wrapping the given arguments is appended to tvw->terms; the
 * array is doubled (starting at TERM_ARR_START_SIZE) whenever it is full.
 * The data is not copied, so it must stay valid until the field is written.
 *
 * text      - the term text
 * freq      - number of occurrences of the term
 * positions - positions of the occurrences, or NULL
 * offsets   - offsets of the occurrences, or NULL
 */
void tvw_add_term(TermVectorsWriter *tvw,
                  char *text, int freq, int *positions, TVOffsetInfo **offsets)
{
  if (tvw->tcnt >= tvw->tsize) {
    // double the capacity, but never go below the starting size
    tvw->tsize *= 2;
    if (tvw->tsize < TERM_ARR_START_SIZE) {
      tvw->tsize = TERM_ARR_START_SIZE;
    }
    REALLOC_N(tvw->terms, TVTerm *, tvw->tsize);
  }

  TVTerm *entry = tvt_create(text, freq, positions, offsets);
  tvw->terms[tvw->tcnt] = entry;
  tvw->tcnt++;
}
|
262
|
+
|
263
|
+
/*
 * Write a complete document made of the given term vectors.
 *
 * A document is opened, every TermVector in +vectors+ is written as one
 * field (its number looked up by name in the writer's field infos), and the
 * document is closed again.
 *
 * Positions and offsets are stored for a field only when the vector has at
 * least one term and the corresponding array is present; otherwise NULL is
 * passed to tvw_add_term() for that part.
 */
void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors)
{
  tvw_open_doc(tvw);

  int v, t;
  for (v = 0; v < vectors->size; v++) {
    TermVector *vec = vectors->elems[v];
    int has_positions = (vec->tcnt > 0 && vec->positions != NULL);
    int has_offsets = (vec->tcnt > 0 && vec->offsets != NULL);

    tvw_create_field(tvw, fis_get_number(tvw->fis, vec->field),
                     has_positions, has_offsets);

    for (t = 0; t < vec->tcnt; t++) {
      // only touch the optional arrays when they are actually present
      int *term_positions = has_positions ? vec->positions[t] : NULL;
      TVOffsetInfo **term_offsets = has_offsets ? vec->offsets[t] : NULL;

      tvw_add_term(tvw, vec->terms[t], vec->freqs[t],
                   term_positions, term_offsets);
    }

    tvw_close_field(tvw);
  }

  tvw_close_doc(tvw);
}
|
295
|
+
|
296
|
+
/*
 * Close the writer and release everything it holds.
 *
 * Any open document is finished first, then the three output streams are
 * closed and the term array, field array and the writer itself are freed.
 * The writer must not be used afterwards.
 */
void tvw_close(TermVectorsWriter *tvw)
{
  // flush whatever document is still pending
  tvw_close_doc(tvw);

  os_close(tvw->tvx);
  os_close(tvw->tvd);
  os_close(tvw->tvf);

  free(tvw->terms);
  free(tvw->fields);

  free(tvw);
}
|
306
|
+
|
307
|
+
TermVector *tv_create(
|
308
|
+
const char *field,
|
309
|
+
char **terms,
|
310
|
+
int tcnt,
|
311
|
+
int *freqs,
|
312
|
+
int **positions,
|
313
|
+
TVOffsetInfo ***offsets)
|
314
|
+
{
|
315
|
+
TermVector *tv =
|
316
|
+
ALLOC(TermVector);
|
317
|
+
tv->field = (char *)field;
|
318
|
+
tv->terms = terms;
|
319
|
+
tv->tcnt = tcnt;
|
320
|
+
tv->freqs = freqs;
|
321
|
+
tv->positions = positions;
|
322
|
+
tv->offsets = offsets;
|
323
|
+
return tv;
|
324
|
+
}
|
325
|
+
|
326
|
+
void tv_destroy(void *p)
|
327
|
+
{
|
328
|
+
int i, j;
|
329
|
+
TermVector *tv = (TermVector *)p;
|
330
|
+
for (i = 0; i < tv->tcnt; i++) {
|
331
|
+
free(tv->terms[i]);
|
332
|
+
}
|
333
|
+
free(tv->terms);
|
334
|
+
if (tv->positions != NULL) {
|
335
|
+
for (i = 0; i < tv->tcnt; i++) {
|
336
|
+
free(tv->positions[i]);
|
337
|
+
}
|
338
|
+
free(tv->positions);
|
339
|
+
}
|
340
|
+
if (tv->offsets != NULL) {
|
341
|
+
for (i = 0; i < tv->tcnt; i++) {
|
342
|
+
for (j = 0; j < tv->freqs[i]; j++) {
|
343
|
+
tvoi_destroy(tv->offsets[i][j]);
|
344
|
+
}
|
345
|
+
free(tv->offsets[i]);
|
346
|
+
}
|
347
|
+
free(tv->offsets);
|
348
|
+
}
|
349
|
+
free(tv->freqs);
|
350
|
+
free(p);
|
351
|
+
}
|
352
|
+
|
353
|
+
void tv_destroy_except_data(void *p)
|
354
|
+
{
|
355
|
+
TermVector *tv = (TermVector *)p;
|
356
|
+
free(tv->terms);
|
357
|
+
if (tv->positions != NULL) {
|
358
|
+
free(tv->positions);
|
359
|
+
}
|
360
|
+
if (tv->offsets != NULL) {
|
361
|
+
free(tv->offsets);
|
362
|
+
}
|
363
|
+
free(tv->freqs);
|
364
|
+
free(p);
|
365
|
+
}
|
366
|
+
|
367
|
+
/*
 * Read the format version from the stream and check it.
 *
 * An int is read from +is+; if it is greater than FORMAT_VERSION an ERROR is
 * reported through eprintf(). Returns the version that was read.
 */
int tvr_check_valid_format(InStream *is)
{
  const int version = is_read_int(is);

  if (version > FORMAT_VERSION) {
    eprintf(ERROR, "Incompatible format version: %d expected %d or less",
            version, FORMAT_VERSION);
  }

  return version;
}
|
375
|
+
|
376
|
+
/*
 * Duplicate a reader.
 *
 * The copy shares every member of +orig+ except the three input streams,
 * which are cloned with is_clone(). Returns NULL when any of the original's
 * streams is missing.
 */
TermVectorsReader *tvr_clone(TermVectorsReader *orig)
{
  if (!orig->tvx || !orig->tvd || !orig->tvf) {
    return NULL;
  }

  TermVectorsReader *copy = ALLOC(TermVectorsReader);

  // start from a member-for-member copy, then give it its own streams
  *copy = *orig;
  copy->tvx = is_clone(orig->tvx);
  copy->tvd = is_clone(orig->tvd);
  copy->tvf = is_clone(orig->tvf);

  return copy;
}
|
388
|
+
|
389
|
+
/*
 * Open a TermVectorsReader for the given segment.
 *
 * Three input files are opened from +store+, named by appending
 * TVX_EXTENSION, TVD_EXTENSION and TVF_EXTENSION to +segment+, and the format
 * version at the start of each is checked with tvr_check_valid_format(). The
 * versions of the tvd and tvf files are remembered in the reader.
 *
 * store   - store from which the input files are opened
 * segment - segment name used as the file-name prefix
 * fis     - field infos used to map field numbers to names
 *
 * Returns a newly allocated reader; release it with tvr_close().
 *
 * The file name is assembled in a fixed-size buffer, so the segment name plus
 * the longest extension must fit in SEGMENT_NAME_MAX_LENGTH bytes including
 * the terminator. Names that do not fit are reported through eprintf()
 * instead of overflowing the buffer.
 * NOTE(review): this relies on eprintf() not returning - confirm.
 */
TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis)
{
  char fname[SEGMENT_NAME_MAX_LENGTH];
  const size_t capacity = sizeof(fname);
  const size_t segment_len = strlen(segment);

  // Validate before allocating anything so that nothing leaks on failure.
  if (segment_len >= capacity
      || strlen(TVX_EXTENSION) >= capacity - segment_len
      || strlen(TVD_EXTENSION) >= capacity - segment_len
      || strlen(TVF_EXTENSION) >= capacity - segment_len) {
    eprintf(ERROR, "Segment name too long (%d characters)", (int)segment_len);
  }

  TermVectorsReader *tvr = ALLOC(TermVectorsReader);

  // Open files for TermVector storage
  strcpy(fname, segment);

  strcpy(fname + segment_len, TVX_EXTENSION);
  tvr->tvx = store->open_input(store, fname);
  tvr_check_valid_format(tvr->tvx);
  // The tvx file length divided by 8 is used as the reader's size.
  // NOTE(review): presumably one 8-byte pointer per document, with the
  // format header lost to integer division - confirm.
  tvr->size = is_length(tvr->tvx)/8;

  strcpy(fname + segment_len, TVD_EXTENSION);
  tvr->tvd = store->open_input(store, fname);
  tvr->tvd_format = tvr_check_valid_format(tvr->tvd);

  strcpy(fname + segment_len, TVF_EXTENSION);
  tvr->tvf = store->open_input(store, fname);
  tvr->tvf_format = tvr_check_valid_format(tvr->tvf);

  tvr->fis = fis;
  return tvr;
}
|
413
|
+
|
414
|
+
/*
 * Close the reader: its three input streams are closed and the reader struct
 * is freed. The field infos it refers to are not touched.
 */
void tvr_close(TermVectorsReader *tvr)
{
  InStream *index_in = tvr->tvx;
  InStream *doc_in = tvr->tvd;
  InStream *field_in = tvr->tvf;

  is_close(index_in);
  is_close(doc_in);
  is_close(field_in);

  free(tvr);
}
|
421
|
+
|
422
|
+
/*
 * Read one field's term vector from the tvf stream.
 *
 * tvr         - reader whose tvf stream is read
 * field       - field name stored (not copied) in the returned vector
 * tvf_pointer - absolute position of the field's data in the tvf stream
 *
 * Returns a newly allocated TermVector. Term strings, frequencies and, when
 * the stored flags say so, positions and offsets are all freshly allocated,
 * so the result can be released with tv_destroy(). A field with zero terms
 * yields a vector with tcnt == 0 and NULL arrays.
 *
 * Term texts are prefix-compressed against the previous term and rebuilt in
 * a fixed buffer of MAX_WORD_SIZE bytes. The prefix and delta lengths come
 * straight from the file, so they are range-checked before use; values that
 * would not fit are reported as an IO_ERROR through eprintf() instead of
 * writing past the end of the buffer.
 * NOTE(review): this relies on eprintf() not returning - confirm.
 */
TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
                                 char *field, int tvf_pointer)
{
  int i, j, store_positions, store_offsets, bits;
  InStream *tvf = tvr->tvf;

  // Now read the data from specified position
  // We don't need to offset by the FORMAT here since the pointer
  // already includes the offset
  is_seek(tvf, tvf_pointer);

  int num_terms = is_read_vint(tvf);
  // If no terms - return an empty termvector. However, this should
  // never occur!
  if (num_terms == 0)
    return tv_create(field, NULL, 0, NULL, NULL, NULL);

  if (tvr->tvf_format == FORMAT_VERSION) {
    bits = is_read_byte(tvf);
    store_positions = ((bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0);
    store_offsets = ((bits & STORE_OFFSET_WITH_TERMVECTOR) != 0);
  } else {
    // other formats carry a vint here that is read and discarded; they
    // store neither positions nor offsets
    is_read_vint(tvf);
    store_positions = false;
    store_offsets = false;
  }

  char **terms = ALLOC_N(char *, num_terms);
  int *term_freqs = ALLOC_N(int, num_terms);

  // we may not need these, but declare them
  int **positions = NULL;
  TVOffsetInfo ***offsets = NULL;

  if (store_positions)
    positions = ALLOC_N(int *, num_terms);

  if (store_offsets)
    offsets = ALLOC_N(TVOffsetInfo **, num_terms);

  int start, delta_length, total_length, freq, prev_pos;
  int start_offset, end_offset, prev_offset;
  int *pos;
  TVOffsetInfo **offs;
  // holds the previous term so that the shared prefix can be reused
  char buffer[MAX_WORD_SIZE] = "";

  for (i = 0; i < num_terms; i++) {
    start = is_read_vint(tvf);
    delta_length = is_read_vint(tvf);

    // start + delta_length + terminator must fit in the buffer; the test
    // is arranged so that the addition itself cannot overflow
    if (start < 0 || delta_length < 0
        || start >= MAX_WORD_SIZE
        || delta_length >= MAX_WORD_SIZE - start) {
      eprintf(IO_ERROR, "Corrupt term vector: term length out of range");
    }

    total_length = start + delta_length;
    is_read_chars(tvf, buffer, start, delta_length);
    buffer[total_length] = '\0';
    terms[i] = estrdup(buffer);
    freq = is_read_vint(tvf);
    term_freqs[i] = freq;

    if (store_positions) { // read in the positions
      pos = ALLOC_N(int, freq);
      positions[i] = pos;
      prev_pos = 0;
      // positions are stored as differences from the previous position
      for (j = 0; j < freq; j++) {
        pos[j] = prev_pos + is_read_vint(tvf);
        prev_pos = pos[j];
      }
    }

    if (store_offsets) {
      offs = ALLOC_N(TVOffsetInfo *, freq);
      offsets[i] = offs;
      prev_offset = 0;
      // each start is stored relative to the previous end, each end as
      // the length of the range
      for (j = 0; j < freq; j++) {
        start_offset = prev_offset + is_read_vint(tvf);
        end_offset = start_offset + is_read_vint(tvf);
        offs[j] = tvoi_create(start_offset, end_offset);
        prev_offset = end_offset;
      }
    }
  }
  return tv_create(field, terms, num_terms, term_freqs, positions, offsets);
}
|
500
|
+
|
501
|
+
/*
 * Read the term vectors of every vectorized field of a document.
 *
 * tvr     - reader to read from
 * doc_num - number of the document within the segment
 *
 * Returns an Array of TermVector (created with tv_destroy as its element
 * destructor), one per field recorded for the document, or NULL when the
 * reader has no tvx stream or the document has no vectorized fields.
 */
Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num)
{
  int i;

  // Check if no term vectors are available for this segment at all
  if (tvr->tvx == NULL) {
    return NULL;
  }

  // The tvx stream holds one 8-byte entry per document after the format
  // header, hence the FORMAT_SIZE offset.
  is_seek(tvr->tvx, (doc_num * 8) + FORMAT_SIZE);
  int doc_pointer = is_read_long(tvr->tvx);

  is_seek(tvr->tvd, doc_pointer);
  int field_count = is_read_vint(tvr->tvd);

  // No fields are vectorized for this document
  if (field_count <= 0) {
    return NULL;
  }

  // Resolve the field numbers to names. In the current format the numbers
  // are stored as-is; otherwise each one is a delta from the previous.
  int number = 0;
  char **field_names = ALLOC_N(char *, field_count);
  for (i = 0; i < field_count; i++) {
    if (tvr->tvd_format == FORMAT_VERSION)
      number = is_read_vint(tvr->tvd);
    else
      number += is_read_vint(tvr->tvd);

    field_names[i] = tvr->fis->by_number[number]->name;
  }

  // Compute the position of each field in the tvf file; the pointers are
  // stored as deltas.
  int tvf_position = 0;
  int *tvf_pointers = ALLOC_N(int, field_count);
  for (i = 0; i < field_count; i++) {
    tvf_position += is_read_vint(tvr->tvd);
    tvf_pointers[i] = tvf_position;
  }

  Array *vectors = ary_create(field_count, &tv_destroy);
  for (i = 0; i < field_count; i++) {
    ary_append(vectors,
               tvr_read_term_vector(tvr, field_names[i], tvf_pointers[i]));
  }

  free(field_names);
  free(tvf_pointers);

  return vectors;
}
|
547
|
+
|
548
|
+
/*
 * Read the term vector of a single field of a document.
 *
 * tvr     - reader to read from
 * doc_num - number of the document within the segment
 * field   - name of the field whose vector is wanted
 *
 * Returns a newly allocated TermVector, or NULL when the reader has no tvx
 * stream or the document has no vector recorded for the field.
 */
TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field)
{
  int i;
  int wanted_number = fis_get_number(tvr->fis, field);

  // Check if no term vectors are available for this segment at all
  if (tvr->tvx == NULL) {
    return NULL;
  }

  // We need to account for the FORMAT_SIZE when seeking in the tvx.
  // We don't need to do this in other seeks because we already have the
  // file pointer that was written in another file
  is_seek(tvr->tvx, (doc_num * 8) + FORMAT_SIZE);
  int doc_pointer = is_read_long(tvr->tvx);

  is_seek(tvr->tvd, doc_pointer);
  int field_count = is_read_vint(tvr->tvd);

  // There are only a few fields per document. We opt for a full scan
  // rather than requiring that they be ordered. We need to read through
  // all of the fields anyway to get to the tvf pointers.
  int number = 0;
  int match_index = -1;
  for (i = 0; i < field_count; i++) {
    if (tvr->tvd_format == FORMAT_VERSION)
      number = is_read_vint(tvr->tvd);
    else
      number += is_read_vint(tvr->tvd);

    if (number == wanted_number)
      match_index = i;
  }

  // This field, although valid in the segment, was not found in this
  // document
  if (match_index == -1) {
    return NULL;
  }

  // Compute the position in the tvf file: the pointers are stored as
  // deltas, so sum them up to and including the matching field.
  int tvf_position = 0;
  for (i = 0; i <= match_index; i++)
    tvf_position += is_read_vint(tvr->tvd);

  return tvr_read_term_vector(tvr, field, tvf_position);
}
|