ferret 0.3.2 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +9 -0
- data/Rakefile +51 -25
- data/ext/analysis.c +553 -0
- data/ext/analysis.h +76 -0
- data/ext/array.c +83 -0
- data/ext/array.h +19 -0
- data/ext/bitvector.c +164 -0
- data/ext/bitvector.h +29 -0
- data/ext/compound_io.c +335 -0
- data/ext/document.c +336 -0
- data/ext/document.h +87 -0
- data/ext/ferret.c +88 -47
- data/ext/ferret.h +43 -109
- data/ext/field.c +395 -0
- data/ext/filter.c +103 -0
- data/ext/fs_store.c +352 -0
- data/ext/global.c +219 -0
- data/ext/global.h +73 -0
- data/ext/hash.c +446 -0
- data/ext/hash.h +80 -0
- data/ext/hashset.c +141 -0
- data/ext/hashset.h +37 -0
- data/ext/helper.c +11 -0
- data/ext/helper.h +5 -0
- data/ext/inc/lang.h +41 -0
- data/ext/ind.c +389 -0
- data/ext/index.h +884 -0
- data/ext/index_io.c +269 -415
- data/ext/index_rw.c +2543 -0
- data/ext/lang.c +31 -0
- data/ext/lang.h +41 -0
- data/ext/priorityqueue.c +228 -0
- data/ext/priorityqueue.h +44 -0
- data/ext/q_boolean.c +1331 -0
- data/ext/q_const_score.c +154 -0
- data/ext/q_fuzzy.c +287 -0
- data/ext/q_match_all.c +142 -0
- data/ext/q_multi_phrase.c +343 -0
- data/ext/q_parser.c +2180 -0
- data/ext/q_phrase.c +657 -0
- data/ext/q_prefix.c +75 -0
- data/ext/q_range.c +247 -0
- data/ext/q_span.c +1566 -0
- data/ext/q_term.c +308 -0
- data/ext/q_wildcard.c +146 -0
- data/ext/r_analysis.c +255 -0
- data/ext/r_doc.c +578 -0
- data/ext/r_index_io.c +996 -0
- data/ext/r_qparser.c +158 -0
- data/ext/r_search.c +2321 -0
- data/ext/r_store.c +263 -0
- data/ext/r_term.c +219 -0
- data/ext/ram_store.c +447 -0
- data/ext/search.c +524 -0
- data/ext/search.h +1065 -0
- data/ext/similarity.c +143 -39
- data/ext/sort.c +661 -0
- data/ext/store.c +35 -0
- data/ext/store.h +152 -0
- data/ext/term.c +704 -143
- data/ext/termdocs.c +599 -0
- data/ext/vector.c +594 -0
- data/lib/ferret.rb +9 -10
- data/lib/ferret/analysis/analyzers.rb +2 -2
- data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
- data/lib/ferret/analysis/token.rb +14 -14
- data/lib/ferret/analysis/token_filters.rb +3 -3
- data/lib/ferret/document/field.rb +16 -17
- data/lib/ferret/index/document_writer.rb +4 -4
- data/lib/ferret/index/index.rb +39 -23
- data/lib/ferret/index/index_writer.rb +2 -2
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
- data/lib/ferret/index/segment_term_vector.rb +4 -4
- data/lib/ferret/index/term.rb +5 -1
- data/lib/ferret/index/term_vector_offset_info.rb +6 -6
- data/lib/ferret/index/term_vectors_io.rb +5 -5
- data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
- data/lib/ferret/search.rb +1 -1
- data/lib/ferret/search/boolean_query.rb +2 -1
- data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
- data/lib/ferret/search/fuzzy_query.rb +2 -1
- data/lib/ferret/search/index_searcher.rb +3 -0
- data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
- data/lib/ferret/search/multi_phrase_query.rb +6 -5
- data/lib/ferret/search/phrase_query.rb +3 -6
- data/lib/ferret/search/prefix_query.rb +4 -4
- data/lib/ferret/search/sort.rb +3 -1
- data/lib/ferret/search/sort_field.rb +9 -9
- data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
- data/lib/ferret/search/spans/span_near_query.rb +1 -1
- data/lib/ferret/search/spans/span_weight.rb +1 -1
- data/lib/ferret/search/spans/spans_enum.rb +7 -7
- data/lib/ferret/store/fs_store.rb +10 -6
- data/lib/ferret/store/ram_store.rb +3 -3
- data/lib/rferret.rb +36 -0
- data/test/functional/thread_safety_index_test.rb +2 -2
- data/test/test_helper.rb +16 -2
- data/test/unit/analysis/c_token.rb +25 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
- data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
- data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
- data/test/unit/document/c_field.rb +98 -0
- data/test/unit/document/tc_field.rb +0 -66
- data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
- data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
- data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
- data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
- data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
- data/test/unit/index/tc_segment_term_vector.rb +2 -2
- data/test/unit/index/tc_term_vectors_io.rb +4 -4
- data/test/unit/query_parser/c_query_parser.rb +138 -0
- data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
- data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
- data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
- data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
- data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
- data/test/unit/search/c_sort_field.rb +27 -0
- data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +7 -20
- data/test/unit/store/c_fs_store.rb +76 -0
- data/test/unit/store/c_ram_store.rb +35 -0
- data/test/unit/store/m_store.rb +34 -0
- data/test/unit/store/m_store_lock.rb +68 -0
- data/test/unit/store/tc_fs_store.rb +0 -53
- data/test/unit/store/tc_ram_store.rb +0 -20
- data/test/unit/store/tm_store.rb +0 -30
- data/test/unit/store/tm_store_lock.rb +0 -66
- metadata +84 -31
- data/ext/Makefile +0 -140
- data/ext/ferret_ext.so +0 -0
- data/ext/priority_queue.c +0 -232
- data/ext/ram_directory.c +0 -321
- data/ext/segment_merge_queue.c +0 -37
- data/ext/segment_term_enum.c +0 -326
- data/ext/string_helper.c +0 -42
- data/ext/tags +0 -344
- data/ext/term_buffer.c +0 -230
- data/ext/term_infos_reader.c +0 -54
- data/ext/terminfo.c +0 -160
- data/ext/token.c +0 -93
- data/ext/util.c +0 -12
data/ext/index.h
ADDED
@@ -0,0 +1,884 @@
|
|
1
|
+
#ifndef FRT_INDEX_H
|
2
|
+
#define FRT_INDEX_H
|
3
|
+
|
4
|
+
#include <limits.h>
|
5
|
+
#include "global.h"
|
6
|
+
#include "array.h"
|
7
|
+
#include "bitvector.h"
|
8
|
+
#include "hashset.h"
|
9
|
+
#include "priorityqueue.h"
|
10
|
+
#include "hash.h"
|
11
|
+
#include "store.h"
|
12
|
+
#include "document.h"
|
13
|
+
#include "analysis.h"
|
14
|
+
|
15
|
+
#define SEGMENT_NAME_MAX_LENGTH 100
|
16
|
+
|
17
|
+
typedef struct Config {
|
18
|
+
int merge_factor;
|
19
|
+
int min_merge_docs;
|
20
|
+
int max_merge_docs;
|
21
|
+
int max_field_length;
|
22
|
+
int term_index_interval;
|
23
|
+
} FerretConfig;
|
24
|
+
|
25
|
+
extern FerretConfig config;
|
26
|
+
|
27
|
+
typedef struct IndexReader IndexReader;
|
28
|
+
typedef struct IndexWriter IndexWriter;
|
29
|
+
typedef struct SegmentReader SegmentReader;
|
30
|
+
|
31
|
+
/***************************************************************************
|
32
|
+
*
|
33
|
+
* CacheObject
|
34
|
+
*
|
35
|
+
***************************************************************************/
|
36
|
+
|
37
|
+
typedef struct CacheObject {
|
38
|
+
HshTable *ref_tab1;
|
39
|
+
HshTable *ref_tab2;
|
40
|
+
void *ref1;
|
41
|
+
void *ref2;
|
42
|
+
void *obj;
|
43
|
+
void (*destroy)(void *p);
|
44
|
+
} CacheObject;
|
45
|
+
|
46
|
+
void cache_destroy(CacheObject *co);
|
47
|
+
CacheObject *co_create(HshTable *ref_tab1, HshTable *ref_tab2,
|
48
|
+
void *ref1, void *ref2, void (*destroy)(void *p), void *obj);
|
49
|
+
unsigned int co_hash(const void *key);
|
50
|
+
int co_eq(const void *key1, const void *key2);
|
51
|
+
HshTable *co_hsh_create(void); /* takes no arguments; (void) makes this a real prototype so stray arguments are rejected */
|
52
|
+
|
53
|
+
/****************************************************************************
|
54
|
+
*
|
55
|
+
* FieldInfo
|
56
|
+
*
|
57
|
+
****************************************************************************/
|
58
|
+
|
59
|
+
typedef struct FieldInfo {
|
60
|
+
char *name;
|
61
|
+
int number;
|
62
|
+
bool is_indexed : 1;
|
63
|
+
bool store_tv : 1;
|
64
|
+
bool store_offset : 1;
|
65
|
+
bool store_pos : 1;
|
66
|
+
bool omit_norms : 1;
|
67
|
+
} FieldInfo;
|
68
|
+
|
69
|
+
FieldInfo *fi_create(char *name,
|
70
|
+
int number,
|
71
|
+
bool is_indexed,
|
72
|
+
bool store_tv,
|
73
|
+
bool store_pos,
|
74
|
+
bool store_offset,
|
75
|
+
bool omit_norms);
|
76
|
+
void fi_destroy(void *p);
|
77
|
+
|
78
|
+
/****************************************************************************
|
79
|
+
*
|
80
|
+
* FieldInfos
|
81
|
+
*
|
82
|
+
****************************************************************************/
|
83
|
+
|
84
|
+
typedef struct FieldInfos {
|
85
|
+
HashEntry **by_name;
|
86
|
+
FieldInfo **by_number;
|
87
|
+
int fcnt;
|
88
|
+
} FieldInfos;
|
89
|
+
|
90
|
+
FieldInfos *fis_create(void); /* takes no arguments; (void) makes this a real prototype so stray arguments are rejected */
|
91
|
+
FieldInfos *fis_open(Store *store, char *filename);
|
92
|
+
void fis_destroy(void *p);
|
93
|
+
FieldInfo *fis_add(FieldInfos *fis,
|
94
|
+
char *name,
|
95
|
+
bool is_indexed,
|
96
|
+
bool store_tv,
|
97
|
+
bool store_offset,
|
98
|
+
bool store_pos,
|
99
|
+
bool omit_norms);
|
100
|
+
|
101
|
+
void fis_add_fields(FieldInfos *fis,
|
102
|
+
HashSet *field_names,
|
103
|
+
bool is_indexed,
|
104
|
+
bool store_tv,
|
105
|
+
bool store_offset,
|
106
|
+
bool store_pos,
|
107
|
+
bool omit_norms);
|
108
|
+
bool fis_has_vectors(FieldInfos *fis);
|
109
|
+
void fis_write(FieldInfos *fis, Store *store, char *segment, char *ext);
|
110
|
+
FieldInfos *fis_read(FieldInfos *fis, InStream *is);
|
111
|
+
FieldInfos *fis_add_doc(FieldInfos *fis, Document *doc);
|
112
|
+
unsigned long long fis_get_number(FieldInfos *fis, char *name);
|
113
|
+
FieldInfo *fis_get_fi(FieldInfos *fis, char *name);
|
114
|
+
|
115
|
+
|
116
|
+
/****************************************************************************
|
117
|
+
*
|
118
|
+
* Term
|
119
|
+
*
|
120
|
+
****************************************************************************/
|
121
|
+
|
122
|
+
typedef struct Term {
|
123
|
+
char *field;
|
124
|
+
char *text;
|
125
|
+
} Term;
|
126
|
+
|
127
|
+
Term *term_clone(Term *term);
|
128
|
+
Term *term_create(const char *field, char *text);
|
129
|
+
void term_destroy(void *p);
|
130
|
+
int term_cmp(void *t1, void *t2);
|
131
|
+
int term_eq(const void *t1, const void *t2);
|
132
|
+
unsigned int term_hash(const void *t);
|
133
|
+
char *term_to_s(Term *term);
|
134
|
+
|
135
|
+
/****************************************************************************
|
136
|
+
*
|
137
|
+
* TermBuffer
|
138
|
+
*
|
139
|
+
****************************************************************************/
|
140
|
+
|
141
|
+
typedef struct TermBuffer {
|
142
|
+
char *field;
|
143
|
+
char text[MAX_WORD_SIZE];
|
144
|
+
} TermBuffer;
|
145
|
+
|
146
|
+
TermBuffer *tb_create(void); /* takes no arguments; (void) makes this a real prototype so stray arguments are rejected */
|
147
|
+
void tb_destroy(void *p);
|
148
|
+
TermBuffer *tb_set_term(TermBuffer *tb, Term *t);
|
149
|
+
Term *tb_get_term(TermBuffer *tb);
|
150
|
+
int tb_cmp(TermBuffer *tb1, TermBuffer *tb2);
|
151
|
+
int tb_term_cmp(TermBuffer *tb, Term *t);
|
152
|
+
TermBuffer *tb_cpy(TermBuffer *tb1, TermBuffer *tb2);
|
153
|
+
TermBuffer *tb_read(TermBuffer *tb, InStream *is, FieldInfos *fis);
|
154
|
+
|
155
|
+
/****************************************************************************
|
156
|
+
*
|
157
|
+
* TermInfo
|
158
|
+
*
|
159
|
+
****************************************************************************/
|
160
|
+
|
161
|
+
typedef struct TermInfo {
|
162
|
+
int doc_freq;
|
163
|
+
int freq_pointer;
|
164
|
+
int prox_pointer;
|
165
|
+
int skip_offset;
|
166
|
+
} TermInfo;
|
167
|
+
|
168
|
+
TermInfo *ti_create(int doc_freq, int freq_pointer, int prox_pointer, int skip_offset);
|
169
|
+
TermInfo *ti_set(TermInfo *ti, int df, int fp, int pp, int so);
|
170
|
+
void ti_destroy(void *p);
|
171
|
+
TermInfo *ti_cpy(TermInfo *ti1, TermInfo *ti2);
|
172
|
+
TermInfo *ti_clone(TermInfo *other);
|
173
|
+
int ti_eq(TermInfo *ti1, TermInfo *ti2);
|
174
|
+
|
175
|
+
/****************************************************************************
|
176
|
+
*
|
177
|
+
* TermEnum
|
178
|
+
*
|
179
|
+
****************************************************************************/
|
180
|
+
|
181
|
+
typedef struct TermEnumFilter TermEnumFilter;
|
182
|
+
typedef struct TermEnum TermEnum;
|
183
|
+
struct TermEnum {
|
184
|
+
void *data;
|
185
|
+
TermBuffer *(*next)(TermEnum *te);
|
186
|
+
void (*close)(TermEnum *te);
|
187
|
+
TermEnum *(*clone)(TermEnum *te);
|
188
|
+
TermBuffer *tb_curr;
|
189
|
+
TermBuffer *tb_prev;
|
190
|
+
TermInfo *ti_curr;
|
191
|
+
};
|
192
|
+
|
193
|
+
TermBuffer *te_skip_to(struct TermEnum *te, Term *t);
|
194
|
+
|
195
|
+
Term *te_get_term(struct TermEnum *te);
|
196
|
+
TermInfo *te_get_ti(struct TermEnum *te);
|
197
|
+
|
198
|
+
/* * SegmentTermEnum * */
|
199
|
+
|
200
|
+
typedef struct SegmentTermEnum {
|
201
|
+
FieldInfos *fis;
|
202
|
+
int is_index;
|
203
|
+
InStream *is;
|
204
|
+
int size;
|
205
|
+
int pos;
|
206
|
+
int index_pointer;
|
207
|
+
int index_interval;
|
208
|
+
int skip_interval;
|
209
|
+
int format_m1skip_interval;
|
210
|
+
int format;
|
211
|
+
} SegmentTermEnum;
|
212
|
+
|
213
|
+
|
214
|
+
TermEnum *ste_create(InStream *is, FieldInfos *fis, int is_index);
|
215
|
+
TermBuffer *ste_next(struct TermEnum *te);
|
216
|
+
void ste_close(struct TermEnum *te);
|
217
|
+
|
218
|
+
/* * MultiTermEnum * */
|
219
|
+
|
220
|
+
typedef struct MultiTermEnum {
|
221
|
+
int doc_freq;
|
222
|
+
PriorityQueue *smi_queue;
|
223
|
+
} MultiTermEnum;
|
224
|
+
|
225
|
+
TermEnum *mte_create(IndexReader **readers, int *starts, int rcnt, Term *term);
|
226
|
+
|
227
|
+
/****************************************************************************
|
228
|
+
*
|
229
|
+
* TermInfosWriter
|
230
|
+
*
|
231
|
+
****************************************************************************/
|
232
|
+
|
233
|
+
#define TERM_INFO_FORMAT (-2) /* parenthesized so the negative literal stays one operand wherever the macro expands */
|
234
|
+
|
235
|
+
typedef struct TermInfosWriter {
|
236
|
+
int index_interval;
|
237
|
+
int skip_interval;
|
238
|
+
int size;
|
239
|
+
int last_index_pointer;
|
240
|
+
bool is_index;
|
241
|
+
OutStream *os;
|
242
|
+
struct TermInfosWriter *other;
|
243
|
+
Term *last_term;
|
244
|
+
TermInfo *last_term_info;
|
245
|
+
FieldInfos *fis;
|
246
|
+
char *curr_field;
|
247
|
+
int curr_field_num;
|
248
|
+
} TermInfosWriter;
|
249
|
+
|
250
|
+
TermInfosWriter *tiw_open(Store *store,
|
251
|
+
char *segment,
|
252
|
+
FieldInfos *fis,
|
253
|
+
int interval);
|
254
|
+
void tiw_add(TermInfosWriter *tiw, Term *t, TermInfo *ti);
|
255
|
+
void tiw_close(TermInfosWriter *tiw);
|
256
|
+
|
257
|
+
/****************************************************************************
|
258
|
+
*
|
259
|
+
* TermInfosReader
|
260
|
+
*
|
261
|
+
****************************************************************************/
|
262
|
+
|
263
|
+
typedef struct TermInfosReader {
|
264
|
+
mutex_t mutex;
|
265
|
+
TermEnum *orig_te;
|
266
|
+
thread_key_t thread_te;
|
267
|
+
Array *te_bucket;
|
268
|
+
TermEnum *index_te;
|
269
|
+
int size;
|
270
|
+
int skip_interval;
|
271
|
+
int index_size;
|
272
|
+
Term **index_terms;
|
273
|
+
TermInfo **index_term_infos;
|
274
|
+
int *index_pointers;
|
275
|
+
} TermInfosReader;
|
276
|
+
|
277
|
+
TermInfosReader *tir_open(Store *store, char *segment, FieldInfos *fis);
|
278
|
+
void tir_close(TermInfosReader *tir);
|
279
|
+
Term *tir_get_term(TermInfosReader *tir, int position);
|
280
|
+
int tir_get_term_pos(TermInfosReader *tir, Term *t);
|
281
|
+
TermInfo *tir_get_ti(TermInfosReader *tir, Term *t);
|
282
|
+
|
283
|
+
/****************************************************************************
|
284
|
+
*
|
285
|
+
* TVOffsetInfo
|
286
|
+
*
|
287
|
+
****************************************************************************/
|
288
|
+
|
289
|
+
typedef struct TVOffsetInfo {
|
290
|
+
int start;
|
291
|
+
int end;
|
292
|
+
} TVOffsetInfo;
|
293
|
+
|
294
|
+
TVOffsetInfo *tvoi_create(int start, int end);
|
295
|
+
void tvoi_destroy(void *p);
|
296
|
+
|
297
|
+
/****************************************************************************
|
298
|
+
*
|
299
|
+
* TVField
|
300
|
+
*
|
301
|
+
****************************************************************************/
|
302
|
+
|
303
|
+
typedef struct TVField {
|
304
|
+
int tvf_pointer;
|
305
|
+
int number;
|
306
|
+
unsigned int store_positions : 1;
|
307
|
+
unsigned int store_offsets : 1;
|
308
|
+
} TVField;
|
309
|
+
|
310
|
+
TVField *tvf_create(int number, int store_positions, int store_offsets);
|
311
|
+
void tvf_destroy(void *p);
|
312
|
+
|
313
|
+
/****************************************************************************
|
314
|
+
*
|
315
|
+
* TVTerm
|
316
|
+
*
|
317
|
+
****************************************************************************/
|
318
|
+
|
319
|
+
typedef struct TVTerm {
|
320
|
+
char *text;
|
321
|
+
int freq;
|
322
|
+
int *positions;
|
323
|
+
TVOffsetInfo **offsets;
|
324
|
+
} TVTerm;
|
325
|
+
|
326
|
+
TVTerm *tvt_create(char *text,
|
327
|
+
int freq,
|
328
|
+
int *positions,
|
329
|
+
TVOffsetInfo **offsets);
|
330
|
+
void tvt_destroy(void *p);
|
331
|
+
|
332
|
+
/****************************************************************************
|
333
|
+
*
|
334
|
+
* TermVector
|
335
|
+
*
|
336
|
+
****************************************************************************/
|
337
|
+
|
338
|
+
typedef struct TermVector {
|
339
|
+
char *field;
|
340
|
+
char **terms;
|
341
|
+
int tcnt;
|
342
|
+
int *freqs;
|
343
|
+
int **positions;
|
344
|
+
TVOffsetInfo ***offsets;
|
345
|
+
} TermVector;
|
346
|
+
|
347
|
+
TermVector *tv_create(const char *field,
|
348
|
+
char **terms,
|
349
|
+
int tcnt,
|
350
|
+
int *freqs,
|
351
|
+
int **positions,
|
352
|
+
TVOffsetInfo ***offsets);
|
353
|
+
void tv_destroy(void *p);
|
354
|
+
|
355
|
+
/****************************************************************************
|
356
|
+
*
|
357
|
+
* TermVectorsWriter
|
358
|
+
*
|
359
|
+
****************************************************************************/
|
360
|
+
|
361
|
+
#define STORE_POSITIONS_WITH_TERMVECTOR 0x1
|
362
|
+
#define STORE_OFFSET_WITH_TERMVECTOR 0x2
|
363
|
+
|
364
|
+
#define FORMAT_VERSION 2
|
365
|
+
#define FORMAT_SIZE 4
|
366
|
+
|
367
|
+
#define TVX_EXTENSION ".tvx"
|
368
|
+
#define TVD_EXTENSION ".tvd"
|
369
|
+
#define TVF_EXTENSION ".tvf"
|
370
|
+
|
371
|
+
typedef struct TermVectorsWriter {
|
372
|
+
TVField *curr_field;
|
373
|
+
int curr_doc_pointer;
|
374
|
+
OutStream *tvx;
|
375
|
+
OutStream *tvd;
|
376
|
+
OutStream *tvf;
|
377
|
+
FieldInfos *fis;
|
378
|
+
TVField **fields;
|
379
|
+
int fcnt;
|
380
|
+
int fsize;
|
381
|
+
TVTerm **terms;
|
382
|
+
int tcnt;
|
383
|
+
int tsize;
|
384
|
+
} TermVectorsWriter;
|
385
|
+
|
386
|
+
TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis);
|
387
|
+
void tvw_close(TermVectorsWriter *tvw);
|
388
|
+
void tvw_open_doc(TermVectorsWriter *tvw);
|
389
|
+
void tvw_close_doc(TermVectorsWriter *tvw);
|
390
|
+
void tvw_open_field(TermVectorsWriter *tvw, char *field);
|
391
|
+
void tvw_close_field(TermVectorsWriter *tvw);
|
392
|
+
void tvw_add_term(TermVectorsWriter *tvw, char *text, int freq, int *positions, TVOffsetInfo **offsets);
|
393
|
+
void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors);
|
394
|
+
|
395
|
+
|
396
|
+
/****************************************************************************
|
397
|
+
*
|
398
|
+
* TermVectorsReader
|
399
|
+
*
|
400
|
+
****************************************************************************/
|
401
|
+
|
402
|
+
typedef struct TermVectorsReader {
|
403
|
+
int size;
|
404
|
+
InStream *tvx;
|
405
|
+
InStream *tvd;
|
406
|
+
InStream *tvf;
|
407
|
+
FieldInfos *fis;
|
408
|
+
int tvd_format;
|
409
|
+
int tvf_format;
|
410
|
+
} TermVectorsReader;
|
411
|
+
|
412
|
+
TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis);
|
413
|
+
TermVectorsReader *tvr_clone(TermVectorsReader *orig);
|
414
|
+
void tvr_close(TermVectorsReader *tvr);
|
415
|
+
TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
|
416
|
+
char *field, int tvf_pointer);
|
417
|
+
Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num);
|
418
|
+
TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field);
|
419
|
+
|
420
|
+
/****************************************************************************
|
421
|
+
*
|
422
|
+
* FieldsWriter
|
423
|
+
*
|
424
|
+
****************************************************************************/
|
425
|
+
|
426
|
+
#define FIELD_IS_TOKENIZED 0x1 /* single-bit mask, bit 0 */
|
427
|
+
#define FIELD_IS_BINARY 0x2 /* single-bit mask, bit 1 */
|
428
|
+
#define FIELD_IS_COMPRESSED 0x4 /* single-bit mask, bit 2 */
|
429
|
+
|
430
|
+
typedef struct FieldsWriter {
|
431
|
+
FieldInfos *fis;
|
432
|
+
OutStream *fields_out;
|
433
|
+
OutStream *index_out;
|
434
|
+
} FieldsWriter;
|
435
|
+
|
436
|
+
FieldsWriter *fw_open(Store *store, char *segment, FieldInfos *fis);
|
437
|
+
void fw_close(FieldsWriter *fw);
|
438
|
+
void fw_add_doc(FieldsWriter *fw, Document *doc);
|
439
|
+
|
440
|
+
/****************************************************************************
|
441
|
+
*
|
442
|
+
* TermDocEnum
|
443
|
+
*
|
444
|
+
****************************************************************************/
|
445
|
+
|
446
|
+
typedef struct TermDocEnum TermDocEnum;
|
447
|
+
struct TermDocEnum {
|
448
|
+
void *data;
|
449
|
+
void (*seek)(TermDocEnum *tde, Term *term);
|
450
|
+
int (*doc_num)(TermDocEnum *tde);
|
451
|
+
int (*freq)(TermDocEnum *tde);
|
452
|
+
bool (*next)(TermDocEnum *tde);
|
453
|
+
int (*read)(TermDocEnum *tde, int *docs, int *freqs, int req_num);
|
454
|
+
bool (*skip_to)(TermDocEnum *tde, int target);
|
455
|
+
int (*next_position)(TermDocEnum *tde);
|
456
|
+
void (*close)(TermDocEnum *tde);
|
457
|
+
};
|
458
|
+
|
459
|
+
/* * SegmentTermDocEnum * */
|
460
|
+
|
461
|
+
typedef struct SegmentTermDocEnum SegmentTermDocEnum;
|
462
|
+
struct SegmentTermDocEnum {
|
463
|
+
SegmentReader *parent;
|
464
|
+
InStream *freq_in;
|
465
|
+
int count; // the number of docs for this term that we have skipped
|
466
|
+
int doc_freq; // the number of docs this term appears in
|
467
|
+
BitVector *deleted_docs;
|
468
|
+
int doc_num;
|
469
|
+
int freq;
|
470
|
+
int skip_interval;
|
471
|
+
int num_skips;
|
472
|
+
int skip_count;
|
473
|
+
InStream *skip_in;
|
474
|
+
int skip_doc;
|
475
|
+
int freq_pointer;
|
476
|
+
int prox_pointer;
|
477
|
+
int skip_pointer;
|
478
|
+
unsigned int have_skipped : 1;
|
479
|
+
void (*skip_prox)(SegmentTermDocEnum *stde);
|
480
|
+
InStream *prox_in;
|
481
|
+
int prox_cnt;
|
482
|
+
int position;
|
483
|
+
void (*seek_prox)(SegmentTermDocEnum *stde, int prox_pointer);
|
484
|
+
};
|
485
|
+
|
486
|
+
TermDocEnum *stde_create(IndexReader *ir);
|
487
|
+
void stde_seek_ti(TermDocEnum *tde, TermInfo *ti);
|
488
|
+
|
489
|
+
/* * SegmentTermPosEnum * */
|
490
|
+
TermDocEnum *stpe_create(IndexReader *ir);
|
491
|
+
|
492
|
+
/* * MultiTermDocEnum * */
|
493
|
+
typedef struct MultiTermDocEnum MultiTermDocEnum;
|
494
|
+
struct MultiTermDocEnum {
|
495
|
+
IndexReader **irs;
|
496
|
+
int *starts;
|
497
|
+
int ir_cnt;
|
498
|
+
Term *term;
|
499
|
+
int base;
|
500
|
+
int pointer;
|
501
|
+
TermDocEnum **irs_tde;
|
502
|
+
TermDocEnum *curr_tde;
|
503
|
+
TermDocEnum *(*term_docs_from_reader)(IndexReader *ir);
|
504
|
+
};
|
505
|
+
|
506
|
+
TermDocEnum *mtde_create(IndexReader **readers, int *starts, int ir_cnt);
|
507
|
+
|
508
|
+
/* * MultiTermPosEnum * */
|
509
|
+
TermDocEnum *mtpe_create(IndexReader **readers, int *starts, int ir_cnt);
|
510
|
+
|
511
|
+
/****************************************************************************
|
512
|
+
* MultipleTermDocPosEnum
|
513
|
+
****************************************************************************/
|
514
|
+
|
515
|
+
#define MTDPE_POS_QUEUE_INIT_CAPA 8
|
516
|
+
typedef struct {
|
517
|
+
int doc_num;
|
518
|
+
int freq;
|
519
|
+
PriorityQueue *pq;
|
520
|
+
int *pos_queue;
|
521
|
+
int pos_queue_index;
|
522
|
+
int pos_queue_capa;
|
523
|
+
} MultipleTermDocPosEnum;
|
524
|
+
|
525
|
+
TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt);
|
526
|
+
|
527
|
+
/****************************************************************************
|
528
|
+
*
|
529
|
+
* FieldsReader
|
530
|
+
*
|
531
|
+
****************************************************************************/
|
532
|
+
|
533
|
+
typedef struct FieldsReader {
|
534
|
+
int len;
|
535
|
+
FieldInfos *fis;
|
536
|
+
InStream *fields_in;
|
537
|
+
InStream *index_in;
|
538
|
+
} FieldsReader;
|
539
|
+
|
540
|
+
FieldsReader *fr_open(Store *store, char *segment, FieldInfos *fis);
|
541
|
+
void fr_close(FieldsReader *fr);
|
542
|
+
Document *fr_get_doc(FieldsReader *fr, int doc_num);
|
543
|
+
|
544
|
+
/****************************************************************************
|
545
|
+
*
|
546
|
+
* Posting
|
547
|
+
*
|
548
|
+
****************************************************************************/
|
549
|
+
|
550
|
+
typedef struct Posting {
|
551
|
+
Term *term;
|
552
|
+
int freq;
|
553
|
+
int size;
|
554
|
+
int *positions;
|
555
|
+
TVOffsetInfo **offsets;
|
556
|
+
} Posting;
|
557
|
+
|
558
|
+
Posting *p_create(Term *term, int position, TVOffsetInfo *offset);
|
559
|
+
void p_destroy(void *p);
|
560
|
+
void p_add_occurance(Posting *p, int position, TVOffsetInfo *offset);
|
561
|
+
|
562
|
+
|
563
|
+
/****************************************************************************
|
564
|
+
*
|
565
|
+
* DocumentWriter
|
566
|
+
*
|
567
|
+
****************************************************************************/
|
568
|
+
|
569
|
+
#include "search.h"
|
570
|
+
|
571
|
+
typedef struct DocumentWriter {
|
572
|
+
Store *store;
|
573
|
+
Analyzer *analyzer;
|
574
|
+
Similarity *similarity;
|
575
|
+
HshTable *postingtable;
|
576
|
+
int pcnt;
|
577
|
+
FieldInfos *fis;
|
578
|
+
float *field_boosts;
|
579
|
+
int *field_lengths;
|
580
|
+
int *field_positions;
|
581
|
+
int *field_offsets;
|
582
|
+
int max_field_length;
|
583
|
+
int term_index_interval;
|
584
|
+
} DocumentWriter;
|
585
|
+
|
586
|
+
DocumentWriter *dw_open(Store *store, Analyzer *analyzer,
|
587
|
+
Similarity *similarity, int max_field_length, int term_index_interval);
|
588
|
+
void dw_close(DocumentWriter *dw);
|
589
|
+
void dw_add_doc(DocumentWriter *dw, char *segment, Document *doc);
|
590
|
+
|
591
|
+
/****************************************************************************
|
592
|
+
*
|
593
|
+
* SegmentInfo
|
594
|
+
*
|
595
|
+
****************************************************************************/
|
596
|
+
|
597
|
+
typedef struct SegmentInfo {
|
598
|
+
char *name;
|
599
|
+
int doc_cnt;
|
600
|
+
Store *store;
|
601
|
+
} SegmentInfo;
|
602
|
+
|
603
|
+
SegmentInfo *si_create(char *name, int doc_cnt, Store *store);
|
604
|
+
void si_destroy(void *p);
|
605
|
+
bool si_has_deletions(SegmentInfo *si);
|
606
|
+
bool si_uses_compound_file(SegmentInfo *si);
|
607
|
+
bool si_has_separate_norms(SegmentInfo *si);
|
608
|
+
|
609
|
+
/****************************************************************************
|
610
|
+
*
|
611
|
+
* SegmentInfos
|
612
|
+
*
|
613
|
+
****************************************************************************/
|
614
|
+
|
615
|
+
typedef struct SegmentInfos {
|
616
|
+
Store *store;
|
617
|
+
SegmentInfo **segs;
|
618
|
+
int scnt;
|
619
|
+
int size;
|
620
|
+
int counter;
|
621
|
+
unsigned int version;
|
622
|
+
int format;
|
623
|
+
} SegmentInfos;
|
624
|
+
|
625
|
+
SegmentInfos *sis_create(void); /* takes no arguments; (void) makes this a real prototype so stray arguments are rejected */
|
626
|
+
void sis_destroy(void *p);
|
627
|
+
void sis_add_si(SegmentInfos *sis, SegmentInfo *si);
|
628
|
+
void sis_del_at(SegmentInfos *sis, int at);
|
629
|
+
void sis_del_from_to(SegmentInfos *sis, int from, int to);
|
630
|
+
void sis_clear(SegmentInfos *sis);
|
631
|
+
void sis_read(SegmentInfos *sis, Store *store);
|
632
|
+
void sis_write(SegmentInfos *sis, Store *store);
|
633
|
+
int sis_read_current_version(Store *store);
|
634
|
+
|
635
|
+
/****************************************************************************
|
636
|
+
*
|
637
|
+
* IndexReader
|
638
|
+
*
|
639
|
+
****************************************************************************/
|
640
|
+
|
641
|
+
enum FIELD_TYPE {
|
642
|
+
// all fields
|
643
|
+
IR_ALL,
|
644
|
+
// all indexed fields
|
645
|
+
IR_INDEXED,
|
646
|
+
// all fields which are not indexed
|
647
|
+
IR_UNINDEXED,
|
648
|
+
// all fields which are indexed with termvectors enabled
|
649
|
+
IR_INDEXED_WITH_TERM_VECTOR,
|
650
|
+
// all fields which are indexed but don't have termvectors enabled
|
651
|
+
IR_INDEXED_NO_TERM_VECTOR,
|
652
|
+
// all fields where termvectors are enabled. Please note that only standard
|
653
|
+
// termvector fields are returned
|
654
|
+
IR_TERM_VECTOR,
|
655
|
+
// all fields with termvectors with positions enabled
|
656
|
+
IR_TERM_VECTOR_WITH_POSITION,
|
657
|
+
// all fields where termvectors with offset position are set
|
658
|
+
IR_TERM_VECTOR_WITH_OFFSET,
|
659
|
+
// all fields where termvectors with offset and position values set
|
660
|
+
IR_TERM_VECTOR_WITH_POSITION_OFFSET
|
661
|
+
};
|
662
|
+
|
663
|
+
struct IndexReader {
|
664
|
+
mutex_t mutex;
|
665
|
+
HshTable *cache;
|
666
|
+
HshTable *sort_cache;
|
667
|
+
void *data;
|
668
|
+
Store *store;
|
669
|
+
Lock *write_lock;
|
670
|
+
SegmentInfos *sis;
|
671
|
+
bool has_changes : 1;
|
672
|
+
bool is_stale : 1;
|
673
|
+
bool is_owner : 1;
|
674
|
+
bool close_store : 1;
|
675
|
+
TermVector *(*get_term_vector)(IndexReader *ir, int doc_num, char *field);
|
676
|
+
Array *(*get_term_vectors)(IndexReader *ir, int doc_num);
|
677
|
+
int (*num_docs)(IndexReader *ir);
|
678
|
+
int (*max_doc)(IndexReader *ir);
|
679
|
+
Document *(*get_doc)(IndexReader *ir, int doc_num);
|
680
|
+
uchar *(*get_norms)(IndexReader *ir, char *field);
|
681
|
+
uchar *(*get_norms_always)(IndexReader *ir, char *field);
|
682
|
+
void (*do_set_norm)(IndexReader *ir, int doc_num, char *field, uchar val);
|
683
|
+
void (*get_norms_into)(IndexReader *ir, char *field, uchar *buf, int offset);
|
684
|
+
TermEnum *(*terms)(IndexReader *ir);
|
685
|
+
TermEnum *(*terms_from)(IndexReader *ir, Term *term);
|
686
|
+
int (*doc_freq)(IndexReader *ir, Term *t);
|
687
|
+
TermDocEnum *(*term_docs)(IndexReader *ir);
|
688
|
+
TermDocEnum *(*term_positions)(IndexReader *ir);
|
689
|
+
void (*do_delete_doc)(IndexReader *ir, int doc_num);
|
690
|
+
void (*do_undelete_all)(IndexReader *ir);
|
691
|
+
bool (*is_deleted)(IndexReader *ir, int doc_num);
|
692
|
+
bool (*has_deletions)(IndexReader *ir);
|
693
|
+
bool (*has_norms)(IndexReader *ir, char *field);
|
694
|
+
HashSet *(*get_field_names)(IndexReader *ir, int field_type);
|
695
|
+
void (*do_commit)(IndexReader *ir);
|
696
|
+
void (*do_close)(IndexReader *ir);
|
697
|
+
void (*acquire_write_lock)(IndexReader *ir);
|
698
|
+
};
|
699
|
+
|
700
|
+
IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner, int close_store);
|
701
|
+
IndexReader *ir_open(Store *store, int close_store);
|
702
|
+
bool ir_index_exists(Store *store);
|
703
|
+
void ir_close(IndexReader *ir);
|
704
|
+
void ir_commit(IndexReader *ir);
|
705
|
+
void ir_delete_doc(IndexReader *ir, int doc_num);
|
706
|
+
void ir_undelete_all(IndexReader *ir);
|
707
|
+
void ir_set_norm(IndexReader *ir, int doc_num, char *field, uchar val);
|
708
|
+
void ir_destroy(void *p);
|
709
|
+
Document *ir_get_doc_with_term(IndexReader *ir, Term *term);
|
710
|
+
TermDocEnum *ir_term_docs_for(IndexReader *ir, Term *term);
|
711
|
+
TermDocEnum *ir_term_positions_for(IndexReader *ir, Term *term);
|
712
|
+
void ir_add_cache(IndexReader *ir);
|
713
|
+
bool ir_is_latest(IndexReader *ir);
|
714
|
+
|
715
|
+
/****************************************************************************
|
716
|
+
*
|
717
|
+
* Norm
|
718
|
+
*
|
719
|
+
****************************************************************************/
|
720
|
+
|
721
|
+
/* Norm: one record pairing a field number with an input stream and a byte
 * buffer, plus a one-bit dirty flag. */
typedef struct Norm {
|
722
|
+
bool is_dirty : 1; /* presumably set when bytes has unsaved changes -- TODO confirm */
|
723
|
+
int field_num;
|
724
|
+
InStream *is; /* presumably the stream the norm bytes are read from -- TODO confirm */
|
725
|
+
uchar *bytes; /* length and ownership are not visible from this header */
|
726
|
+
} Norm;
|
727
|
+
|
728
|
+
/****************************************************************************
|
729
|
+
*
|
730
|
+
* SegmentReader
|
731
|
+
*
|
732
|
+
****************************************************************************/
|
733
|
+
|
734
|
+
/* SegmentReader: reader state for a single segment. The struct is declared
 * without a typedef here, so a typedef (if any) must come from elsewhere.
 * Field meanings below marked "presumably" are inferred from names only. */
struct SegmentReader {
|
735
|
+
FieldInfos *fis;
|
736
|
+
FieldsReader *fr;
|
737
|
+
char *segment; /* presumably the segment name -- TODO confirm */
|
738
|
+
BitVector *deleted_docs;
|
739
|
+
bool deleted_docs_dirty : 1; /* three adjacent one-bit flags follow */
|
740
|
+
bool undelete_all : 1;
|
741
|
+
bool norms_dirty : 1;
|
742
|
+
InStream *freq_in;
|
743
|
+
InStream *prox_in;
|
744
|
+
TermInfosReader *tir;
|
745
|
+
TermVectorsReader *orig_tvr;
|
746
|
+
thread_key_t thread_tvr; /* presumably a per-thread slot for a TermVectorsReader -- TODO confirm */
|
747
|
+
Array *tvr_bucket;
|
748
|
+
HshTable *norms; /* key/value types are not visible from this header */
|
749
|
+
Store *cfs_store; /* presumably the compound-file store, when one is in use -- TODO confirm */
|
750
|
+
uchar *fake_norms;
|
751
|
+
};
|
752
|
+
|
753
|
+
/* SegmentReader constructors. Both return an IndexReader *; sr_open selects
 * a segment by index (si_num) within sis, sr_open_si takes the SegmentInfo
 * directly. Ownership semantics of is_owner/close_store are not visible here. */
IndexReader *sr_open(SegmentInfos *sis, int si_num, int is_owner, int close_store);
|
754
|
+
IndexReader *sr_open_si(SegmentInfo *si);
|
755
|
+
//int sr_has_deletions(IndexReader *ir);
|
756
|
+
|
757
|
+
/****************************************************************************
|
758
|
+
*
|
759
|
+
* MultiReader
|
760
|
+
*
|
761
|
+
****************************************************************************/
|
762
|
+
|
763
|
+
/* MultiReader: state for a reader built over an array of sub-readers.
 * Field meanings marked "presumably" are inferred from names only. */
typedef struct MultiReader {
|
764
|
+
bool has_deletions : 1;
|
765
|
+
int max_doc;
|
766
|
+
int num_docs_cache;
|
767
|
+
int rcnt; /* presumably the number of entries in sub_readers (mr_open takes a matching rcnt) -- TODO confirm */
|
768
|
+
int *starts; /* presumably the first document number of each sub-reader -- TODO confirm */
|
769
|
+
IndexReader **sub_readers;
|
770
|
+
HshTable *norms_cache;
|
771
|
+
} MultiReader;
|
772
|
+
|
773
|
+
/* Opens a reader over `readers`, an array with rcnt entries (presumably --
 * the array length is not otherwise passed). Returns an IndexReader *. */
IndexReader *mr_open(Store *store, SegmentInfos *sis, IndexReader **readers,
|
774
|
+
int rcnt, int close_store);
|
775
|
+
|
776
|
+
/****************************************************************************
|
777
|
+
*
|
778
|
+
* SegmentMergeInfo
|
779
|
+
*
|
780
|
+
****************************************************************************/
|
781
|
+
|
782
|
+
/* SegmentMergeInfo: bundles one IndexReader with a term enumerator, a term
 * buffer, a postings enumerator and a document map, plus an integer base.
 * Field meanings marked "presumably" are inferred from names only. */
typedef struct SegmentMergeInfo {
|
783
|
+
int base; /* presumably the document-number offset of this segment in the merged output -- TODO confirm */
|
784
|
+
IndexReader *ir;
|
785
|
+
TermEnum *te;
|
786
|
+
TermBuffer *tb;
|
787
|
+
TermDocEnum *postings;
|
788
|
+
int *doc_map; /* presumably maps old document numbers to new ones -- TODO confirm */
|
789
|
+
} SegmentMergeInfo;
|
790
|
+
|
791
|
+
/* SegmentMergeInfo API. smi_destroy and smi_lt take void * arguments --
 * presumably so they can be passed as generic destructor / less-than
 * callbacks (e.g. to a PriorityQueue); TODO confirm against the callers. */
SegmentMergeInfo *smi_create(int base, TermEnum *te, IndexReader *ir);
|
792
|
+
void smi_destroy(void *p);
|
793
|
+
TermBuffer *smi_next(SegmentMergeInfo *smi);
|
794
|
+
bool smi_lt(void *p1, void *p2);
|
795
|
+
|
796
|
+
/****************************************************************************
|
797
|
+
*
|
798
|
+
* SegmentMerger
|
799
|
+
*
|
800
|
+
****************************************************************************/
|
801
|
+
|
802
|
+
/* SegmentMerger: working state for merging the readers collected in
 * `readers` into one named segment in `store`. Field meanings marked
 * "presumably" are inferred from names only. */
typedef struct SegmentMerger {
|
803
|
+
Store *store;
|
804
|
+
char *name; /* presumably the name of the segment being written -- TODO confirm */
|
805
|
+
Array *readers; /* presumably filled by sm_add() -- TODO confirm */
|
806
|
+
FieldInfos *fis;
|
807
|
+
OutStream *freq_out;
|
808
|
+
OutStream *prox_out;
|
809
|
+
TermInfosWriter *tiw;
|
810
|
+
Term *terms_buf;
|
811
|
+
int terms_buf_pointer;
|
812
|
+
int terms_buf_size;
|
813
|
+
PriorityQueue *queue;
|
814
|
+
TermInfo *ti;
|
815
|
+
int term_index_interval;
|
816
|
+
OutStream *skip_buffer;
|
817
|
+
int skip_interval;
|
818
|
+
int last_skip_doc;
|
819
|
+
int last_skip_freq_pointer; /* NOTE(review): stream positions held in int -- confirm the range is sufficient */
|
820
|
+
int last_skip_prox_pointer;
|
821
|
+
} SegmentMerger;
|
822
|
+
|
823
|
+
/* SegmentMerger API. Meaning of sm_merge's int result and ownership of the
 * Array returned by sm_create_compound_file are not visible in this header. */
SegmentMerger *sm_create(Store *store, char *name, int term_index_interval);
|
824
|
+
void sm_destroy(void *p); /* void * parameter -- presumably usable as a generic destructor callback; TODO confirm */
|
825
|
+
void sm_add(SegmentMerger *sm, IndexReader *ir);
|
826
|
+
int sm_merge(SegmentMerger *sm); /* presumably returns the merged document count -- TODO confirm */
|
827
|
+
Array *sm_create_compound_file(SegmentMerger *sm, char *fname);
|
828
|
+
|
829
|
+
|
830
|
+
/****************************************************************************
|
831
|
+
*
|
832
|
+
* IndexWriter
|
833
|
+
*
|
834
|
+
****************************************************************************/
|
835
|
+
|
836
|
+
/* String names for the two index locks. How they are turned into lock
 * objects or file names is not visible in this header. */
#define WRITE_LOCK_NAME "write"
|
837
|
+
#define COMMIT_LOCK_NAME "commit"
|
838
|
+
/* IndexWriter: writer state -- a mutex, integer tuning parameters, the
 * target store with its analyzer/similarity/segment infos, a second Store
 * named ram_store, the write lock, and three one-bit flags. Declared without
 * a typedef here, so a typedef (if any) must come from elsewhere. */
struct IndexWriter {
|
839
|
+
mutex_t mutex;
|
840
|
+
int merge_factor;
|
841
|
+
int min_merge_docs;
|
842
|
+
int max_merge_docs;
|
843
|
+
int max_field_length;
|
844
|
+
int term_index_interval;
|
845
|
+
Store *store;
|
846
|
+
Analyzer *analyzer;
|
847
|
+
Similarity *similarity;
|
848
|
+
SegmentInfos *sis;
|
849
|
+
Store *ram_store; /* presumably buffers new segments in memory until iw_flush_ram_segments() -- TODO confirm */
|
850
|
+
Lock *write_lock;
|
851
|
+
bool close_store : 1; /* presumably: close `store` when the writer is closed -- TODO confirm */
|
852
|
+
bool close_analyzer : 1; /* presumably: destroy `analyzer` when the writer is closed -- TODO confirm */
|
853
|
+
bool use_compound_file : 1;
|
854
|
+
};
|
855
|
+
|
856
|
+
/* IndexWriter API. Only prototypes are visible here; error behaviour and
 * locking are defined by the implementation. */
IndexWriter *iw_open(Store *store, Analyzer *analyzer,
|
857
|
+
bool create, bool close_store, bool close_analyzer);
|
858
|
+
void iw_flush_ram_segments(IndexWriter *iw);
|
859
|
+
void iw_close(IndexWriter *iw);
|
860
|
+
int iw_doc_count(IndexWriter *iw);
|
861
|
+
void iw_add_doc(IndexWriter *iw, Document *doc);
|
862
|
+
void iw_optimize(IndexWriter *iw);
|
863
|
+
void iw_add_indexes(IndexWriter *iw, Store **stores, int cnt); /* cnt is presumably the length of stores -- TODO confirm */
|
864
|
+
void iw_add_readers(IndexWriter *iw, IndexReader **stores, int cnt); /* NOTE(review): parameter is named `stores` but holds IndexReader pointers -- looks copied from iw_add_indexes; consider renaming to `readers` */
|
865
|
+
|
866
|
+
/****************************************************************************
|
867
|
+
*
|
868
|
+
* CompoundWriter
|
869
|
+
*
|
870
|
+
****************************************************************************/
|
871
|
+
|
872
|
+
/* CompoundWriter: state for writing one named compound file into `store`.
 * Field meanings marked "presumably" are inferred from names only. */
typedef struct CompoundWriter {
|
873
|
+
Store *store;
|
874
|
+
const char *name; /* NOTE(review): const here, but open_cw() takes a non-const char *name */
|
875
|
+
HashSet *ids; /* presumably the ids already added via cw_add_file(), to reject duplicates -- TODO confirm */
|
876
|
+
Array *file_entries;
|
877
|
+
bool merged; /* plain bool, unlike the one-bit flag fields used by the other structs in this header */
|
878
|
+
} CompoundWriter;
|
879
|
+
|
880
|
+
/* CompoundWriter API.
 * NOTE(review): open_cw does not follow the cw_ prefix used by cw_add_file
 * and cw_close; renaming would break callers, so it is only flagged here. */
CompoundWriter *open_cw(Store *store, char *name);
|
881
|
+
void cw_add_file(CompoundWriter *cw, char *id);
|
882
|
+
void cw_close(CompoundWriter *cw); /* whether this also frees cw is not visible from this header */
|
883
|
+
|
884
|
+
#endif
|