ferret 0.3.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +9 -0
- data/Rakefile +51 -25
- data/ext/analysis.c +553 -0
- data/ext/analysis.h +76 -0
- data/ext/array.c +83 -0
- data/ext/array.h +19 -0
- data/ext/bitvector.c +164 -0
- data/ext/bitvector.h +29 -0
- data/ext/compound_io.c +335 -0
- data/ext/document.c +336 -0
- data/ext/document.h +87 -0
- data/ext/ferret.c +88 -47
- data/ext/ferret.h +43 -109
- data/ext/field.c +395 -0
- data/ext/filter.c +103 -0
- data/ext/fs_store.c +352 -0
- data/ext/global.c +219 -0
- data/ext/global.h +73 -0
- data/ext/hash.c +446 -0
- data/ext/hash.h +80 -0
- data/ext/hashset.c +141 -0
- data/ext/hashset.h +37 -0
- data/ext/helper.c +11 -0
- data/ext/helper.h +5 -0
- data/ext/inc/lang.h +41 -0
- data/ext/ind.c +389 -0
- data/ext/index.h +884 -0
- data/ext/index_io.c +269 -415
- data/ext/index_rw.c +2543 -0
- data/ext/lang.c +31 -0
- data/ext/lang.h +41 -0
- data/ext/priorityqueue.c +228 -0
- data/ext/priorityqueue.h +44 -0
- data/ext/q_boolean.c +1331 -0
- data/ext/q_const_score.c +154 -0
- data/ext/q_fuzzy.c +287 -0
- data/ext/q_match_all.c +142 -0
- data/ext/q_multi_phrase.c +343 -0
- data/ext/q_parser.c +2180 -0
- data/ext/q_phrase.c +657 -0
- data/ext/q_prefix.c +75 -0
- data/ext/q_range.c +247 -0
- data/ext/q_span.c +1566 -0
- data/ext/q_term.c +308 -0
- data/ext/q_wildcard.c +146 -0
- data/ext/r_analysis.c +255 -0
- data/ext/r_doc.c +578 -0
- data/ext/r_index_io.c +996 -0
- data/ext/r_qparser.c +158 -0
- data/ext/r_search.c +2321 -0
- data/ext/r_store.c +263 -0
- data/ext/r_term.c +219 -0
- data/ext/ram_store.c +447 -0
- data/ext/search.c +524 -0
- data/ext/search.h +1065 -0
- data/ext/similarity.c +143 -39
- data/ext/sort.c +661 -0
- data/ext/store.c +35 -0
- data/ext/store.h +152 -0
- data/ext/term.c +704 -143
- data/ext/termdocs.c +599 -0
- data/ext/vector.c +594 -0
- data/lib/ferret.rb +9 -10
- data/lib/ferret/analysis/analyzers.rb +2 -2
- data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
- data/lib/ferret/analysis/token.rb +14 -14
- data/lib/ferret/analysis/token_filters.rb +3 -3
- data/lib/ferret/document/field.rb +16 -17
- data/lib/ferret/index/document_writer.rb +4 -4
- data/lib/ferret/index/index.rb +39 -23
- data/lib/ferret/index/index_writer.rb +2 -2
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
- data/lib/ferret/index/segment_term_vector.rb +4 -4
- data/lib/ferret/index/term.rb +5 -1
- data/lib/ferret/index/term_vector_offset_info.rb +6 -6
- data/lib/ferret/index/term_vectors_io.rb +5 -5
- data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
- data/lib/ferret/search.rb +1 -1
- data/lib/ferret/search/boolean_query.rb +2 -1
- data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
- data/lib/ferret/search/fuzzy_query.rb +2 -1
- data/lib/ferret/search/index_searcher.rb +3 -0
- data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
- data/lib/ferret/search/multi_phrase_query.rb +6 -5
- data/lib/ferret/search/phrase_query.rb +3 -6
- data/lib/ferret/search/prefix_query.rb +4 -4
- data/lib/ferret/search/sort.rb +3 -1
- data/lib/ferret/search/sort_field.rb +9 -9
- data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
- data/lib/ferret/search/spans/span_near_query.rb +1 -1
- data/lib/ferret/search/spans/span_weight.rb +1 -1
- data/lib/ferret/search/spans/spans_enum.rb +7 -7
- data/lib/ferret/store/fs_store.rb +10 -6
- data/lib/ferret/store/ram_store.rb +3 -3
- data/lib/rferret.rb +36 -0
- data/test/functional/thread_safety_index_test.rb +2 -2
- data/test/test_helper.rb +16 -2
- data/test/unit/analysis/c_token.rb +25 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
- data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
- data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
- data/test/unit/document/c_field.rb +98 -0
- data/test/unit/document/tc_field.rb +0 -66
- data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
- data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
- data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
- data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
- data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
- data/test/unit/index/tc_segment_term_vector.rb +2 -2
- data/test/unit/index/tc_term_vectors_io.rb +4 -4
- data/test/unit/query_parser/c_query_parser.rb +138 -0
- data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
- data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
- data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
- data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
- data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
- data/test/unit/search/c_sort_field.rb +27 -0
- data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +7 -20
- data/test/unit/store/c_fs_store.rb +76 -0
- data/test/unit/store/c_ram_store.rb +35 -0
- data/test/unit/store/m_store.rb +34 -0
- data/test/unit/store/m_store_lock.rb +68 -0
- data/test/unit/store/tc_fs_store.rb +0 -53
- data/test/unit/store/tc_ram_store.rb +0 -20
- data/test/unit/store/tm_store.rb +0 -30
- data/test/unit/store/tm_store_lock.rb +0 -66
- metadata +84 -31
- data/ext/Makefile +0 -140
- data/ext/ferret_ext.so +0 -0
- data/ext/priority_queue.c +0 -232
- data/ext/ram_directory.c +0 -321
- data/ext/segment_merge_queue.c +0 -37
- data/ext/segment_term_enum.c +0 -326
- data/ext/string_helper.c +0 -42
- data/ext/tags +0 -344
- data/ext/term_buffer.c +0 -230
- data/ext/term_infos_reader.c +0 -54
- data/ext/terminfo.c +0 -160
- data/ext/token.c +0 -93
- data/ext/util.c +0 -12
data/ext/index.h
ADDED
@@ -0,0 +1,884 @@
|
|
1
|
+
#ifndef FRT_INDEX_H
|
2
|
+
#define FRT_INDEX_H
|
3
|
+
|
4
|
+
#include <limits.h>
|
5
|
+
#include "global.h"
|
6
|
+
#include "array.h"
|
7
|
+
#include "bitvector.h"
|
8
|
+
#include "hashset.h"
|
9
|
+
#include "priorityqueue.h"
|
10
|
+
#include "hash.h"
|
11
|
+
#include "store.h"
|
12
|
+
#include "document.h"
|
13
|
+
#include "analysis.h"
|
14
|
+
|
15
|
+
#define SEGMENT_NAME_MAX_LENGTH 100
|
16
|
+
|
17
|
+
typedef struct Config {
|
18
|
+
int merge_factor;
|
19
|
+
int min_merge_docs;
|
20
|
+
int max_merge_docs;
|
21
|
+
int max_field_length;
|
22
|
+
int term_index_interval;
|
23
|
+
} FerretConfig;
|
24
|
+
|
25
|
+
extern FerretConfig config;
|
26
|
+
|
27
|
+
typedef struct IndexReader IndexReader;
|
28
|
+
typedef struct IndexWriter IndexWriter;
|
29
|
+
typedef struct SegmentReader SegmentReader;
|
30
|
+
|
31
|
+
/***************************************************************************
|
32
|
+
*
|
33
|
+
* CacheObject
|
34
|
+
*
|
35
|
+
***************************************************************************/
|
36
|
+
|
37
|
+
typedef struct CacheObject {
|
38
|
+
HshTable *ref_tab1;
|
39
|
+
HshTable *ref_tab2;
|
40
|
+
void *ref1;
|
41
|
+
void *ref2;
|
42
|
+
void *obj;
|
43
|
+
void (*destroy)(void *p);
|
44
|
+
} CacheObject;
|
45
|
+
|
46
|
+
void cache_destroy(CacheObject *co);
|
47
|
+
CacheObject *co_create(HshTable *ref_tab1, HshTable *ref_tab2,
|
48
|
+
void *ref1, void *ref2, void (*destroy)(void *p), void *obj);
|
49
|
+
unsigned int co_hash(const void *key);
|
50
|
+
int co_eq(const void *key1, const void *key2);
|
51
|
+
HshTable *co_hsh_create(void); /* takes no arguments and returns a HshTable pointer; presumably keyed with co_hash/co_eq -- confirm against the definition */
|
52
|
+
|
53
|
+
/****************************************************************************
|
54
|
+
*
|
55
|
+
* FieldInfo
|
56
|
+
*
|
57
|
+
****************************************************************************/
|
58
|
+
|
59
|
+
typedef struct FieldInfo {
|
60
|
+
char *name;
|
61
|
+
int number;
|
62
|
+
bool is_indexed : 1;
|
63
|
+
bool store_tv : 1;
|
64
|
+
bool store_offset : 1;
|
65
|
+
bool store_pos : 1;
|
66
|
+
bool omit_norms : 1;
|
67
|
+
} FieldInfo;
|
68
|
+
|
69
|
+
FieldInfo *fi_create(char *name,
|
70
|
+
int number,
|
71
|
+
bool is_indexed,
|
72
|
+
bool store_tv,
|
73
|
+
bool store_pos, /* NOTE(review): store_pos precedes store_offset here, but fis_add() and fis_add_fields() list store_offset first -- confirm callers pass these two flags in the intended order */
|
74
|
+
bool store_offset,
|
75
|
+
bool omit_norms);
|
76
|
+
void fi_destroy(void *p);
|
77
|
+
|
78
|
+
/****************************************************************************
|
79
|
+
*
|
80
|
+
* FieldInfos
|
81
|
+
*
|
82
|
+
****************************************************************************/
|
83
|
+
|
84
|
+
typedef struct FieldInfos {
|
85
|
+
HashEntry **by_name;
|
86
|
+
FieldInfo **by_number;
|
87
|
+
int fcnt;
|
88
|
+
} FieldInfos;
|
89
|
+
|
90
|
+
FieldInfos *fis_create(void); /* takes no arguments and returns a FieldInfos pointer; (void) makes this a checked prototype */
|
91
|
+
FieldInfos *fis_open(Store *store, char *filename);
|
92
|
+
void fis_destroy(void *p);
|
93
|
+
FieldInfo *fis_add(FieldInfos *fis,
|
94
|
+
char *name,
|
95
|
+
bool is_indexed,
|
96
|
+
bool store_tv,
|
97
|
+
bool store_offset,
|
98
|
+
bool store_pos,
|
99
|
+
bool omit_norms);
|
100
|
+
|
101
|
+
void fis_add_fields(FieldInfos *fis,
|
102
|
+
HashSet *field_names,
|
103
|
+
bool is_indexed,
|
104
|
+
bool store_tv,
|
105
|
+
bool store_offset,
|
106
|
+
bool store_pos,
|
107
|
+
bool omit_norms);
|
108
|
+
bool fis_has_vectors(FieldInfos *fis);
|
109
|
+
void fis_write(FieldInfos *fis, Store *store, char *segment, char *ext);
|
110
|
+
FieldInfos *fis_read(FieldInfos *fis, InStream *is);
|
111
|
+
FieldInfos *fis_add_doc(FieldInfos *fis, Document *doc);
|
112
|
+
unsigned long long fis_get_number(FieldInfos *fis, char *name); /* NOTE(review): FieldInfo.number is declared as int; confirm this wider unsigned return type is intended and how an unknown field name is reported */
|
113
|
+
FieldInfo *fis_get_fi(FieldInfos *fis, char *name);
|
114
|
+
|
115
|
+
|
116
|
+
/****************************************************************************
|
117
|
+
*
|
118
|
+
* Term
|
119
|
+
*
|
120
|
+
****************************************************************************/
|
121
|
+
|
122
|
+
typedef struct Term {
|
123
|
+
char *field;
|
124
|
+
char *text;
|
125
|
+
} Term;
|
126
|
+
|
127
|
+
Term *term_clone(Term *term);
|
128
|
+
Term *term_create(const char *field, char *text);
|
129
|
+
void term_destroy(void *p);
|
130
|
+
int term_cmp(void *t1, void *t2);
|
131
|
+
int term_eq(const void *t1, const void *t2);
|
132
|
+
unsigned int term_hash(const void *t);
|
133
|
+
char *term_to_s(Term *term);
|
134
|
+
|
135
|
+
/****************************************************************************
|
136
|
+
*
|
137
|
+
* TermBuffer
|
138
|
+
*
|
139
|
+
****************************************************************************/
|
140
|
+
|
141
|
+
typedef struct TermBuffer {
|
142
|
+
char *field;
|
143
|
+
char text[MAX_WORD_SIZE];
|
144
|
+
} TermBuffer;
|
145
|
+
|
146
|
+
TermBuffer *tb_create(void); /* takes no arguments and returns a TermBuffer pointer; (void) makes this a checked prototype */
|
147
|
+
void tb_destroy(void *p);
|
148
|
+
TermBuffer *tb_set_term(TermBuffer *tb, Term *t);
|
149
|
+
Term *tb_get_term(TermBuffer *tb);
|
150
|
+
int tb_cmp(TermBuffer *tb1, TermBuffer *tb2);
|
151
|
+
int tb_term_cmp(TermBuffer *tb, Term *t);
|
152
|
+
TermBuffer *tb_cpy(TermBuffer *tb1, TermBuffer *tb2);
|
153
|
+
TermBuffer *tb_read(TermBuffer *tb, InStream *is, FieldInfos *fis);
|
154
|
+
|
155
|
+
/****************************************************************************
|
156
|
+
*
|
157
|
+
* TermInfo
|
158
|
+
*
|
159
|
+
****************************************************************************/
|
160
|
+
|
161
|
+
typedef struct TermInfo {
|
162
|
+
int doc_freq;
|
163
|
+
int freq_pointer;
|
164
|
+
int prox_pointer;
|
165
|
+
int skip_offset;
|
166
|
+
} TermInfo;
|
167
|
+
|
168
|
+
TermInfo *ti_create(int doc_freq, int freq_pointer, int prox_pointer, int skip_offset);
|
169
|
+
TermInfo *ti_set(TermInfo *ti, int df, int fp, int pp, int so);
|
170
|
+
void ti_destroy(void *p);
|
171
|
+
TermInfo *ti_cpy(TermInfo *ti1, TermInfo *ti2);
|
172
|
+
TermInfo *ti_clone(TermInfo *other);
|
173
|
+
int ti_eq(TermInfo *ti1, TermInfo *ti2);
|
174
|
+
|
175
|
+
/****************************************************************************
|
176
|
+
*
|
177
|
+
* TermEnum
|
178
|
+
*
|
179
|
+
****************************************************************************/
|
180
|
+
|
181
|
+
typedef struct TermEnumFilter TermEnumFilter;
|
182
|
+
typedef struct TermEnum TermEnum;
|
183
|
+
struct TermEnum {
|
184
|
+
void *data;
|
185
|
+
TermBuffer *(*next)(TermEnum *te);
|
186
|
+
void (*close)(TermEnum *te);
|
187
|
+
TermEnum *(*clone)(TermEnum *te);
|
188
|
+
TermBuffer *tb_curr;
|
189
|
+
TermBuffer *tb_prev;
|
190
|
+
TermInfo *ti_curr;
|
191
|
+
};
|
192
|
+
|
193
|
+
TermBuffer *te_skip_to(struct TermEnum *te, Term *t);
|
194
|
+
|
195
|
+
Term *te_get_term(struct TermEnum *te);
|
196
|
+
TermInfo *te_get_ti(struct TermEnum *te);
|
197
|
+
|
198
|
+
/* * SegmentTermEnum * */
|
199
|
+
|
200
|
+
typedef struct SegmentTermEnum {
|
201
|
+
FieldInfos *fis;
|
202
|
+
int is_index;
|
203
|
+
InStream *is;
|
204
|
+
int size;
|
205
|
+
int pos;
|
206
|
+
int index_pointer;
|
207
|
+
int index_interval;
|
208
|
+
int skip_interval;
|
209
|
+
int format_m1skip_interval;
|
210
|
+
int format;
|
211
|
+
} SegmentTermEnum;
|
212
|
+
|
213
|
+
|
214
|
+
TermEnum *ste_create(InStream *is, FieldInfos *fis, int is_index);
|
215
|
+
TermBuffer *ste_next(struct TermEnum *te);
|
216
|
+
void ste_close(struct TermEnum *te);
|
217
|
+
|
218
|
+
/* * MultiTermEnum * */
|
219
|
+
|
220
|
+
typedef struct MultiTermEnum {
|
221
|
+
int doc_freq;
|
222
|
+
PriorityQueue *smi_queue;
|
223
|
+
} MultiTermEnum;
|
224
|
+
|
225
|
+
TermEnum *mte_create(IndexReader **readers, int *starts, int rcnt, Term *term);
|
226
|
+
|
227
|
+
/****************************************************************************
|
228
|
+
*
|
229
|
+
* TermInfosWriter
|
230
|
+
*
|
231
|
+
****************************************************************************/
|
232
|
+
|
233
|
+
#define TERM_INFO_FORMAT (-2) /* parenthesized so the negative literal expands safely inside any surrounding expression */
|
234
|
+
|
235
|
+
typedef struct TermInfosWriter {
|
236
|
+
int index_interval;
|
237
|
+
int skip_interval;
|
238
|
+
int size;
|
239
|
+
int last_index_pointer;
|
240
|
+
bool is_index;
|
241
|
+
OutStream *os;
|
242
|
+
struct TermInfosWriter *other;
|
243
|
+
Term *last_term;
|
244
|
+
TermInfo *last_term_info;
|
245
|
+
FieldInfos *fis;
|
246
|
+
char *curr_field;
|
247
|
+
int curr_field_num;
|
248
|
+
} TermInfosWriter;
|
249
|
+
|
250
|
+
TermInfosWriter *tiw_open(Store *store,
|
251
|
+
char *segment,
|
252
|
+
FieldInfos *fis,
|
253
|
+
int interval);
|
254
|
+
void tiw_add(TermInfosWriter *tiw, Term *t, TermInfo *ti);
|
255
|
+
void tiw_close(TermInfosWriter *tiw);
|
256
|
+
|
257
|
+
/****************************************************************************
|
258
|
+
*
|
259
|
+
* TermInfosReader
|
260
|
+
*
|
261
|
+
****************************************************************************/
|
262
|
+
|
263
|
+
typedef struct TermInfosReader {
|
264
|
+
mutex_t mutex;
|
265
|
+
TermEnum *orig_te;
|
266
|
+
thread_key_t thread_te;
|
267
|
+
Array *te_bucket;
|
268
|
+
TermEnum *index_te;
|
269
|
+
int size;
|
270
|
+
int skip_interval;
|
271
|
+
int index_size;
|
272
|
+
Term **index_terms;
|
273
|
+
TermInfo **index_term_infos;
|
274
|
+
int *index_pointers;
|
275
|
+
} TermInfosReader;
|
276
|
+
|
277
|
+
TermInfosReader *tir_open(Store *store, char *segment, FieldInfos *fis);
|
278
|
+
void tir_close(TermInfosReader *tir);
|
279
|
+
Term *tir_get_term(TermInfosReader *tir, int position);
|
280
|
+
int tir_get_term_pos(TermInfosReader *tir, Term *t);
|
281
|
+
TermInfo *tir_get_ti(TermInfosReader *tir, Term *t);
|
282
|
+
|
283
|
+
/****************************************************************************
|
284
|
+
*
|
285
|
+
* TVOffsetInfo
|
286
|
+
*
|
287
|
+
****************************************************************************/
|
288
|
+
|
289
|
+
typedef struct TVOffsetInfo {
|
290
|
+
int start;
|
291
|
+
int end;
|
292
|
+
} TVOffsetInfo;
|
293
|
+
|
294
|
+
TVOffsetInfo *tvoi_create(int start, int end);
|
295
|
+
void tvoi_destroy(void *p);
|
296
|
+
|
297
|
+
/****************************************************************************
|
298
|
+
*
|
299
|
+
* TVField
|
300
|
+
*
|
301
|
+
****************************************************************************/
|
302
|
+
|
303
|
+
typedef struct TVField {
|
304
|
+
int tvf_pointer;
|
305
|
+
int number;
|
306
|
+
unsigned int store_positions : 1;
|
307
|
+
unsigned int store_offsets : 1;
|
308
|
+
} TVField;
|
309
|
+
|
310
|
+
TVField *tvf_create(int number, int store_positions, int store_offsets);
|
311
|
+
void tvf_destroy(void *p);
|
312
|
+
|
313
|
+
/****************************************************************************
|
314
|
+
*
|
315
|
+
* TVTerm
|
316
|
+
*
|
317
|
+
****************************************************************************/
|
318
|
+
|
319
|
+
typedef struct TVTerm {
|
320
|
+
char *text;
|
321
|
+
int freq;
|
322
|
+
int *positions;
|
323
|
+
TVOffsetInfo **offsets;
|
324
|
+
} TVTerm;
|
325
|
+
|
326
|
+
TVTerm *tvt_create(char *text,
|
327
|
+
int freq,
|
328
|
+
int *positions,
|
329
|
+
TVOffsetInfo **offsets);
|
330
|
+
void tvt_destroy(void *p);
|
331
|
+
|
332
|
+
/****************************************************************************
|
333
|
+
*
|
334
|
+
* TermVector
|
335
|
+
*
|
336
|
+
****************************************************************************/
|
337
|
+
|
338
|
+
typedef struct TermVector {
|
339
|
+
char *field;
|
340
|
+
char **terms;
|
341
|
+
int tcnt;
|
342
|
+
int *freqs;
|
343
|
+
int **positions;
|
344
|
+
TVOffsetInfo ***offsets;
|
345
|
+
} TermVector;
|
346
|
+
|
347
|
+
TermVector *tv_create(const char *field,
|
348
|
+
char **terms,
|
349
|
+
int tcnt,
|
350
|
+
int *freqs,
|
351
|
+
int **positions,
|
352
|
+
TVOffsetInfo ***offsets);
|
353
|
+
void tv_destroy(void *p);
|
354
|
+
|
355
|
+
/****************************************************************************
|
356
|
+
*
|
357
|
+
* TermVectorsWriter
|
358
|
+
*
|
359
|
+
****************************************************************************/
|
360
|
+
|
361
|
+
#define STORE_POSITIONS_WITH_TERMVECTOR 0x1
|
362
|
+
#define STORE_OFFSET_WITH_TERMVECTOR 0x2
|
363
|
+
|
364
|
+
#define FORMAT_VERSION 2
|
365
|
+
#define FORMAT_SIZE 4
|
366
|
+
|
367
|
+
#define TVX_EXTENSION ".tvx"
|
368
|
+
#define TVD_EXTENSION ".tvd"
|
369
|
+
#define TVF_EXTENSION ".tvf"
|
370
|
+
|
371
|
+
typedef struct TermVectorsWriter {
|
372
|
+
TVField *curr_field;
|
373
|
+
int curr_doc_pointer;
|
374
|
+
OutStream *tvx;
|
375
|
+
OutStream *tvd;
|
376
|
+
OutStream *tvf;
|
377
|
+
FieldInfos *fis;
|
378
|
+
TVField **fields;
|
379
|
+
int fcnt;
|
380
|
+
int fsize;
|
381
|
+
TVTerm **terms;
|
382
|
+
int tcnt;
|
383
|
+
int tsize;
|
384
|
+
} TermVectorsWriter;
|
385
|
+
|
386
|
+
TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis);
|
387
|
+
void tvw_close(TermVectorsWriter *tvw);
|
388
|
+
void tvw_open_doc(TermVectorsWriter *tvw);
|
389
|
+
void tvw_close_doc(TermVectorsWriter *tvw);
|
390
|
+
void tvw_open_field(TermVectorsWriter *tvw, char *field);
|
391
|
+
void tvw_close_field(TermVectorsWriter *tvw);
|
392
|
+
void tvw_add_term(TermVectorsWriter *tvw, char *text, int freq, int *positions, TVOffsetInfo **offsets);
|
393
|
+
void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors);
|
394
|
+
|
395
|
+
|
396
|
+
/****************************************************************************
|
397
|
+
*
|
398
|
+
* TermVectorsReader
|
399
|
+
*
|
400
|
+
****************************************************************************/
|
401
|
+
|
402
|
+
typedef struct TermVectorsReader {
|
403
|
+
int size;
|
404
|
+
InStream *tvx;
|
405
|
+
InStream *tvd;
|
406
|
+
InStream *tvf;
|
407
|
+
FieldInfos *fis;
|
408
|
+
int tvd_format;
|
409
|
+
int tvf_format;
|
410
|
+
} TermVectorsReader;
|
411
|
+
|
412
|
+
TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis);
|
413
|
+
TermVectorsReader *tvr_clone(TermVectorsReader *orig);
|
414
|
+
void tvr_close(TermVectorsReader *tvr);
|
415
|
+
TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
|
416
|
+
char *field, int tvf_pointer);
|
417
|
+
Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num);
|
418
|
+
TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field);
|
419
|
+
|
420
|
+
/****************************************************************************
|
421
|
+
*
|
422
|
+
* FieldsWriter
|
423
|
+
*
|
424
|
+
****************************************************************************/
|
425
|
+
|
426
|
+
#define FIELD_IS_TOKENIZED 0X1
|
427
|
+
#define FIELD_IS_BINARY 0X2
|
428
|
+
#define FIELD_IS_COMPRESSED 0X4
|
429
|
+
|
430
|
+
typedef struct FieldsWriter {
|
431
|
+
FieldInfos *fis;
|
432
|
+
OutStream *fields_out;
|
433
|
+
OutStream *index_out;
|
434
|
+
} FieldsWriter;
|
435
|
+
|
436
|
+
FieldsWriter *fw_open(Store *store, char *segment, FieldInfos *fis);
|
437
|
+
void fw_close(FieldsWriter *fw);
|
438
|
+
void fw_add_doc(FieldsWriter *fw, Document *doc);
|
439
|
+
|
440
|
+
/****************************************************************************
|
441
|
+
*
|
442
|
+
* TermDocEnum
|
443
|
+
*
|
444
|
+
****************************************************************************/
|
445
|
+
|
446
|
+
typedef struct TermDocEnum TermDocEnum;
|
447
|
+
struct TermDocEnum {
|
448
|
+
void *data;
|
449
|
+
void (*seek)(TermDocEnum *tde, Term *term);
|
450
|
+
int (*doc_num)(TermDocEnum *tde);
|
451
|
+
int (*freq)(TermDocEnum *tde);
|
452
|
+
bool (*next)(TermDocEnum *tde);
|
453
|
+
int (*read)(TermDocEnum *tde, int *docs, int *freqs, int req_num);
|
454
|
+
bool (*skip_to)(TermDocEnum *tde, int target);
|
455
|
+
int (*next_position)(TermDocEnum *tde);
|
456
|
+
void (*close)(TermDocEnum *tde);
|
457
|
+
};
|
458
|
+
|
459
|
+
/* * SegmentTermDocEnum * */
|
460
|
+
|
461
|
+
typedef struct SegmentTermDocEnum SegmentTermDocEnum;
|
462
|
+
struct SegmentTermDocEnum {
|
463
|
+
SegmentReader *parent;
|
464
|
+
InStream *freq_in;
|
465
|
+
int count; // the number of docs for this term that we have skipped
|
466
|
+
int doc_freq; // the number of docs this term appears in
|
467
|
+
BitVector *deleted_docs;
|
468
|
+
int doc_num;
|
469
|
+
int freq;
|
470
|
+
int skip_interval;
|
471
|
+
int num_skips;
|
472
|
+
int skip_count;
|
473
|
+
InStream *skip_in;
|
474
|
+
int skip_doc;
|
475
|
+
int freq_pointer;
|
476
|
+
int prox_pointer;
|
477
|
+
int skip_pointer;
|
478
|
+
unsigned int have_skipped : 1;
|
479
|
+
void (*skip_prox)(SegmentTermDocEnum *stde);
|
480
|
+
InStream *prox_in;
|
481
|
+
int prox_cnt;
|
482
|
+
int position;
|
483
|
+
void (*seek_prox)(SegmentTermDocEnum *stde, int prox_pointer);
|
484
|
+
};
|
485
|
+
|
486
|
+
TermDocEnum *stde_create(IndexReader *ir);
|
487
|
+
void stde_seek_ti(TermDocEnum *tde, TermInfo *ti);
|
488
|
+
|
489
|
+
/* * SegmentTermPosEnum * */
|
490
|
+
TermDocEnum *stpe_create(IndexReader *ir);
|
491
|
+
|
492
|
+
/* * MultiTermDocEnum * */
|
493
|
+
typedef struct MultiTermDocEnum MultiTermDocEnum;
|
494
|
+
struct MultiTermDocEnum {
|
495
|
+
IndexReader **irs;
|
496
|
+
int *starts;
|
497
|
+
int ir_cnt;
|
498
|
+
Term *term;
|
499
|
+
int base;
|
500
|
+
int pointer;
|
501
|
+
TermDocEnum **irs_tde;
|
502
|
+
TermDocEnum *curr_tde;
|
503
|
+
TermDocEnum *(*term_docs_from_reader)(IndexReader *ir);
|
504
|
+
};
|
505
|
+
|
506
|
+
TermDocEnum *mtde_create(IndexReader **readers, int *starts, int ir_cnt);
|
507
|
+
|
508
|
+
/* * MultiTermPosEnum * */
|
509
|
+
TermDocEnum *mtpe_create(IndexReader **readers, int *starts, int ir_cnt);
|
510
|
+
|
511
|
+
/****************************************************************************
|
512
|
+
* MultipleTermDocPosEnum
|
513
|
+
****************************************************************************/
|
514
|
+
|
515
|
+
#define MTDPE_POS_QUEUE_INIT_CAPA 8
|
516
|
+
typedef struct {
|
517
|
+
int doc_num;
|
518
|
+
int freq;
|
519
|
+
PriorityQueue *pq;
|
520
|
+
int *pos_queue;
|
521
|
+
int pos_queue_index;
|
522
|
+
int pos_queue_capa;
|
523
|
+
} MultipleTermDocPosEnum;
|
524
|
+
|
525
|
+
TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt);
|
526
|
+
|
527
|
+
/****************************************************************************
|
528
|
+
*
|
529
|
+
* FieldsReader
|
530
|
+
*
|
531
|
+
****************************************************************************/
|
532
|
+
|
533
|
+
typedef struct FieldsReader {
|
534
|
+
int len;
|
535
|
+
FieldInfos *fis;
|
536
|
+
InStream *fields_in;
|
537
|
+
InStream *index_in;
|
538
|
+
} FieldsReader;
|
539
|
+
|
540
|
+
FieldsReader *fr_open(Store *store, char *segment, FieldInfos *fis);
|
541
|
+
void fr_close(FieldsReader *fr);
|
542
|
+
Document *fr_get_doc(FieldsReader *fr, int doc_num);
|
543
|
+
|
544
|
+
/****************************************************************************
|
545
|
+
*
|
546
|
+
* Posting
|
547
|
+
*
|
548
|
+
****************************************************************************/
|
549
|
+
|
550
|
+
typedef struct Posting {
|
551
|
+
Term *term;
|
552
|
+
int freq;
|
553
|
+
int size;
|
554
|
+
int *positions;
|
555
|
+
TVOffsetInfo **offsets;
|
556
|
+
} Posting;
|
557
|
+
|
558
|
+
Posting *p_create(Term *term, int position, TVOffsetInfo *offset);
|
559
|
+
void p_destroy(void *p);
|
560
|
+
void p_add_occurance(Posting *p, int position, TVOffsetInfo *offset);
|
561
|
+
|
562
|
+
|
563
|
+
/****************************************************************************
|
564
|
+
*
|
565
|
+
* DocumentWriter
|
566
|
+
*
|
567
|
+
****************************************************************************/
|
568
|
+
|
569
|
+
#include "search.h"
|
570
|
+
|
571
|
+
typedef struct DocumentWriter {
|
572
|
+
Store *store;
|
573
|
+
Analyzer *analyzer;
|
574
|
+
Similarity *similarity;
|
575
|
+
HshTable *postingtable;
|
576
|
+
int pcnt;
|
577
|
+
FieldInfos *fis;
|
578
|
+
float *field_boosts;
|
579
|
+
int *field_lengths;
|
580
|
+
int *field_positions;
|
581
|
+
int *field_offsets;
|
582
|
+
int max_field_length;
|
583
|
+
int term_index_interval;
|
584
|
+
} DocumentWriter;
|
585
|
+
|
586
|
+
DocumentWriter *dw_open(Store *store, Analyzer *analyzer,
|
587
|
+
Similarity *similarity, int max_field_length, int term_index_interval);
|
588
|
+
void dw_close(DocumentWriter *dw);
|
589
|
+
void dw_add_doc(DocumentWriter *dw, char *segment, Document *doc);
|
590
|
+
|
591
|
+
/****************************************************************************
|
592
|
+
*
|
593
|
+
* SegmentInfo
|
594
|
+
*
|
595
|
+
****************************************************************************/
|
596
|
+
|
597
|
+
typedef struct SegmentInfo {
|
598
|
+
char *name;
|
599
|
+
int doc_cnt;
|
600
|
+
Store *store;
|
601
|
+
} SegmentInfo;
|
602
|
+
|
603
|
+
SegmentInfo *si_create(char *name, int doc_cnt, Store *store);
|
604
|
+
void si_destroy(void *p);
|
605
|
+
bool si_has_deletions(SegmentInfo *si);
|
606
|
+
bool si_uses_compound_file(SegmentInfo *si);
|
607
|
+
bool si_has_separate_norms(SegmentInfo *si);
|
608
|
+
|
609
|
+
/****************************************************************************
|
610
|
+
*
|
611
|
+
* SegmentInfos
|
612
|
+
*
|
613
|
+
****************************************************************************/
|
614
|
+
|
615
|
+
typedef struct SegmentInfos {
|
616
|
+
Store *store;
|
617
|
+
SegmentInfo **segs;
|
618
|
+
int scnt;
|
619
|
+
int size;
|
620
|
+
int counter;
|
621
|
+
unsigned int version;
|
622
|
+
int format;
|
623
|
+
} SegmentInfos;
|
624
|
+
|
625
|
+
SegmentInfos *sis_create(void); /* takes no arguments and returns a SegmentInfos pointer; (void) makes this a checked prototype */
|
626
|
+
void sis_destroy(void *p);
|
627
|
+
void sis_add_si(SegmentInfos *sis, SegmentInfo *si);
|
628
|
+
void sis_del_at(SegmentInfos *sis, int at);
|
629
|
+
void sis_del_from_to(SegmentInfos *sis, int from, int to);
|
630
|
+
void sis_clear(SegmentInfos *sis);
|
631
|
+
void sis_read(SegmentInfos *sis, Store *store);
|
632
|
+
void sis_write(SegmentInfos *sis, Store *store);
|
633
|
+
int sis_read_current_version(Store *store);
|
634
|
+
|
635
|
+
/****************************************************************************
|
636
|
+
*
|
637
|
+
* IndexReader
|
638
|
+
*
|
639
|
+
****************************************************************************/
|
640
|
+
|
641
|
+
enum FIELD_TYPE {
|
642
|
+
// all fields
|
643
|
+
IR_ALL,
|
644
|
+
// all indexed fields
|
645
|
+
IR_INDEXED,
|
646
|
+
// all fields which are not indexed
|
647
|
+
IR_UNINDEXED,
|
648
|
+
// all fields which are indexed with termvectors enabled
|
649
|
+
IR_INDEXED_WITH_TERM_VECTOR,
|
650
|
+
// all fields which are indexed but don't have termvectors enabled
|
651
|
+
IR_INDEXED_NO_TERM_VECTOR,
|
652
|
+
// all fields where termvectors are enabled. Please note that only standard
|
653
|
+
// termvector fields are returned
|
654
|
+
IR_TERM_VECTOR,
|
655
|
+
// all fields with termvectors with positions enabled
|
656
|
+
IR_TERM_VECTOR_WITH_POSITION,
|
657
|
+
// all fields where termvectors with offset position are set
|
658
|
+
IR_TERM_VECTOR_WITH_OFFSET,
|
659
|
+
// all fields where termvectors with offset and position values set
|
660
|
+
IR_TERM_VECTOR_WITH_POSITION_OFFSET
|
661
|
+
};
|
662
|
+
|
663
|
+
struct IndexReader {
|
664
|
+
mutex_t mutex;
|
665
|
+
HshTable *cache;
|
666
|
+
HshTable *sort_cache;
|
667
|
+
void *data;
|
668
|
+
Store *store;
|
669
|
+
Lock *write_lock;
|
670
|
+
SegmentInfos *sis;
|
671
|
+
bool has_changes : 1;
|
672
|
+
bool is_stale : 1;
|
673
|
+
bool is_owner : 1;
|
674
|
+
bool close_store : 1;
|
675
|
+
TermVector *(*get_term_vector)(IndexReader *ir, int doc_num, char *field);
|
676
|
+
Array *(*get_term_vectors)(IndexReader *ir, int doc_num);
|
677
|
+
int (*num_docs)(IndexReader *ir);
|
678
|
+
int (*max_doc)(IndexReader *ir);
|
679
|
+
Document *(*get_doc)(IndexReader *ir, int doc_num);
|
680
|
+
uchar *(*get_norms)(IndexReader *ir, char *field);
|
681
|
+
uchar *(*get_norms_always)(IndexReader *ir, char *field);
|
682
|
+
void (*do_set_norm)(IndexReader *ir, int doc_num, char *field, uchar val);
|
683
|
+
void (*get_norms_into)(IndexReader *ir, char *field, uchar *buf, int offset);
|
684
|
+
TermEnum *(*terms)(IndexReader *ir);
|
685
|
+
TermEnum *(*terms_from)(IndexReader *ir, Term *term);
|
686
|
+
int (*doc_freq)(IndexReader *ir, Term *t);
|
687
|
+
TermDocEnum *(*term_docs)(IndexReader *ir);
|
688
|
+
TermDocEnum *(*term_positions)(IndexReader *ir);
|
689
|
+
void (*do_delete_doc)(IndexReader *ir, int doc_num);
|
690
|
+
void (*do_undelete_all)(IndexReader *ir);
|
691
|
+
bool (*is_deleted)(IndexReader *ir, int doc_num);
|
692
|
+
bool (*has_deletions)(IndexReader *ir);
|
693
|
+
bool (*has_norms)(IndexReader *ir, char *field);
|
694
|
+
HashSet *(*get_field_names)(IndexReader *ir, int field_type);
|
695
|
+
void (*do_commit)(IndexReader *ir);
|
696
|
+
void (*do_close)(IndexReader *ir);
|
697
|
+
void (*acquire_write_lock)(IndexReader *ir);
|
698
|
+
};
|
699
|
+
|
700
|
+
IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner, int close_store);
|
701
|
+
IndexReader *ir_open(Store *store, int close_store);
|
702
|
+
bool ir_index_exists(Store *store);
|
703
|
+
void ir_close(IndexReader *ir);
|
704
|
+
void ir_commit(IndexReader *ir);
|
705
|
+
void ir_delete_doc(IndexReader *ir, int doc_num);
|
706
|
+
void ir_undelete_all(IndexReader *ir);
|
707
|
+
void ir_set_norm(IndexReader *ir, int doc_num, char *field, uchar val);
|
708
|
+
void ir_destroy(void *p);
|
709
|
+
Document *ir_get_doc_with_term(IndexReader *ir, Term *term);
|
710
|
+
TermDocEnum *ir_term_docs_for(IndexReader *ir, Term *term);
|
711
|
+
TermDocEnum *ir_term_positions_for(IndexReader *ir, Term *term);
|
712
|
+
void ir_add_cache(IndexReader *ir);
|
713
|
+
bool ir_is_latest(IndexReader *ir);
|
714
|
+
|
715
|
+
/****************************************************************************
|
716
|
+
*
|
717
|
+
* Norm
|
718
|
+
*
|
719
|
+
****************************************************************************/
|
720
|
+
|
721
|
+
/* Norm groups a field number, an input stream, a byte buffer and a one-bit
 * dirty flag.
 * NOTE(review): presumably the norm bytes of a single field, with `is` as
 * the stream they are read from -- confirm against the implementation. */
typedef struct Norm {
|
722
|
+
bool is_dirty : 1; /* one-bit flag; presumably "bytes changed since last write" -- confirm */
|
723
|
+
int field_num;
|
724
|
+
InStream *is;
|
725
|
+
uchar *bytes;
|
726
|
+
} Norm;
|
727
|
+
|
728
|
+
/****************************************************************************
|
729
|
+
*
|
730
|
+
* SegmentReader
|
731
|
+
*
|
732
|
+
****************************************************************************/
|
733
|
+
|
734
|
+
/* SegmentReader: reader state declared with a bare struct tag (no typedef
 * here, so the typedef is expected to exist elsewhere).
 * NOTE(review): presumably the per-segment data behind the IndexReader
 * returned by sr_open()/sr_open_si(); how it is attached to the IndexReader
 * is not visible in this header -- confirm. */
struct SegmentReader {
|
735
|
+
FieldInfos *fis;
|
736
|
+
FieldsReader *fr;
|
737
|
+
char *segment; /* presumably the segment name -- confirm */
|
738
|
+
BitVector *deleted_docs;
|
739
|
+
bool deleted_docs_dirty : 1; /* three adjacent one-bit flags */
|
740
|
+
bool undelete_all : 1;
|
741
|
+
bool norms_dirty : 1;
|
742
|
+
InStream *freq_in;
|
743
|
+
InStream *prox_in;
|
744
|
+
TermInfosReader *tir;
|
745
|
+
TermVectorsReader *orig_tvr;
|
746
|
+
thread_key_t thread_tvr; /* thread key; presumably selects a per-thread TermVectorsReader -- confirm */
|
747
|
+
Array *tvr_bucket;
|
748
|
+
HshTable *norms; /* presumably maps a field to its Norm -- confirm key/value types */
|
749
|
+
Store *cfs_store;
|
750
|
+
uchar *fake_norms;
|
751
|
+
};
|
752
|
+
|
753
|
+
/* SegmentReader constructors. Both return the generic IndexReader handle
 * rather than a SegmentReader pointer.
 * NOTE(review): si_num looks like an index into sis -- confirm. */
IndexReader *sr_open(SegmentInfos *sis, int si_num, int is_owner, int close_store);
|
754
|
+
IndexReader *sr_open_si(SegmentInfo *si);
|
755
|
+
//int sr_has_deletions(IndexReader *ir);
|
756
|
+
|
757
|
+
/****************************************************************************
|
758
|
+
*
|
759
|
+
* MultiReader
|
760
|
+
*
|
761
|
+
****************************************************************************/
|
762
|
+
|
763
|
+
/* MultiReader holds an array of IndexReader pointers (sub_readers) together
 * with a reader count, an int array and cached counters.
 * NOTE(review): presumably rcnt is the length of sub_readers and starts[]
 * holds each sub-reader's document-number offset -- confirm. */
typedef struct MultiReader {
|
764
|
+
bool has_deletions : 1; /* one-bit flag */
|
765
|
+
int max_doc;
|
766
|
+
int num_docs_cache;
|
767
|
+
int rcnt;
|
768
|
+
int *starts;
|
769
|
+
IndexReader **sub_readers;
|
770
|
+
HshTable *norms_cache;
|
771
|
+
} MultiReader;
|
772
|
+
|
773
|
+
/* MultiReader constructor; returns the generic IndexReader handle.
 * NOTE(review): rcnt is presumably the number of entries in readers --
 * confirm, and confirm who owns the readers array afterwards. */
IndexReader *mr_open(Store *store, SegmentInfos *sis, IndexReader **readers,
|
774
|
+
int rcnt, int close_store);
|
775
|
+
|
776
|
+
/****************************************************************************
|
777
|
+
*
|
778
|
+
* SegmentMergeInfo
|
779
|
+
*
|
780
|
+
****************************************************************************/
|
781
|
+
|
782
|
+
/* SegmentMergeInfo ties one IndexReader to a TermEnum, a TermBuffer, a
 * TermDocEnum and an int array.
 * NOTE(review): presumably one of these exists per segment being merged,
 * with base as that segment's document-number offset -- confirm. */
typedef struct SegmentMergeInfo {
|
783
|
+
int base;
|
784
|
+
IndexReader *ir;
|
785
|
+
TermEnum *te;
|
786
|
+
TermBuffer *tb;
|
787
|
+
TermDocEnum *postings;
|
788
|
+
int *doc_map; /* presumably old-to-new document number mapping -- confirm */
|
789
|
+
} SegmentMergeInfo;
|
790
|
+
|
791
|
+
/* SegmentMergeInfo API. smi_destroy and smi_lt take void pointers.
 * NOTE(review): presumably so they can be passed as the destructor and
 * less-than callbacks of a generic container such as PriorityQueue --
 * confirm. */
SegmentMergeInfo *smi_create(int base, TermEnum *te, IndexReader *ir);
|
792
|
+
void smi_destroy(void *p);
|
793
|
+
TermBuffer *smi_next(SegmentMergeInfo *smi);
|
794
|
+
bool smi_lt(void *p1, void *p2);
|
795
|
+
|
796
|
+
/****************************************************************************
|
797
|
+
*
|
798
|
+
* SegmentMerger
|
799
|
+
*
|
800
|
+
****************************************************************************/
|
801
|
+
|
802
|
+
/* SegmentMerger: working state for the sm_* functions. It holds a Store, a
 * name, an Array of readers, two output streams, a TermInfosWriter, a term
 * buffer with its position and size, a PriorityQueue, and skip bookkeeping.
 * NOTE(review): the field roles noted below are inferred from names only --
 * confirm against the implementation. */
typedef struct SegmentMerger {
|
803
|
+
Store *store;
|
804
|
+
char *name; /* presumably the name of the segment being written -- confirm */
|
805
|
+
Array *readers; /* presumably the IndexReaders registered via sm_add() -- confirm */
|
806
|
+
FieldInfos *fis;
|
807
|
+
OutStream *freq_out;
|
808
|
+
OutStream *prox_out;
|
809
|
+
TermInfosWriter *tiw;
|
810
|
+
Term *terms_buf;
|
811
|
+
int terms_buf_pointer;
|
812
|
+
int terms_buf_size;
|
813
|
+
PriorityQueue *queue;
|
814
|
+
TermInfo *ti;
|
815
|
+
int term_index_interval;
|
816
|
+
OutStream *skip_buffer;
|
817
|
+
int skip_interval;
|
818
|
+
int last_skip_doc;
|
819
|
+
int last_skip_freq_pointer;
|
820
|
+
int last_skip_prox_pointer;
|
821
|
+
} SegmentMerger;
|
822
|
+
|
823
|
+
/* SegmentMerger API.
 * NOTE(review): the meaning of sm_merge()'s int result and the contents of
 * the Array returned by sm_create_compound_file() are not visible in this
 * header -- confirm before relying on them. */
SegmentMerger *sm_create(Store *store, char *name, int term_index_interval);
|
824
|
+
void sm_destroy(void *p);
|
825
|
+
void sm_add(SegmentMerger *sm, IndexReader *ir);
|
826
|
+
int sm_merge(SegmentMerger *sm);
|
827
|
+
Array *sm_create_compound_file(SegmentMerger *sm, char *fname);
|
828
|
+
|
829
|
+
|
830
|
+
/****************************************************************************
|
831
|
+
*
|
832
|
+
* IndexWriter
|
833
|
+
*
|
834
|
+
****************************************************************************/
|
835
|
+
|
836
|
+
#define WRITE_LOCK_NAME "write" /* string name for the write lock; presumably passed to the Store when opening it -- confirm */
|
837
|
+
#define COMMIT_LOCK_NAME "commit" /* string name for the commit lock */
|
838
|
+
/* IndexWriter: writer state declared with a bare struct tag (no typedef
 * here, so the typedef is expected to exist elsewhere). It carries a mutex,
 * integer tuning settings, the target Store, an Analyzer, a Similarity, the
 * SegmentInfos, a second Store (ram_store), a write Lock and three one-bit
 * flags.
 * NOTE(review): the exact meaning of each tuning setting is not visible in
 * this header -- confirm in the implementation. */
struct IndexWriter {
|
839
|
+
mutex_t mutex;
|
840
|
+
int merge_factor;
|
841
|
+
int min_merge_docs;
|
842
|
+
int max_merge_docs;
|
843
|
+
int max_field_length;
|
844
|
+
int term_index_interval;
|
845
|
+
Store *store;
|
846
|
+
Analyzer *analyzer;
|
847
|
+
Similarity *similarity;
|
848
|
+
SegmentInfos *sis;
|
849
|
+
Store *ram_store; /* presumably an in-memory staging store, cf. iw_flush_ram_segments() -- confirm */
|
850
|
+
Lock *write_lock;
|
851
|
+
bool close_store : 1; /* mirrors the close_store argument of iw_open(); presumably "close store on iw_close" -- confirm */
|
852
|
+
bool close_analyzer : 1; /* mirrors the close_analyzer argument of iw_open() */
|
853
|
+
bool use_compound_file : 1;
|
854
|
+
};
|
855
|
+
|
856
|
+
/* IndexWriter API. Only the prototypes are declared in this header; the
 * definitions live in another translation unit.
 * NOTE(review): create presumably selects "start a new index" versus "open
 * an existing one" -- confirm. close_store / close_analyzer have matching
 * one-bit flags in struct IndexWriter. */
IndexWriter *iw_open(Store *store, Analyzer *analyzer,
|
857
|
+
bool create, bool close_store, bool close_analyzer);
|
858
|
+
void iw_flush_ram_segments(IndexWriter *iw);
|
859
|
+
void iw_close(IndexWriter *iw);
|
860
|
+
int iw_doc_count(IndexWriter *iw);
|
861
|
+
void iw_add_doc(IndexWriter *iw, Document *doc);
|
862
|
+
void iw_optimize(IndexWriter *iw);
|
863
|
+
void iw_add_indexes(IndexWriter *iw, Store **stores, int cnt); /* cnt: presumably the number of entries in stores -- confirm */
|
864
|
+
void iw_add_readers(IndexWriter *iw, IndexReader **readers, int cnt); /* parameter is an array of IndexReader pointers, so it is named readers, not stores */
|
865
|
+
|
866
|
+
/****************************************************************************
|
867
|
+
*
|
868
|
+
* CompoundWriter
|
869
|
+
*
|
870
|
+
****************************************************************************/
|
871
|
+
|
872
|
+
/* CompoundWriter holds a Store, a const name string, a HashSet of ids, an
 * Array of file entries and a merged flag.
 * NOTE(review): presumably ids are the file names queued by cw_add_file()
 * and merged records whether the compound file was already written --
 * confirm. */
typedef struct CompoundWriter {
|
873
|
+
Store *store;
|
874
|
+
const char *name;
|
875
|
+
HashSet *ids;
|
876
|
+
Array *file_entries;
|
877
|
+
bool merged; /* full-width bool, unlike the one-bit flags used by the other structs in this header */
|
878
|
+
} CompoundWriter;
|
879
|
+
|
880
|
+
/* CompoundWriter API. Note the constructor is named open_cw, not cw_open,
 * unlike the cw_ prefix used by the other two functions.
 * NOTE(review): cw_close() presumably writes the compound file and releases
 * the writer -- confirm. */
CompoundWriter *open_cw(Store *store, char *name);
|
881
|
+
void cw_add_file(CompoundWriter *cw, char *id);
|
882
|
+
void cw_close(CompoundWriter *cw);
|
883
|
+
|
884
|
+
#endif
|