ferret 0.3.2 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (141) hide show
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/index.h ADDED
@@ -0,0 +1,884 @@
1
+ #ifndef FRT_INDEX_H
2
+ #define FRT_INDEX_H
3
+
4
+ #include <limits.h>
5
+ #include "global.h"
6
+ #include "array.h"
7
+ #include "bitvector.h"
8
+ #include "hashset.h"
9
+ #include "priorityqueue.h"
10
+ #include "hash.h"
11
+ #include "store.h"
12
+ #include "document.h"
13
+ #include "analysis.h"
14
+
15
+ #define SEGMENT_NAME_MAX_LENGTH 100
16
+
17
+ typedef struct Config { /* index tuning parameters; the struct tag is Config but the typedef name is FerretConfig */
18
+ int merge_factor; /* the same five member names also appear as int members of struct IndexWriter */
19
+ int min_merge_docs;
20
+ int max_merge_docs;
21
+ int max_field_length;
22
+ int term_index_interval;
23
+ } FerretConfig;
24
+
25
+ extern FerretConfig config;
26
+
27
+ typedef struct IndexReader IndexReader;
28
+ typedef struct IndexWriter IndexWriter;
29
+ typedef struct SegmentReader SegmentReader;
30
+
31
+ /***************************************************************************
32
+ *
33
+ * CacheObject
34
+ *
35
+ ***************************************************************************/
36
+
37
+ typedef struct CacheObject {
38
+ HshTable *ref_tab1;
39
+ HshTable *ref_tab2;
40
+ void *ref1;
41
+ void *ref2;
42
+ void *obj;
43
+ void (*destroy)(void *p);
44
+ } CacheObject;
45
+
46
+ void cache_destroy(CacheObject *co);
47
+ CacheObject *co_create(HshTable *ref_tab1, HshTable *ref_tab2,
48
+ void *ref1, void *ref2, void (*destroy)(void *p), void *obj);
49
+ unsigned int co_hash(const void *key);
50
+ int co_eq(const void *key1, const void *key2);
51
+ HshTable *co_hsh_create(void); /* creates a HshTable; (void) makes this a true prototype so a call with stray arguments is rejected at compile time */
52
+
53
+ /****************************************************************************
54
+ *
55
+ * FieldInfo
56
+ *
57
+ ****************************************************************************/
58
+
59
+ typedef struct FieldInfo { /* one field's record: a name, a number and five single-bit flags */
60
+ char *name;
61
+ int number;
62
+ bool is_indexed : 1; /* each flag below is a 1-bit bool bit-field */
63
+ bool store_tv : 1;
64
+ bool store_offset : 1; /* declared before store_pos; fis_add() and fis_add_fields() use this same order */
65
+ bool store_pos : 1;
66
+ bool omit_norms : 1;
67
+ } FieldInfo;
68
+
69
+ FieldInfo *fi_create(char *name, /* returns a FieldInfo built from a name, a number and the five flags */
70
+ int number,
71
+ bool is_indexed,
72
+ bool store_tv,
73
+ bool store_pos, /* NOTE(review): store_pos comes before store_offset here, whereas fis_add(), fis_add_fields() and struct FieldInfo list store_offset first; all are bool, so a swapped call compiles silently -- confirm against the definition */
74
+ bool store_offset,
75
+ bool omit_norms);
76
+ void fi_destroy(void *p);
77
+
78
+ /****************************************************************************
79
+ *
80
+ * FieldInfos
81
+ *
82
+ ****************************************************************************/
83
+
84
+ typedef struct FieldInfos {
85
+ HashEntry **by_name;
86
+ FieldInfo **by_number;
87
+ int fcnt;
88
+ } FieldInfos;
89
+
90
+ FieldInfos *fis_create(void); /* creates a FieldInfos; (void) makes this a true prototype so a call with stray arguments is rejected at compile time */
91
+ FieldInfos *fis_open(Store *store, char *filename);
92
+ void fis_destroy(void *p);
93
+ FieldInfo *fis_add(FieldInfos *fis,
94
+ char *name,
95
+ bool is_indexed,
96
+ bool store_tv,
97
+ bool store_offset,
98
+ bool store_pos,
99
+ bool omit_norms);
100
+
101
+ void fis_add_fields(FieldInfos *fis,
102
+ HashSet *field_names,
103
+ bool is_indexed,
104
+ bool store_tv,
105
+ bool store_offset,
106
+ bool store_pos,
107
+ bool omit_norms);
108
+ bool fis_has_vectors(FieldInfos *fis);
109
+ void fis_write(FieldInfos *fis, Store *store, char *segment, char *ext);
110
+ FieldInfos *fis_read(FieldInfos *fis, InStream *is);
111
+ FieldInfos *fis_add_doc(FieldInfos *fis, Document *doc);
112
+ unsigned long long fis_get_number(FieldInfos *fis, char *name);
113
+ FieldInfo *fis_get_fi(FieldInfos *fis, char *name);
114
+
115
+
116
+ /****************************************************************************
117
+ *
118
+ * Term
119
+ *
120
+ ****************************************************************************/
121
+
122
+ typedef struct Term {
123
+ char *field;
124
+ char *text;
125
+ } Term;
126
+
127
+ Term *term_clone(Term *term);
128
+ Term *term_create(const char *field, char *text);
129
+ void term_destroy(void *p);
130
+ int term_cmp(void *t1, void *t2);
131
+ int term_eq(const void *t1, const void *t2);
132
+ unsigned int term_hash(const void *t);
133
+ char *term_to_s(Term *term);
134
+
135
+ /****************************************************************************
136
+ *
137
+ * TermBuffer
138
+ *
139
+ ****************************************************************************/
140
+
141
+ typedef struct TermBuffer {
142
+ char *field;
143
+ char text[MAX_WORD_SIZE];
144
+ } TermBuffer;
145
+
146
+ TermBuffer *tb_create(void); /* creates a TermBuffer; (void) makes this a true prototype so a call with stray arguments is rejected at compile time */
147
+ void tb_destroy(void *p);
148
+ TermBuffer *tb_set_term(TermBuffer *tb, Term *t);
149
+ Term *tb_get_term(TermBuffer *tb);
150
+ int tb_cmp(TermBuffer *tb1, TermBuffer *tb2);
151
+ int tb_term_cmp(TermBuffer *tb, Term *t);
152
+ TermBuffer *tb_cpy(TermBuffer *tb1, TermBuffer *tb2);
153
+ TermBuffer *tb_read(TermBuffer *tb, InStream *is, FieldInfos *fis);
154
+
155
+ /****************************************************************************
156
+ *
157
+ * TermInfo
158
+ *
159
+ ****************************************************************************/
160
+
161
+ typedef struct TermInfo {
162
+ int doc_freq;
163
+ int freq_pointer;
164
+ int prox_pointer;
165
+ int skip_offset;
166
+ } TermInfo;
167
+
168
+ TermInfo *ti_create(int doc_freq, int freq_pointer, int prox_pointer, int skip_offset);
169
+ TermInfo *ti_set(TermInfo *ti, int df, int fp, int pp, int so);
170
+ void ti_destroy(void *p);
171
+ TermInfo *ti_cpy(TermInfo *ti1, TermInfo *ti2);
172
+ TermInfo *ti_clone(TermInfo *other);
173
+ int ti_eq(TermInfo *ti1, TermInfo *ti2);
174
+
175
+ /****************************************************************************
176
+ *
177
+ * TermEnum
178
+ *
179
+ ****************************************************************************/
180
+
181
+ typedef struct TermEnumFilter TermEnumFilter;
182
+ typedef struct TermEnum TermEnum;
183
+ struct TermEnum {
184
+ void *data;
185
+ TermBuffer *(*next)(TermEnum *te);
186
+ void (*close)(TermEnum *te);
187
+ TermEnum *(*clone)(TermEnum *te);
188
+ TermBuffer *tb_curr;
189
+ TermBuffer *tb_prev;
190
+ TermInfo *ti_curr;
191
+ };
192
+
193
+ TermBuffer *te_skip_to(struct TermEnum *te, Term *t);
194
+
195
+ Term *te_get_term(struct TermEnum *te);
196
+ TermInfo *te_get_ti(struct TermEnum *te);
197
+
198
+ /* * SegmentTermEnum * */
199
+
200
+ typedef struct SegmentTermEnum {
201
+ FieldInfos *fis;
202
+ int is_index;
203
+ InStream *is;
204
+ int size;
205
+ int pos;
206
+ int index_pointer;
207
+ int index_interval;
208
+ int skip_interval;
209
+ int format_m1skip_interval;
210
+ int format;
211
+ } SegmentTermEnum;
212
+
213
+
214
+ TermEnum *ste_create(InStream *is, FieldInfos *fis, int is_index);
215
+ TermBuffer *ste_next(struct TermEnum *te);
216
+ void ste_close(struct TermEnum *te);
217
+
218
+ /* * MultiTermEnum * */
219
+
220
+ typedef struct MultiTermEnum {
221
+ int doc_freq;
222
+ PriorityQueue *smi_queue;
223
+ } MultiTermEnum;
224
+
225
+ TermEnum *mte_create(IndexReader **readers, int *starts, int rcnt, Term *term);
226
+
227
+ /****************************************************************************
228
+ *
229
+ * TermInfosWriter
230
+ *
231
+ ****************************************************************************/
232
+
233
+ #define TERM_INFO_FORMAT -2
234
+
235
+ typedef struct TermInfosWriter {
236
+ int index_interval;
237
+ int skip_interval;
238
+ int size;
239
+ int last_index_pointer;
240
+ bool is_index;
241
+ OutStream *os;
242
+ struct TermInfosWriter *other;
243
+ Term *last_term;
244
+ TermInfo *last_term_info;
245
+ FieldInfos *fis;
246
+ char *curr_field;
247
+ int curr_field_num;
248
+ } TermInfosWriter;
249
+
250
+ TermInfosWriter *tiw_open(Store *store,
251
+ char *segment,
252
+ FieldInfos *fis,
253
+ int interval);
254
+ void tiw_add(TermInfosWriter *tiw, Term *t, TermInfo *ti);
255
+ void tiw_close(TermInfosWriter *tiw);
256
+
257
+ /****************************************************************************
258
+ *
259
+ * TermInfosReader
260
+ *
261
+ ****************************************************************************/
262
+
263
+ typedef struct TermInfosReader {
264
+ mutex_t mutex;
265
+ TermEnum *orig_te;
266
+ thread_key_t thread_te;
267
+ Array *te_bucket;
268
+ TermEnum *index_te;
269
+ int size;
270
+ int skip_interval;
271
+ int index_size;
272
+ Term **index_terms;
273
+ TermInfo **index_term_infos;
274
+ int *index_pointers;
275
+ } TermInfosReader;
276
+
277
+ TermInfosReader *tir_open(Store *store, char *segment, FieldInfos *fis);
278
+ void tir_close(TermInfosReader *tir);
279
+ Term *tir_get_term(TermInfosReader *tir, int position);
280
+ int tir_get_term_pos(TermInfosReader *tir, Term *t);
281
+ TermInfo *tir_get_ti(TermInfosReader *tir, Term *t);
282
+
283
+ /****************************************************************************
284
+ *
285
+ * TVOffsetInfo
286
+ *
287
+ ****************************************************************************/
288
+
289
+ typedef struct TVOffsetInfo {
290
+ int start;
291
+ int end;
292
+ } TVOffsetInfo;
293
+
294
+ TVOffsetInfo *tvoi_create(int start, int end);
295
+ void tvoi_destroy(void *p);
296
+
297
+ /****************************************************************************
298
+ *
299
+ * TVField
300
+ *
301
+ ****************************************************************************/
302
+
303
+ typedef struct TVField {
304
+ int tvf_pointer;
305
+ int number;
306
+ unsigned int store_positions : 1;
307
+ unsigned int store_offsets : 1;
308
+ } TVField;
309
+
310
+ TVField *tvf_create(int number, int store_positions, int store_offsets);
311
+ void tvf_destroy(void *p);
312
+
313
+ /****************************************************************************
314
+ *
315
+ * TVTerm
316
+ *
317
+ ****************************************************************************/
318
+
319
+ typedef struct TVTerm {
320
+ char *text;
321
+ int freq;
322
+ int *positions;
323
+ TVOffsetInfo **offsets;
324
+ } TVTerm;
325
+
326
+ TVTerm *tvt_create(char *text,
327
+ int freq,
328
+ int *positions,
329
+ TVOffsetInfo **offsets);
330
+ void tvt_destroy(void *p);
331
+
332
+ /****************************************************************************
333
+ *
334
+ * TermVector
335
+ *
336
+ ****************************************************************************/
337
+
338
+ typedef struct TermVector {
339
+ char *field;
340
+ char **terms;
341
+ int tcnt;
342
+ int *freqs;
343
+ int **positions;
344
+ TVOffsetInfo ***offsets;
345
+ } TermVector;
346
+
347
+ TermVector *tv_create(const char *field,
348
+ char **terms,
349
+ int tcnt,
350
+ int *freqs,
351
+ int **positions,
352
+ TVOffsetInfo ***offsets);
353
+ void tv_destroy(void *p);
354
+
355
+ /****************************************************************************
356
+ *
357
+ * TermVectorsWriter
358
+ *
359
+ ****************************************************************************/
360
+
361
+ #define STORE_POSITIONS_WITH_TERMVECTOR 0x1
362
+ #define STORE_OFFSET_WITH_TERMVECTOR 0x2
363
+
364
+ #define FORMAT_VERSION 2
365
+ #define FORMAT_SIZE 4
366
+
367
+ #define TVX_EXTENSION ".tvx"
368
+ #define TVD_EXTENSION ".tvd"
369
+ #define TVF_EXTENSION ".tvf"
370
+
371
+ typedef struct TermVectorsWriter {
372
+ TVField *curr_field;
373
+ int curr_doc_pointer;
374
+ OutStream *tvx;
375
+ OutStream *tvd;
376
+ OutStream *tvf;
377
+ FieldInfos *fis;
378
+ TVField **fields;
379
+ int fcnt;
380
+ int fsize;
381
+ TVTerm **terms;
382
+ int tcnt;
383
+ int tsize;
384
+ } TermVectorsWriter;
385
+
386
+ TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis);
387
+ void tvw_close(TermVectorsWriter *tvw);
388
+ void tvw_open_doc(TermVectorsWriter *tvw);
389
+ void tvw_close_doc(TermVectorsWriter *tvw);
390
+ void tvw_open_field(TermVectorsWriter *tvw, char *field);
391
+ void tvw_close_field(TermVectorsWriter *tvw);
392
+ void tvw_add_term(TermVectorsWriter *tvw, char *text, int freq, int *positions, TVOffsetInfo **offsets);
393
+ void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors);
394
+
395
+
396
+ /****************************************************************************
397
+ *
398
+ * TermVectorsReader
399
+ *
400
+ ****************************************************************************/
401
+
402
+ typedef struct TermVectorsReader {
403
+ int size;
404
+ InStream *tvx;
405
+ InStream *tvd;
406
+ InStream *tvf;
407
+ FieldInfos *fis;
408
+ int tvd_format;
409
+ int tvf_format;
410
+ } TermVectorsReader;
411
+
412
+ TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis);
413
+ TermVectorsReader *tvr_clone(TermVectorsReader *orig);
414
+ void tvr_close(TermVectorsReader *tvr);
415
+ TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
416
+ char *field, int tvf_pointer);
417
+ Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num);
418
+ TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field);
419
+
420
+ /****************************************************************************
421
+ *
422
+ * FieldsWriter
423
+ *
424
+ ****************************************************************************/
425
+
426
+ #define FIELD_IS_TOKENIZED 0X1
427
+ #define FIELD_IS_BINARY 0X2
428
+ #define FIELD_IS_COMPRESSED 0X4
429
+
430
+ typedef struct FieldsWriter {
431
+ FieldInfos *fis;
432
+ OutStream *fields_out;
433
+ OutStream *index_out;
434
+ } FieldsWriter;
435
+
436
+ FieldsWriter *fw_open(Store *store, char *segment, FieldInfos *fis);
437
+ void fw_close(FieldsWriter *fw);
438
+ void fw_add_doc(FieldsWriter *fw, Document *doc);
439
+
440
+ /****************************************************************************
441
+ *
442
+ * TermDocEnum
443
+ *
444
+ ****************************************************************************/
445
+
446
+ typedef struct TermDocEnum TermDocEnum;
447
+ struct TermDocEnum {
448
+ void *data;
449
+ void (*seek)(TermDocEnum *tde, Term *term);
450
+ int (*doc_num)(TermDocEnum *tde);
451
+ int (*freq)(TermDocEnum *tde);
452
+ bool (*next)(TermDocEnum *tde);
453
+ int (*read)(TermDocEnum *tde, int *docs, int *freqs, int req_num);
454
+ bool (*skip_to)(TermDocEnum *tde, int target);
455
+ int (*next_position)(TermDocEnum *tde);
456
+ void (*close)(TermDocEnum *tde);
457
+ };
458
+
459
+ /* * SegmentTermDocEnum * */
460
+
461
+ typedef struct SegmentTermDocEnum SegmentTermDocEnum;
462
+ struct SegmentTermDocEnum { /* state for enumerating one term's documents within a single segment */
463
+ SegmentReader *parent;
464
+ InStream *freq_in;
465
+ int count; // the number of docs for this term that we have skipped
466
+ int doc_freq; // the number of docs this term appears in
467
+ BitVector *deleted_docs;
468
+ int doc_num;
469
+ int freq;
470
+ int skip_interval;
471
+ int num_skips;
472
+ int skip_count;
473
+ InStream *skip_in;
474
+ int skip_doc;
475
+ int freq_pointer;
476
+ int prox_pointer;
477
+ int skip_pointer;
478
+ unsigned int have_skipped : 1; // single-bit flag
479
+ void (*skip_prox)(SegmentTermDocEnum *stde); // function pointer; receives this enum
480
+ InStream *prox_in;
481
+ int prox_cnt;
482
+ int position;
483
+ void (*seek_prox)(SegmentTermDocEnum *stde, int prox_pointer); // function pointer; receives this enum and a prox pointer
484
+ };
485
+
486
+ TermDocEnum *stde_create(IndexReader *ir);
487
+ void stde_seek_ti(TermDocEnum *tde, TermInfo *ti);
488
+
489
+ /* * SegmentTermPosEnum * */
490
+ TermDocEnum *stpe_create(IndexReader *ir);
491
+
492
+ /* * MultiTermDocEnum * */
493
+ typedef struct MultiTermDocEnum MultiTermDocEnum;
494
+ struct MultiTermDocEnum {
495
+ IndexReader **irs;
496
+ int *starts;
497
+ int ir_cnt;
498
+ Term *term;
499
+ int base;
500
+ int pointer;
501
+ TermDocEnum **irs_tde;
502
+ TermDocEnum *curr_tde;
503
+ TermDocEnum *(*term_docs_from_reader)(IndexReader *ir);
504
+ };
505
+
506
+ TermDocEnum *mtde_create(IndexReader **readers, int *starts, int ir_cnt);
507
+
508
+ /* * MultiTermPosEnum * */
509
+ TermDocEnum *mtpe_create(IndexReader **readers, int *starts, int ir_cnt);
510
+
511
+ /****************************************************************************
512
+ * MultipleTermDocPosEnum
513
+ ****************************************************************************/
514
+
515
+ #define MTDPE_POS_QUEUE_INIT_CAPA 8
516
+ typedef struct {
517
+ int doc_num;
518
+ int freq;
519
+ PriorityQueue *pq;
520
+ int *pos_queue;
521
+ int pos_queue_index;
522
+ int pos_queue_capa;
523
+ } MultipleTermDocPosEnum;
524
+
525
+ TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt);
526
+
527
+ /****************************************************************************
528
+ *
529
+ * FieldsReader
530
+ *
531
+ ****************************************************************************/
532
+
533
+ typedef struct FieldsReader {
534
+ int len;
535
+ FieldInfos *fis;
536
+ InStream *fields_in;
537
+ InStream *index_in;
538
+ } FieldsReader;
539
+
540
+ FieldsReader *fr_open(Store *store, char *segment, FieldInfos *fis);
541
+ void fr_close(FieldsReader *fr);
542
+ Document *fr_get_doc(FieldsReader *fr, int doc_num);
543
+
544
+ /****************************************************************************
545
+ *
546
+ * Posting
547
+ *
548
+ ****************************************************************************/
549
+
550
+ typedef struct Posting {
551
+ Term *term;
552
+ int freq;
553
+ int size;
554
+ int *positions;
555
+ TVOffsetInfo **offsets;
556
+ } Posting;
557
+
558
+ Posting *p_create(Term *term, int position, TVOffsetInfo *offset);
559
+ void p_destroy(void *p);
560
+ void p_add_occurance(Posting *p, int position, TVOffsetInfo *offset);
561
+
562
+
563
+ /****************************************************************************
564
+ *
565
+ * DocumentWriter
566
+ *
567
+ ****************************************************************************/
568
+
569
+ #include "search.h"
570
+
571
+ typedef struct DocumentWriter {
572
+ Store *store;
573
+ Analyzer *analyzer;
574
+ Similarity *similarity;
575
+ HshTable *postingtable;
576
+ int pcnt;
577
+ FieldInfos *fis;
578
+ float *field_boosts;
579
+ int *field_lengths;
580
+ int *field_positions;
581
+ int *field_offsets;
582
+ int max_field_length;
583
+ int term_index_interval;
584
+ } DocumentWriter;
585
+
586
+ DocumentWriter *dw_open(Store *store, Analyzer *analyzer,
587
+ Similarity *similarity, int max_field_length, int term_index_interval);
588
+ void dw_close(DocumentWriter *dw);
589
+ void dw_add_doc(DocumentWriter *dw, char *segment, Document *doc);
590
+
591
+ /****************************************************************************
592
+ *
593
+ * SegmentInfo
594
+ *
595
+ ****************************************************************************/
596
+
597
+ typedef struct SegmentInfo {
598
+ char *name;
599
+ int doc_cnt;
600
+ Store *store;
601
+ } SegmentInfo;
602
+
603
+ SegmentInfo *si_create(char *name, int doc_cnt, Store *store);
604
+ void si_destroy(void *p);
605
+ bool si_has_deletions(SegmentInfo *si);
606
+ bool si_uses_compound_file(SegmentInfo *si);
607
+ bool si_has_separate_norms(SegmentInfo *si);
608
+
609
+ /****************************************************************************
610
+ *
611
+ * SegmentInfos
612
+ *
613
+ ****************************************************************************/
614
+
615
+ typedef struct SegmentInfos {
616
+ Store *store;
617
+ SegmentInfo **segs;
618
+ int scnt;
619
+ int size;
620
+ int counter;
621
+ unsigned int version;
622
+ int format;
623
+ } SegmentInfos;
624
+
625
+ SegmentInfos *sis_create(void); /* creates a SegmentInfos; (void) makes this a true prototype so a call with stray arguments is rejected at compile time */
626
+ void sis_destroy(void *p);
627
+ void sis_add_si(SegmentInfos *sis, SegmentInfo *si);
628
+ void sis_del_at(SegmentInfos *sis, int at);
629
+ void sis_del_from_to(SegmentInfos *sis, int from, int to);
630
+ void sis_clear(SegmentInfos *sis);
631
+ void sis_read(SegmentInfos *sis, Store *store);
632
+ void sis_write(SegmentInfos *sis, Store *store);
633
+ int sis_read_current_version(Store *store);
634
+
635
+ /****************************************************************************
636
+ *
637
+ * IndexReader
638
+ *
639
+ ****************************************************************************/
640
+
641
+ enum FIELD_TYPE { // field-name selectors; no explicit values, so the enumerators number 0..8 in declaration order
642
+ // all fields
643
+ IR_ALL,
644
+ // all indexed fields
645
+ IR_INDEXED,
646
+ // all fields which are not indexed
647
+ IR_UNINDEXED,
648
+ // all fields which are indexed with termvectors enabled
649
+ IR_INDEXED_WITH_TERM_VECTOR,
650
+ // all fields which are indexed but don't have termvectors enabled
651
+ IR_INDEXED_NO_TERM_VECTOR,
652
+ // all fields where termvectors are enabled. Please note that only standard
653
+ // termvector fields are returned
654
+ IR_TERM_VECTOR,
655
+ // all fields with termvectors with positions enabled
656
+ IR_TERM_VECTOR_WITH_POSITION,
657
+ // all fields where termvectors with offsets are set
658
+ IR_TERM_VECTOR_WITH_OFFSET,
659
+ // all fields where termvectors with both offset and position values are set
660
+ IR_TERM_VECTOR_WITH_POSITION_OFFSET
661
+ };
662
+
663
+ struct IndexReader {
664
+ mutex_t mutex;
665
+ HshTable *cache;
666
+ HshTable *sort_cache;
667
+ void *data;
668
+ Store *store;
669
+ Lock *write_lock;
670
+ SegmentInfos *sis;
671
+ bool has_changes : 1;
672
+ bool is_stale : 1;
673
+ bool is_owner : 1;
674
+ bool close_store : 1;
675
+ TermVector *(*get_term_vector)(IndexReader *ir, int doc_num, char *field);
676
+ Array *(*get_term_vectors)(IndexReader *ir, int doc_num);
677
+ int (*num_docs)(IndexReader *ir);
678
+ int (*max_doc)(IndexReader *ir);
679
+ Document *(*get_doc)(IndexReader *ir, int doc_num);
680
+ uchar *(*get_norms)(IndexReader *ir, char *field);
681
+ uchar *(*get_norms_always)(IndexReader *ir, char *field);
682
+ void (*do_set_norm)(IndexReader *ir, int doc_num, char *field, uchar val);
683
+ void (*get_norms_into)(IndexReader *ir, char *field, uchar *buf, int offset);
684
+ TermEnum *(*terms)(IndexReader *ir);
685
+ TermEnum *(*terms_from)(IndexReader *ir, Term *term);
686
+ int (*doc_freq)(IndexReader *ir, Term *t);
687
+ TermDocEnum *(*term_docs)(IndexReader *ir);
688
+ TermDocEnum *(*term_positions)(IndexReader *ir);
689
+ void (*do_delete_doc)(IndexReader *ir, int doc_num);
690
+ void (*do_undelete_all)(IndexReader *ir);
691
+ bool (*is_deleted)(IndexReader *ir, int doc_num);
692
+ bool (*has_deletions)(IndexReader *ir);
693
+ bool (*has_norms)(IndexReader *ir, char *field);
694
+ HashSet *(*get_field_names)(IndexReader *ir, int field_type);
695
+ void (*do_commit)(IndexReader *ir);
696
+ void (*do_close)(IndexReader *ir);
697
+ void (*acquire_write_lock)(IndexReader *ir);
698
+ };
699
+
700
+ IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner, int close_store);
701
+ IndexReader *ir_open(Store *store, int close_store);
702
+ bool ir_index_exists(Store *store);
703
+ void ir_close(IndexReader *ir);
704
+ void ir_commit(IndexReader *ir);
705
+ void ir_delete_doc(IndexReader *ir, int doc_num);
706
+ void ir_undelete_all(IndexReader *ir);
707
+ void ir_set_norm(IndexReader *ir, int doc_num, char *field, uchar val);
708
+ void ir_destroy(void *p);
709
+ Document *ir_get_doc_with_term(IndexReader *ir, Term *term);
710
+ TermDocEnum *ir_term_docs_for(IndexReader *ir, Term *term);
711
+ TermDocEnum *ir_term_positions_for(IndexReader *ir, Term *term);
712
+ void ir_add_cache(IndexReader *ir);
713
+ bool ir_is_latest(IndexReader *ir);
714
+
715
+ /****************************************************************************
716
+ *
717
+ * Norm
718
+ *
719
+ ****************************************************************************/
720
+
721
+ typedef struct Norm {
722
+ bool is_dirty : 1;
723
+ int field_num;
724
+ InStream *is;
725
+ uchar *bytes;
726
+ } Norm;
727
+
728
+ /****************************************************************************
729
+ *
730
+ * SegmentReader
731
+ *
732
+ ****************************************************************************/
733
+
734
+ struct SegmentReader {
735
+ FieldInfos *fis;
736
+ FieldsReader *fr;
737
+ char *segment;
738
+ BitVector *deleted_docs;
739
+ bool deleted_docs_dirty : 1;
740
+ bool undelete_all : 1;
741
+ bool norms_dirty : 1;
742
+ InStream *freq_in;
743
+ InStream *prox_in;
744
+ TermInfosReader *tir;
745
+ TermVectorsReader *orig_tvr;
746
+ thread_key_t thread_tvr;
747
+ Array *tvr_bucket;
748
+ HshTable *norms;
749
+ Store *cfs_store;
750
+ uchar *fake_norms;
751
+ };
752
+
753
+ IndexReader *sr_open(SegmentInfos *sis, int si_num, int is_owner, int close_store);
754
+ IndexReader *sr_open_si(SegmentInfo *si);
755
+ //int sr_has_deletions(IndexReader *ir);
756
+
757
+ /****************************************************************************
758
+ *
759
+ * MultiReader
760
+ *
761
+ ****************************************************************************/
762
+
763
+ typedef struct MultiReader {
764
+ bool has_deletions : 1;
765
+ int max_doc;
766
+ int num_docs_cache;
767
+ int rcnt;
768
+ int *starts;
769
+ IndexReader **sub_readers;
770
+ HshTable *norms_cache;
771
+ } MultiReader;
772
+
773
+ IndexReader *mr_open(Store *store, SegmentInfos *sis, IndexReader **readers,
774
+ int rcnt, int close_store);
775
+
776
+ /****************************************************************************
777
+ *
778
+ * SegmentMergeInfo
779
+ *
780
+ ****************************************************************************/
781
+
782
+ typedef struct SegmentMergeInfo {
783
+ int base;
784
+ IndexReader *ir;
785
+ TermEnum *te;
786
+ TermBuffer *tb;
787
+ TermDocEnum *postings;
788
+ int *doc_map;
789
+ } SegmentMergeInfo;
790
+
791
+ SegmentMergeInfo *smi_create(int base, TermEnum *te, IndexReader *ir);
792
+ void smi_destroy(void *p);
793
+ TermBuffer *smi_next(SegmentMergeInfo *smi);
794
+ bool smi_lt(void *p1, void *p2);
795
+
796
+ /****************************************************************************
797
+ *
798
+ * SegmentMerger
799
+ *
800
+ ****************************************************************************/
801
+
802
+ typedef struct SegmentMerger {
803
+ Store *store;
804
+ char *name;
805
+ Array *readers;
806
+ FieldInfos *fis;
807
+ OutStream *freq_out;
808
+ OutStream *prox_out;
809
+ TermInfosWriter *tiw;
810
+ Term *terms_buf;
811
+ int terms_buf_pointer;
812
+ int terms_buf_size;
813
+ PriorityQueue *queue;
814
+ TermInfo *ti;
815
+ int term_index_interval;
816
+ OutStream *skip_buffer;
817
+ int skip_interval;
818
+ int last_skip_doc;
819
+ int last_skip_freq_pointer;
820
+ int last_skip_prox_pointer;
821
+ } SegmentMerger;
822
+
823
+ SegmentMerger *sm_create(Store *store, char *name, int term_index_interval);
824
+ void sm_destroy(void *p);
825
+ void sm_add(SegmentMerger *sm, IndexReader *ir);
826
+ int sm_merge(SegmentMerger *sm);
827
+ Array *sm_create_compound_file(SegmentMerger *sm, char *fname);
828
+
829
+
830
+ /****************************************************************************
831
+ *
832
+ * IndexWriter
833
+ *
834
+ ****************************************************************************/
835
+
836
+ #define WRITE_LOCK_NAME "write"
837
+ #define COMMIT_LOCK_NAME "commit"
838
+ struct IndexWriter {
839
+ mutex_t mutex;
840
+ int merge_factor;
841
+ int min_merge_docs;
842
+ int max_merge_docs;
843
+ int max_field_length;
844
+ int term_index_interval;
845
+ Store *store;
846
+ Analyzer *analyzer;
847
+ Similarity *similarity;
848
+ SegmentInfos *sis;
849
+ Store *ram_store;
850
+ Lock *write_lock;
851
+ bool close_store : 1;
852
+ bool close_analyzer : 1;
853
+ bool use_compound_file : 1;
854
+ };
855
+
856
+ IndexWriter *iw_open(Store *store, Analyzer *analyzer,
857
+ bool create, bool close_store, bool close_analyzer);
858
+ void iw_flush_ram_segments(IndexWriter *iw);
859
+ void iw_close(IndexWriter *iw);
860
+ int iw_doc_count(IndexWriter *iw);
861
+ void iw_add_doc(IndexWriter *iw, Document *doc);
862
+ void iw_optimize(IndexWriter *iw);
863
+ void iw_add_indexes(IndexWriter *iw, Store **stores, int cnt);
864
+ void iw_add_readers(IndexWriter *iw, IndexReader **stores, int cnt); /* NOTE(review): the IndexReader** parameter is named `stores`, the same name iw_add_indexes() uses for its Store** parameter; presumably it should read `readers` -- confirm */
865
+
866
+ /****************************************************************************
867
+ *
868
+ * CompoundWriter
869
+ *
870
+ ****************************************************************************/
871
+
872
+ typedef struct CompoundWriter {
873
+ Store *store;
874
+ const char *name;
875
+ HashSet *ids;
876
+ Array *file_entries;
877
+ bool merged;
878
+ } CompoundWriter;
879
+
880
+ CompoundWriter *open_cw(Store *store, char *name);
881
+ void cw_add_file(CompoundWriter *cw, char *id);
882
+ void cw_close(CompoundWriter *cw);
883
+
884
+ #endif