ferret 0.3.2 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/index.h ADDED
@@ -0,0 +1,884 @@
1
+ #ifndef FRT_INDEX_H
2
+ #define FRT_INDEX_H
3
+
4
+ #include <limits.h>
5
+ #include "global.h"
6
+ #include "array.h"
7
+ #include "bitvector.h"
8
+ #include "hashset.h"
9
+ #include "priorityqueue.h"
10
+ #include "hash.h"
11
+ #include "store.h"
12
+ #include "document.h"
13
+ #include "analysis.h"
14
+
15
+ #define SEGMENT_NAME_MAX_LENGTH 100
16
+
17
+ typedef struct Config {
18
+ int merge_factor;
19
+ int min_merge_docs;
20
+ int max_merge_docs;
21
+ int max_field_length;
22
+ int term_index_interval;
23
+ } FerretConfig;
24
+
25
+ extern FerretConfig config;
26
+
27
+ typedef struct IndexReader IndexReader;
28
+ typedef struct IndexWriter IndexWriter;
29
+ typedef struct SegmentReader SegmentReader;
30
+
31
+ /***************************************************************************
32
+ *
33
+ * CacheObject
34
+ *
35
+ ***************************************************************************/
36
+
37
+ typedef struct CacheObject {
38
+ HshTable *ref_tab1;
39
+ HshTable *ref_tab2;
40
+ void *ref1;
41
+ void *ref2;
42
+ void *obj;
43
+ void (*destroy)(void *p);
44
+ } CacheObject;
45
+
46
+ void cache_destroy(CacheObject *co);
47
+ CacheObject *co_create(HshTable *ref_tab1, HshTable *ref_tab2,
48
+ void *ref1, void *ref2, void (*destroy)(void *p), void *obj);
49
+ unsigned int co_hash(const void *key);
50
+ int co_eq(const void *key1, const void *key2);
51
+ HshTable *co_hsh_create();
52
+
53
+ /****************************************************************************
54
+ *
55
+ * FieldInfo
56
+ *
57
+ ****************************************************************************/
58
+
59
+ typedef struct FieldInfo {
60
+ char *name;
61
+ int number;
62
+ bool is_indexed : 1;
63
+ bool store_tv : 1;
64
+ bool store_offset : 1;
65
+ bool store_pos : 1;
66
+ bool omit_norms : 1;
67
+ } FieldInfo;
68
+
69
+ FieldInfo *fi_create(char *name,
70
+ int number,
71
+ bool is_indexed,
72
+ bool store_tv,
73
+ bool store_pos,
74
+ bool store_offset,
75
+ bool omit_norms);
76
+ void fi_destroy(void *p);
77
+
78
+ /****************************************************************************
79
+ *
80
+ * FieldInfos
81
+ *
82
+ ****************************************************************************/
83
+
84
+ typedef struct FieldInfos {
85
+ HashEntry **by_name;
86
+ FieldInfo **by_number;
87
+ int fcnt;
88
+ } FieldInfos;
89
+
90
+ FieldInfos *fis_create();
91
+ FieldInfos *fis_open(Store *store, char *filename);
92
+ void fis_destroy(void *p);
93
+ FieldInfo *fis_add(FieldInfos *fis,
94
+ char *name,
95
+ bool is_indexed,
96
+ bool store_tv,
97
+ bool store_offset,
98
+ bool store_pos,
99
+ bool omit_norms);
100
+
101
+ void fis_add_fields(FieldInfos *fis,
102
+ HashSet *field_names,
103
+ bool is_indexed,
104
+ bool store_tv,
105
+ bool store_offset,
106
+ bool store_pos,
107
+ bool omit_norms);
108
+ bool fis_has_vectors(FieldInfos *fis);
109
+ void fis_write(FieldInfos *fis, Store *store, char *segment, char *ext);
110
+ FieldInfos *fis_read(FieldInfos *fis, InStream *is);
111
+ FieldInfos *fis_add_doc(FieldInfos *fis, Document *doc);
112
+ unsigned long long fis_get_number(FieldInfos *fis, char *name);
113
+ FieldInfo *fis_get_fi(FieldInfos *fis, char *name);
114
+
115
+
116
+ /****************************************************************************
117
+ *
118
+ * Term
119
+ *
120
+ ****************************************************************************/
121
+
122
+ typedef struct Term {
123
+ char *field;
124
+ char *text;
125
+ } Term;
126
+
127
+ Term *term_clone(Term *term);
128
+ Term *term_create(const char *field, char *text);
129
+ void term_destroy(void *p);
130
+ int term_cmp(void *t1, void *t2);
131
+ int term_eq(const void *t1, const void *t2);
132
+ unsigned int term_hash(const void *t);
133
+ char *term_to_s(Term *term);
134
+
135
+ /****************************************************************************
136
+ *
137
+ * TermBuffer
138
+ *
139
+ ****************************************************************************/
140
+
141
+ typedef struct TermBuffer {
142
+ char *field;
143
+ char text[MAX_WORD_SIZE];
144
+ } TermBuffer;
145
+
146
+ TermBuffer *tb_create();
147
+ void tb_destroy(void *p);
148
+ TermBuffer *tb_set_term(TermBuffer *tb, Term *t);
149
+ Term *tb_get_term(TermBuffer *tb);
150
+ int tb_cmp(TermBuffer *tb1, TermBuffer *tb2);
151
+ int tb_term_cmp(TermBuffer *tb, Term *t);
152
+ TermBuffer *tb_cpy(TermBuffer *tb1, TermBuffer *tb2);
153
+ TermBuffer *tb_read(TermBuffer *tb, InStream *is, FieldInfos *fis);
154
+
155
+ /****************************************************************************
156
+ *
157
+ * TermInfo
158
+ *
159
+ ****************************************************************************/
160
+
161
+ typedef struct TermInfo {
162
+ int doc_freq;
163
+ int freq_pointer;
164
+ int prox_pointer;
165
+ int skip_offset;
166
+ } TermInfo;
167
+
168
+ TermInfo *ti_create(int doc_freq, int freq_pointer, int prox_pointer, int skip_offset);
169
+ TermInfo *ti_set(TermInfo *ti, int df, int fp, int pp, int so);
170
+ void ti_destroy(void *p);
171
+ TermInfo *ti_cpy(TermInfo *ti1, TermInfo *ti2);
172
+ TermInfo *ti_clone(TermInfo *other);
173
+ int ti_eq(TermInfo *ti1, TermInfo *ti2);
174
+
175
+ /****************************************************************************
176
+ *
177
+ * TermEnum
178
+ *
179
+ ****************************************************************************/
180
+
181
+ typedef struct TermEnumFilter TermEnumFilter;
182
+ typedef struct TermEnum TermEnum;
183
+ struct TermEnum {
184
+ void *data;
185
+ TermBuffer *(*next)(TermEnum *te);
186
+ void (*close)(TermEnum *te);
187
+ TermEnum *(*clone)(TermEnum *te);
188
+ TermBuffer *tb_curr;
189
+ TermBuffer *tb_prev;
190
+ TermInfo *ti_curr;
191
+ };
192
+
193
+ TermBuffer *te_skip_to(struct TermEnum *te, Term *t);
194
+
195
+ Term *te_get_term(struct TermEnum *te);
196
+ TermInfo *te_get_ti(struct TermEnum *te);
197
+
198
+ /* * SegmentTermEnum * */
199
+
200
+ typedef struct SegmentTermEnum {
201
+ FieldInfos *fis;
202
+ int is_index;
203
+ InStream *is;
204
+ int size;
205
+ int pos;
206
+ int index_pointer;
207
+ int index_interval;
208
+ int skip_interval;
209
+ int format_m1skip_interval;
210
+ int format;
211
+ } SegmentTermEnum;
212
+
213
+
214
+ TermEnum *ste_create(InStream *is, FieldInfos *fis, int is_index);
215
+ TermBuffer *ste_next(struct TermEnum *te);
216
+ void ste_close(struct TermEnum *te);
217
+
218
+ /* * MultiTermEnum * */
219
+
220
+ typedef struct MultiTermEnum {
221
+ int doc_freq;
222
+ PriorityQueue *smi_queue;
223
+ } MultiTermEnum;
224
+
225
+ TermEnum *mte_create(IndexReader **readers, int *starts, int rcnt, Term *term);
226
+
227
+ /****************************************************************************
228
+ *
229
+ * TermInfosWriter
230
+ *
231
+ ****************************************************************************/
232
+
233
+ #define TERM_INFO_FORMAT -2
234
+
235
+ typedef struct TermInfosWriter {
236
+ int index_interval;
237
+ int skip_interval;
238
+ int size;
239
+ int last_index_pointer;
240
+ bool is_index;
241
+ OutStream *os;
242
+ struct TermInfosWriter *other;
243
+ Term *last_term;
244
+ TermInfo *last_term_info;
245
+ FieldInfos *fis;
246
+ char *curr_field;
247
+ int curr_field_num;
248
+ } TermInfosWriter;
249
+
250
+ TermInfosWriter *tiw_open(Store *store,
251
+ char *segment,
252
+ FieldInfos *fis,
253
+ int interval);
254
+ void tiw_add(TermInfosWriter *tiw, Term *t, TermInfo *ti);
255
+ void tiw_close(TermInfosWriter *tiw);
256
+
257
+ /****************************************************************************
258
+ *
259
+ * TermInfosReader
260
+ *
261
+ ****************************************************************************/
262
+
263
+ typedef struct TermInfosReader {
264
+ mutex_t mutex;
265
+ TermEnum *orig_te;
266
+ thread_key_t thread_te;
267
+ Array *te_bucket;
268
+ TermEnum *index_te;
269
+ int size;
270
+ int skip_interval;
271
+ int index_size;
272
+ Term **index_terms;
273
+ TermInfo **index_term_infos;
274
+ int *index_pointers;
275
+ } TermInfosReader;
276
+
277
+ TermInfosReader *tir_open(Store *store, char *segment, FieldInfos *fis);
278
+ void tir_close(TermInfosReader *tir);
279
+ Term *tir_get_term(TermInfosReader *tir, int position);
280
+ int tir_get_term_pos(TermInfosReader *tir, Term *t);
281
+ TermInfo *tir_get_ti(TermInfosReader *tir, Term *t);
282
+
283
+ /****************************************************************************
284
+ *
285
+ * TVOffsetInfo
286
+ *
287
+ ****************************************************************************/
288
+
289
+ typedef struct TVOffsetInfo {
290
+ int start;
291
+ int end;
292
+ } TVOffsetInfo;
293
+
294
+ TVOffsetInfo *tvoi_create(int start, int end);
295
+ void tvoi_destroy(void *p);
296
+
297
+ /****************************************************************************
298
+ *
299
+ * TVField
300
+ *
301
+ ****************************************************************************/
302
+
303
+ typedef struct TVField {
304
+ int tvf_pointer;
305
+ int number;
306
+ unsigned int store_positions : 1;
307
+ unsigned int store_offsets : 1;
308
+ } TVField;
309
+
310
+ TVField *tvf_create(int number, int store_positions, int store_offsets);
311
+ void tvf_destroy(void *p);
312
+
313
+ /****************************************************************************
314
+ *
315
+ * TVTerm
316
+ *
317
+ ****************************************************************************/
318
+
319
+ typedef struct TVTerm {
320
+ char *text;
321
+ int freq;
322
+ int *positions;
323
+ TVOffsetInfo **offsets;
324
+ } TVTerm;
325
+
326
+ TVTerm *tvt_create(char *text,
327
+ int freq,
328
+ int *positions,
329
+ TVOffsetInfo **offsets);
330
+ void tvt_destroy(void *p);
331
+
332
+ /****************************************************************************
333
+ *
334
+ * TermVector
335
+ *
336
+ ****************************************************************************/
337
+
338
+ typedef struct TermVector {
339
+ char *field;
340
+ char **terms;
341
+ int tcnt;
342
+ int *freqs;
343
+ int **positions;
344
+ TVOffsetInfo ***offsets;
345
+ } TermVector;
346
+
347
+ TermVector *tv_create(const char *field,
348
+ char **terms,
349
+ int tcnt,
350
+ int *freqs,
351
+ int **positions,
352
+ TVOffsetInfo ***offsets);
353
+ void tv_destroy(void *p);
354
+
355
+ /****************************************************************************
356
+ *
357
+ * TermVectorsWriter
358
+ *
359
+ ****************************************************************************/
360
+
361
+ #define STORE_POSITIONS_WITH_TERMVECTOR 0x1
362
+ #define STORE_OFFSET_WITH_TERMVECTOR 0x2
363
+
364
+ #define FORMAT_VERSION 2
365
+ #define FORMAT_SIZE 4
366
+
367
+ #define TVX_EXTENSION ".tvx"
368
+ #define TVD_EXTENSION ".tvd"
369
+ #define TVF_EXTENSION ".tvf"
370
+
371
+ typedef struct TermVectorsWriter {
372
+ TVField *curr_field;
373
+ int curr_doc_pointer;
374
+ OutStream *tvx;
375
+ OutStream *tvd;
376
+ OutStream *tvf;
377
+ FieldInfos *fis;
378
+ TVField **fields;
379
+ int fcnt;
380
+ int fsize;
381
+ TVTerm **terms;
382
+ int tcnt;
383
+ int tsize;
384
+ } TermVectorsWriter;
385
+
386
+ TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis);
387
+ void tvw_close(TermVectorsWriter *tvw);
388
+ void tvw_open_doc(TermVectorsWriter *tvw);
389
+ void tvw_close_doc(TermVectorsWriter *tvw);
390
+ void tvw_open_field(TermVectorsWriter *tvw, char *field);
391
+ void tvw_close_field(TermVectorsWriter *tvw);
392
+ void tvw_add_term(TermVectorsWriter *tvw, char *text, int freq, int *positions, TVOffsetInfo **offsets);
393
+ void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors);
394
+
395
+
396
+ /****************************************************************************
397
+ *
398
+ * TermVectorsReader
399
+ *
400
+ ****************************************************************************/
401
+
402
+ typedef struct TermVectorsReader {
403
+ int size;
404
+ InStream *tvx;
405
+ InStream *tvd;
406
+ InStream *tvf;
407
+ FieldInfos *fis;
408
+ int tvd_format;
409
+ int tvf_format;
410
+ } TermVectorsReader;
411
+
412
+ TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis);
413
+ TermVectorsReader *tvr_clone(TermVectorsReader *orig);
414
+ void tvr_close(TermVectorsReader *tvr);
415
+ TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
416
+ char *field, int tvf_pointer);
417
+ Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num);
418
+ TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field);
419
+
420
+ /****************************************************************************
421
+ *
422
+ * FieldsWriter
423
+ *
424
+ ****************************************************************************/
425
+
426
+ #define FIELD_IS_TOKENIZED 0X1
427
+ #define FIELD_IS_BINARY 0X2
428
+ #define FIELD_IS_COMPRESSED 0X4
429
+
430
+ typedef struct FieldsWriter {
431
+ FieldInfos *fis;
432
+ OutStream *fields_out;
433
+ OutStream *index_out;
434
+ } FieldsWriter;
435
+
436
+ FieldsWriter *fw_open(Store *store, char *segment, FieldInfos *fis);
437
+ void fw_close(FieldsWriter *fw);
438
+ void fw_add_doc(FieldsWriter *fw, Document *doc);
439
+
440
+ /****************************************************************************
441
+ *
442
+ * TermDocEnum
443
+ *
444
+ ****************************************************************************/
445
+
446
+ typedef struct TermDocEnum TermDocEnum;
447
+ struct TermDocEnum {
448
+ void *data;
449
+ void (*seek)(TermDocEnum *tde, Term *term);
450
+ int (*doc_num)(TermDocEnum *tde);
451
+ int (*freq)(TermDocEnum *tde);
452
+ bool (*next)(TermDocEnum *tde);
453
+ int (*read)(TermDocEnum *tde, int *docs, int *freqs, int req_num);
454
+ bool (*skip_to)(TermDocEnum *tde, int target);
455
+ int (*next_position)(TermDocEnum *tde);
456
+ void (*close)(TermDocEnum *tde);
457
+ };
458
+
459
+ /* * SegmentTermDocEnum * */
460
+
461
+ typedef struct SegmentTermDocEnum SegmentTermDocEnum;
462
+ struct SegmentTermDocEnum {
463
+ SegmentReader *parent;
464
+ InStream *freq_in;
465
+ int count; // the number of docs for this term that we have skipped
466
+ int doc_freq; // the number of docs this term appears in
467
+ BitVector *deleted_docs;
468
+ int doc_num;
469
+ int freq;
470
+ int skip_interval;
471
+ int num_skips;
472
+ int skip_count;
473
+ InStream *skip_in;
474
+ int skip_doc;
475
+ int freq_pointer;
476
+ int prox_pointer;
477
+ int skip_pointer;
478
+ unsigned int have_skipped : 1;
479
+ void (*skip_prox)(SegmentTermDocEnum *stde);
480
+ InStream *prox_in;
481
+ int prox_cnt;
482
+ int position;
483
+ void (*seek_prox)(SegmentTermDocEnum *stde, int prox_pointer);
484
+ };
485
+
486
+ TermDocEnum *stde_create(IndexReader *ir);
487
+ void stde_seek_ti(TermDocEnum *tde, TermInfo *ti);
488
+
489
+ /* * SegmentTermPosEnum * */
490
+ TermDocEnum *stpe_create(IndexReader *ir);
491
+
492
+ /* * MultiTermDocEnum * */
493
+ typedef struct MultiTermDocEnum MultiTermDocEnum;
494
+ struct MultiTermDocEnum {
495
+ IndexReader **irs;
496
+ int *starts;
497
+ int ir_cnt;
498
+ Term *term;
499
+ int base;
500
+ int pointer;
501
+ TermDocEnum **irs_tde;
502
+ TermDocEnum *curr_tde;
503
+ TermDocEnum *(*term_docs_from_reader)(IndexReader *ir);
504
+ };
505
+
506
+ TermDocEnum *mtde_create(IndexReader **readers, int *starts, int ir_cnt);
507
+
508
+ /* * MultiTermPosEnum * */
509
+ TermDocEnum *mtpe_create(IndexReader **readers, int *starts, int ir_cnt);
510
+
511
+ /****************************************************************************
512
+ * MultipleTermDocPosEnum
513
+ ****************************************************************************/
514
+
515
+ #define MTDPE_POS_QUEUE_INIT_CAPA 8
516
+ typedef struct {
517
+ int doc_num;
518
+ int freq;
519
+ PriorityQueue *pq;
520
+ int *pos_queue;
521
+ int pos_queue_index;
522
+ int pos_queue_capa;
523
+ } MultipleTermDocPosEnum;
524
+
525
+ TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt);
526
+
527
+ /****************************************************************************
528
+ *
529
+ * FieldsReader
530
+ *
531
+ ****************************************************************************/
532
+
533
+ typedef struct FieldsReader {
534
+ int len;
535
+ FieldInfos *fis;
536
+ InStream *fields_in;
537
+ InStream *index_in;
538
+ } FieldsReader;
539
+
540
+ FieldsReader *fr_open(Store *store, char *segment, FieldInfos *fis);
541
+ void fr_close(FieldsReader *fr);
542
+ Document *fr_get_doc(FieldsReader *fr, int doc_num);
543
+
544
+ /****************************************************************************
545
+ *
546
+ * Posting
547
+ *
548
+ ****************************************************************************/
549
+
550
+ typedef struct Posting {
551
+ Term *term;
552
+ int freq;
553
+ int size;
554
+ int *positions;
555
+ TVOffsetInfo **offsets;
556
+ } Posting;
557
+
558
+ Posting *p_create(Term *term, int position, TVOffsetInfo *offset);
559
+ void p_destroy(void *p);
560
+ void p_add_occurance(Posting *p, int position, TVOffsetInfo *offset);
561
+
562
+
563
+ /****************************************************************************
564
+ *
565
+ * DocumentWriter
566
+ *
567
+ ****************************************************************************/
568
+
569
+ #include "search.h"
570
+
571
+ typedef struct DocumentWriter {
572
+ Store *store;
573
+ Analyzer *analyzer;
574
+ Similarity *similarity;
575
+ HshTable *postingtable;
576
+ int pcnt;
577
+ FieldInfos *fis;
578
+ float *field_boosts;
579
+ int *field_lengths;
580
+ int *field_positions;
581
+ int *field_offsets;
582
+ int max_field_length;
583
+ int term_index_interval;
584
+ } DocumentWriter;
585
+
586
+ DocumentWriter *dw_open(Store *store, Analyzer *analyzer,
587
+ Similarity *similarity, int max_field_length, int term_index_interval);
588
+ void dw_close(DocumentWriter *dw);
589
+ void dw_add_doc(DocumentWriter *dw, char *segment, Document *doc);
590
+
591
+ /****************************************************************************
592
+ *
593
+ * SegmentInfo
594
+ *
595
+ ****************************************************************************/
596
+
597
+ typedef struct SegmentInfo {
598
+ char *name;
599
+ int doc_cnt;
600
+ Store *store;
601
+ } SegmentInfo;
602
+
603
+ SegmentInfo *si_create(char *name, int doc_cnt, Store *store);
604
+ void si_destroy(void *p);
605
+ bool si_has_deletions(SegmentInfo *si);
606
+ bool si_uses_compound_file(SegmentInfo *si);
607
+ bool si_has_separate_norms(SegmentInfo *si);
608
+
609
+ /****************************************************************************
610
+ *
611
+ * SegmentInfos
612
+ *
613
+ ****************************************************************************/
614
+
615
+ typedef struct SegmentInfos {
616
+ Store *store;
617
+ SegmentInfo **segs;
618
+ int scnt;
619
+ int size;
620
+ int counter;
621
+ unsigned int version;
622
+ int format;
623
+ } SegmentInfos;
624
+
625
+ SegmentInfos *sis_create();
626
+ void sis_destroy(void *p);
627
+ void sis_add_si(SegmentInfos *sis, SegmentInfo *si);
628
+ void sis_del_at(SegmentInfos *sis, int at);
629
+ void sis_del_from_to(SegmentInfos *sis, int from, int to);
630
+ void sis_clear(SegmentInfos *sis);
631
+ void sis_read(SegmentInfos *sis, Store *store);
632
+ void sis_write(SegmentInfos *sis, Store *store);
633
+ int sis_read_current_version(Store *store);
634
+
635
+ /****************************************************************************
636
+ *
637
+ * IndexReader
638
+ *
639
+ ****************************************************************************/
640
+
641
+ enum FIELD_TYPE {
642
+ // all fields
643
+ IR_ALL,
644
+ // all indexed fields
645
+ IR_INDEXED,
646
+ // all fields which are not indexed
647
+ IR_UNINDEXED,
648
+ // all fields which are indexed with termvectors enabled
649
+ IR_INDEXED_WITH_TERM_VECTOR,
650
+ // all fields which are indexed but don't have termvectors enabled
651
+ IR_INDEXED_NO_TERM_VECTOR,
652
+ // all fields where termvectors are enabled. Please note that only standard
653
+ // termvector fields are returned
654
+ IR_TERM_VECTOR,
655
+ // all fields with termvectors with positions enabled
656
+ IR_TERM_VECTOR_WITH_POSITION,
657
+ // all fields where termvectors with offsets are set
658
+ IR_TERM_VECTOR_WITH_OFFSET,
659
+ // all fields where termvectors with offset and position values are set
660
+ IR_TERM_VECTOR_WITH_POSITION_OFFSET
661
+ };
662
+
663
+ struct IndexReader {
664
+ mutex_t mutex;
665
+ HshTable *cache;
666
+ HshTable *sort_cache;
667
+ void *data;
668
+ Store *store;
669
+ Lock *write_lock;
670
+ SegmentInfos *sis;
671
+ bool has_changes : 1;
672
+ bool is_stale : 1;
673
+ bool is_owner : 1;
674
+ bool close_store : 1;
675
+ TermVector *(*get_term_vector)(IndexReader *ir, int doc_num, char *field);
676
+ Array *(*get_term_vectors)(IndexReader *ir, int doc_num);
677
+ int (*num_docs)(IndexReader *ir);
678
+ int (*max_doc)(IndexReader *ir);
679
+ Document *(*get_doc)(IndexReader *ir, int doc_num);
680
+ uchar *(*get_norms)(IndexReader *ir, char *field);
681
+ uchar *(*get_norms_always)(IndexReader *ir, char *field);
682
+ void (*do_set_norm)(IndexReader *ir, int doc_num, char *field, uchar val);
683
+ void (*get_norms_into)(IndexReader *ir, char *field, uchar *buf, int offset);
684
+ TermEnum *(*terms)(IndexReader *ir);
685
+ TermEnum *(*terms_from)(IndexReader *ir, Term *term);
686
+ int (*doc_freq)(IndexReader *ir, Term *t);
687
+ TermDocEnum *(*term_docs)(IndexReader *ir);
688
+ TermDocEnum *(*term_positions)(IndexReader *ir);
689
+ void (*do_delete_doc)(IndexReader *ir, int doc_num);
690
+ void (*do_undelete_all)(IndexReader *ir);
691
+ bool (*is_deleted)(IndexReader *ir, int doc_num);
692
+ bool (*has_deletions)(IndexReader *ir);
693
+ bool (*has_norms)(IndexReader *ir, char *field);
694
+ HashSet *(*get_field_names)(IndexReader *ir, int field_type);
695
+ void (*do_commit)(IndexReader *ir);
696
+ void (*do_close)(IndexReader *ir);
697
+ void (*acquire_write_lock)(IndexReader *ir);
698
+ };
699
+
700
+ IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner, int close_store);
701
+ IndexReader *ir_open(Store *store, int close_store);
702
+ bool ir_index_exists(Store *store);
703
+ void ir_close(IndexReader *ir);
704
+ void ir_commit(IndexReader *ir);
705
+ void ir_delete_doc(IndexReader *ir, int doc_num);
706
+ void ir_undelete_all(IndexReader *ir);
707
+ void ir_set_norm(IndexReader *ir, int doc_num, char *field, uchar val);
708
+ void ir_destroy(void *p);
709
+ Document *ir_get_doc_with_term(IndexReader *ir, Term *term);
710
+ TermDocEnum *ir_term_docs_for(IndexReader *ir, Term *term);
711
+ TermDocEnum *ir_term_positions_for(IndexReader *ir, Term *term);
712
+ void ir_add_cache(IndexReader *ir);
713
+ bool ir_is_latest(IndexReader *ir);
714
+
715
+ /****************************************************************************
716
+ *
717
+ * Norm
718
+ *
719
+ ****************************************************************************/
720
+
721
+ typedef struct Norm {
722
+ bool is_dirty : 1;
723
+ int field_num;
724
+ InStream *is;
725
+ uchar *bytes;
726
+ } Norm;
727
+
728
+ /****************************************************************************
729
+ *
730
+ * SegmentReader
731
+ *
732
+ ****************************************************************************/
733
+
734
+ struct SegmentReader {
735
+ FieldInfos *fis;
736
+ FieldsReader *fr;
737
+ char *segment;
738
+ BitVector *deleted_docs;
739
+ bool deleted_docs_dirty : 1;
740
+ bool undelete_all : 1;
741
+ bool norms_dirty : 1;
742
+ InStream *freq_in;
743
+ InStream *prox_in;
744
+ TermInfosReader *tir;
745
+ TermVectorsReader *orig_tvr;
746
+ thread_key_t thread_tvr;
747
+ Array *tvr_bucket;
748
+ HshTable *norms;
749
+ Store *cfs_store;
750
+ uchar *fake_norms;
751
+ };
752
+
753
+ IndexReader *sr_open(SegmentInfos *sis, int si_num, int is_owner, int close_store);
754
+ IndexReader *sr_open_si(SegmentInfo *si);
755
+ //int sr_has_deletions(IndexReader *ir);
756
+
757
+ /****************************************************************************
758
+ *
759
+ * MultiReader
760
+ *
761
+ ****************************************************************************/
762
+
763
+ typedef struct MultiReader {
764
+ bool has_deletions : 1;
765
+ int max_doc;
766
+ int num_docs_cache;
767
+ int rcnt;
768
+ int *starts;
769
+ IndexReader **sub_readers;
770
+ HshTable *norms_cache;
771
+ } MultiReader;
772
+
773
+ IndexReader *mr_open(Store *store, SegmentInfos *sis, IndexReader **readers,
774
+ int rcnt, int close_store);
775
+
776
+ /****************************************************************************
777
+ *
778
+ * SegmentMergeInfo
779
+ *
780
+ ****************************************************************************/
781
+
782
+ typedef struct SegmentMergeInfo {
783
+ int base;
784
+ IndexReader *ir;
785
+ TermEnum *te;
786
+ TermBuffer *tb;
787
+ TermDocEnum *postings;
788
+ int *doc_map;
789
+ } SegmentMergeInfo;
790
+
791
+ SegmentMergeInfo *smi_create(int base, TermEnum *te, IndexReader *ir);
792
+ void smi_destroy(void *p);
793
+ TermBuffer *smi_next(SegmentMergeInfo *smi);
794
+ bool smi_lt(void *p1, void *p2);
795
+
796
+ /****************************************************************************
797
+ *
798
+ * SegmentMerger
799
+ *
800
+ ****************************************************************************/
801
+
802
+ typedef struct SegmentMerger {
803
+ Store *store;
804
+ char *name;
805
+ Array *readers;
806
+ FieldInfos *fis;
807
+ OutStream *freq_out;
808
+ OutStream *prox_out;
809
+ TermInfosWriter *tiw;
810
+ Term *terms_buf;
811
+ int terms_buf_pointer;
812
+ int terms_buf_size;
813
+ PriorityQueue *queue;
814
+ TermInfo *ti;
815
+ int term_index_interval;
816
+ OutStream *skip_buffer;
817
+ int skip_interval;
818
+ int last_skip_doc;
819
+ int last_skip_freq_pointer;
820
+ int last_skip_prox_pointer;
821
+ } SegmentMerger;
822
+
823
+ SegmentMerger *sm_create(Store *store, char *name, int term_index_interval);
824
+ void sm_destroy(void *p);
825
+ void sm_add(SegmentMerger *sm, IndexReader *ir);
826
+ int sm_merge(SegmentMerger *sm);
827
+ Array *sm_create_compound_file(SegmentMerger *sm, char *fname);
828
+
829
+
830
+ /****************************************************************************
831
+ *
832
+ * IndexWriter
833
+ *
834
+ ****************************************************************************/
835
+
836
+ #define WRITE_LOCK_NAME "write"
837
+ #define COMMIT_LOCK_NAME "commit"
838
+ struct IndexWriter {
839
+ mutex_t mutex;
840
+ int merge_factor;
841
+ int min_merge_docs;
842
+ int max_merge_docs;
843
+ int max_field_length;
844
+ int term_index_interval;
845
+ Store *store;
846
+ Analyzer *analyzer;
847
+ Similarity *similarity;
848
+ SegmentInfos *sis;
849
+ Store *ram_store;
850
+ Lock *write_lock;
851
+ bool close_store : 1;
852
+ bool close_analyzer : 1;
853
+ bool use_compound_file : 1;
854
+ };
855
+
856
+ IndexWriter *iw_open(Store *store, Analyzer *analyzer,
857
+ bool create, bool close_store, bool close_analyzer);
858
+ void iw_flush_ram_segments(IndexWriter *iw);
859
+ void iw_close(IndexWriter *iw);
860
+ int iw_doc_count(IndexWriter *iw);
861
+ void iw_add_doc(IndexWriter *iw, Document *doc);
862
+ void iw_optimize(IndexWriter *iw);
863
+ void iw_add_indexes(IndexWriter *iw, Store **stores, int cnt);
864
+ void iw_add_readers(IndexWriter *iw, IndexReader **stores, int cnt);
865
+
866
+ /****************************************************************************
867
+ *
868
+ * CompoundWriter
869
+ *
870
+ ****************************************************************************/
871
+
872
+ typedef struct CompoundWriter {
873
+ Store *store;
874
+ const char *name;
875
+ HashSet *ids;
876
+ Array *file_entries;
877
+ bool merged;
878
+ } CompoundWriter;
879
+
880
+ CompoundWriter *open_cw(Store *store, char *name);
881
+ void cw_add_file(CompoundWriter *cw, char *id);
882
+ void cw_close(CompoundWriter *cw);
883
+
884
+ #endif