sdsykes-ferret 0.11.6.19

Sign up to get free protection for your applications and to get access to all the features.
Files changed (195) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +102 -0
  4. data/Rakefile +338 -0
  5. data/TODO +17 -0
  6. data/TUTORIAL +231 -0
  7. data/bin/ferret-browser +79 -0
  8. data/ext/analysis.c +1555 -0
  9. data/ext/analysis.h +219 -0
  10. data/ext/api.c +69 -0
  11. data/ext/api.h +27 -0
  12. data/ext/array.c +123 -0
  13. data/ext/array.h +53 -0
  14. data/ext/bitvector.c +540 -0
  15. data/ext/bitvector.h +272 -0
  16. data/ext/compound_io.c +383 -0
  17. data/ext/config.h +42 -0
  18. data/ext/document.c +156 -0
  19. data/ext/document.h +53 -0
  20. data/ext/except.c +120 -0
  21. data/ext/except.h +168 -0
  22. data/ext/extconf.rb +14 -0
  23. data/ext/ferret.c +402 -0
  24. data/ext/ferret.h +91 -0
  25. data/ext/filter.c +156 -0
  26. data/ext/fs_store.c +483 -0
  27. data/ext/global.c +418 -0
  28. data/ext/global.h +117 -0
  29. data/ext/hash.c +567 -0
  30. data/ext/hash.h +473 -0
  31. data/ext/hashset.c +170 -0
  32. data/ext/hashset.h +187 -0
  33. data/ext/header.h +58 -0
  34. data/ext/helper.c +62 -0
  35. data/ext/helper.h +13 -0
  36. data/ext/inc/lang.h +48 -0
  37. data/ext/inc/threading.h +31 -0
  38. data/ext/index.c +6425 -0
  39. data/ext/index.h +961 -0
  40. data/ext/lang.h +66 -0
  41. data/ext/libstemmer.c +92 -0
  42. data/ext/libstemmer.h +79 -0
  43. data/ext/mempool.c +87 -0
  44. data/ext/mempool.h +35 -0
  45. data/ext/modules.h +162 -0
  46. data/ext/multimapper.c +310 -0
  47. data/ext/multimapper.h +51 -0
  48. data/ext/posh.c +1006 -0
  49. data/ext/posh.h +1007 -0
  50. data/ext/priorityqueue.c +151 -0
  51. data/ext/priorityqueue.h +143 -0
  52. data/ext/q_boolean.c +1608 -0
  53. data/ext/q_const_score.c +161 -0
  54. data/ext/q_filtered_query.c +209 -0
  55. data/ext/q_fuzzy.c +268 -0
  56. data/ext/q_match_all.c +148 -0
  57. data/ext/q_multi_term.c +677 -0
  58. data/ext/q_parser.c +2825 -0
  59. data/ext/q_phrase.c +1126 -0
  60. data/ext/q_prefix.c +100 -0
  61. data/ext/q_range.c +350 -0
  62. data/ext/q_span.c +2402 -0
  63. data/ext/q_term.c +337 -0
  64. data/ext/q_wildcard.c +171 -0
  65. data/ext/r_analysis.c +2575 -0
  66. data/ext/r_index.c +3472 -0
  67. data/ext/r_qparser.c +585 -0
  68. data/ext/r_search.c +4105 -0
  69. data/ext/r_store.c +513 -0
  70. data/ext/r_utils.c +963 -0
  71. data/ext/ram_store.c +471 -0
  72. data/ext/search.c +1741 -0
  73. data/ext/search.h +885 -0
  74. data/ext/similarity.c +150 -0
  75. data/ext/similarity.h +82 -0
  76. data/ext/sort.c +983 -0
  77. data/ext/stem_ISO_8859_1_danish.c +338 -0
  78. data/ext/stem_ISO_8859_1_danish.h +16 -0
  79. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  80. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  81. data/ext/stem_ISO_8859_1_english.c +1156 -0
  82. data/ext/stem_ISO_8859_1_english.h +16 -0
  83. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  84. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  85. data/ext/stem_ISO_8859_1_french.c +1276 -0
  86. data/ext/stem_ISO_8859_1_french.h +16 -0
  87. data/ext/stem_ISO_8859_1_german.c +512 -0
  88. data/ext/stem_ISO_8859_1_german.h +16 -0
  89. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  90. data/ext/stem_ISO_8859_1_italian.h +16 -0
  91. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  92. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  93. data/ext/stem_ISO_8859_1_porter.c +776 -0
  94. data/ext/stem_ISO_8859_1_porter.h +16 -0
  95. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  96. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  97. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  98. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  99. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  100. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  101. data/ext/stem_KOI8_R_russian.c +701 -0
  102. data/ext/stem_KOI8_R_russian.h +16 -0
  103. data/ext/stem_UTF_8_danish.c +344 -0
  104. data/ext/stem_UTF_8_danish.h +16 -0
  105. data/ext/stem_UTF_8_dutch.c +653 -0
  106. data/ext/stem_UTF_8_dutch.h +16 -0
  107. data/ext/stem_UTF_8_english.c +1176 -0
  108. data/ext/stem_UTF_8_english.h +16 -0
  109. data/ext/stem_UTF_8_finnish.c +808 -0
  110. data/ext/stem_UTF_8_finnish.h +16 -0
  111. data/ext/stem_UTF_8_french.c +1296 -0
  112. data/ext/stem_UTF_8_french.h +16 -0
  113. data/ext/stem_UTF_8_german.c +526 -0
  114. data/ext/stem_UTF_8_german.h +16 -0
  115. data/ext/stem_UTF_8_italian.c +1113 -0
  116. data/ext/stem_UTF_8_italian.h +16 -0
  117. data/ext/stem_UTF_8_norwegian.c +302 -0
  118. data/ext/stem_UTF_8_norwegian.h +16 -0
  119. data/ext/stem_UTF_8_porter.c +794 -0
  120. data/ext/stem_UTF_8_porter.h +16 -0
  121. data/ext/stem_UTF_8_portuguese.c +1055 -0
  122. data/ext/stem_UTF_8_portuguese.h +16 -0
  123. data/ext/stem_UTF_8_russian.c +709 -0
  124. data/ext/stem_UTF_8_russian.h +16 -0
  125. data/ext/stem_UTF_8_spanish.c +1137 -0
  126. data/ext/stem_UTF_8_spanish.h +16 -0
  127. data/ext/stem_UTF_8_swedish.c +313 -0
  128. data/ext/stem_UTF_8_swedish.h +16 -0
  129. data/ext/stopwords.c +401 -0
  130. data/ext/store.c +692 -0
  131. data/ext/store.h +777 -0
  132. data/ext/term_vectors.c +352 -0
  133. data/ext/threading.h +31 -0
  134. data/ext/utilities.c +446 -0
  135. data/ext/win32.h +54 -0
  136. data/lib/ferret.rb +29 -0
  137. data/lib/ferret/browser.rb +246 -0
  138. data/lib/ferret/browser/s/global.js +192 -0
  139. data/lib/ferret/browser/s/style.css +148 -0
  140. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  141. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  142. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  143. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  144. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  145. data/lib/ferret/browser/views/layout.rhtml +22 -0
  146. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  147. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  148. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  149. data/lib/ferret/browser/webrick.rb +14 -0
  150. data/lib/ferret/document.rb +130 -0
  151. data/lib/ferret/field_infos.rb +44 -0
  152. data/lib/ferret/index.rb +786 -0
  153. data/lib/ferret/number_tools.rb +157 -0
  154. data/lib/ferret_version.rb +3 -0
  155. data/setup.rb +1555 -0
  156. data/test/test_all.rb +5 -0
  157. data/test/test_helper.rb +24 -0
  158. data/test/threading/number_to_spoken.rb +132 -0
  159. data/test/threading/thread_safety_index_test.rb +79 -0
  160. data/test/threading/thread_safety_read_write_test.rb +76 -0
  161. data/test/threading/thread_safety_test.rb +133 -0
  162. data/test/unit/analysis/tc_analyzer.rb +548 -0
  163. data/test/unit/analysis/tc_token_stream.rb +646 -0
  164. data/test/unit/index/tc_index.rb +762 -0
  165. data/test/unit/index/tc_index_reader.rb +699 -0
  166. data/test/unit/index/tc_index_writer.rb +437 -0
  167. data/test/unit/index/th_doc.rb +315 -0
  168. data/test/unit/largefile/tc_largefile.rb +46 -0
  169. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  170. data/test/unit/search/tc_filter.rb +135 -0
  171. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  172. data/test/unit/search/tc_index_searcher.rb +61 -0
  173. data/test/unit/search/tc_multi_searcher.rb +128 -0
  174. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  175. data/test/unit/search/tc_search_and_sort.rb +179 -0
  176. data/test/unit/search/tc_sort.rb +49 -0
  177. data/test/unit/search/tc_sort_field.rb +27 -0
  178. data/test/unit/search/tc_spans.rb +190 -0
  179. data/test/unit/search/tm_searcher.rb +384 -0
  180. data/test/unit/store/tc_fs_store.rb +77 -0
  181. data/test/unit/store/tc_ram_store.rb +35 -0
  182. data/test/unit/store/tm_store.rb +34 -0
  183. data/test/unit/store/tm_store_lock.rb +68 -0
  184. data/test/unit/tc_document.rb +81 -0
  185. data/test/unit/ts_analysis.rb +2 -0
  186. data/test/unit/ts_index.rb +2 -0
  187. data/test/unit/ts_largefile.rb +4 -0
  188. data/test/unit/ts_query_parser.rb +2 -0
  189. data/test/unit/ts_search.rb +2 -0
  190. data/test/unit/ts_store.rb +2 -0
  191. data/test/unit/ts_utils.rb +2 -0
  192. data/test/unit/utils/tc_bit_vector.rb +295 -0
  193. data/test/unit/utils/tc_number_tools.rb +117 -0
  194. data/test/unit/utils/tc_priority_queue.rb +106 -0
  195. metadata +285 -0
@@ -0,0 +1,961 @@
1
+ #ifndef FRT_INDEX_H
2
+ #define FRT_INDEX_H
3
+
4
+ #include "global.h"
5
+ #include "document.h"
6
+ #include "analysis.h"
7
+ #include "hash.h"
8
+ #include "hashset.h"
9
+ #include "store.h"
10
+ #include "mempool.h"
11
+ #include "similarity.h"
12
+ #include "bitvector.h"
13
+ #include "priorityqueue.h"
14
+
15
+ typedef struct IndexReader IndexReader;
16
+ typedef struct MultiReader MultiReader;
17
+ typedef struct Deleter Deleter;
18
+
19
+ /****************************************************************************
20
+ *
21
+ * Config
22
+ *
23
+ ****************************************************************************/
24
+ /* Config: plain-data bundle of index settings (eight int fields and one bool). A read-only default instance is declared right after the struct as default_config. Field semantics are decided by the code that reads them, which is not visible in this header. */
25
+ typedef struct Config
26
+ {
27
+ int chunk_size;
28
+ int max_buffer_memory;
29
+ int index_interval; /* same name as tiw_open()'s index_interval parameter -- presumably passed through; confirm in index.c */
30
+ int skip_interval; /* same name as tiw_open()'s skip_interval parameter -- presumably passed through; confirm in index.c */
31
+ int merge_factor;
32
+ int max_buffered_docs;
33
+ int max_merge_docs;
34
+ int max_field_length;
35
+ bool use_compound_file; /* SegmentInfo carries a flag with the same name */
36
+ } Config;
37
+
38
+ extern const Config default_config; /* declaration only; the definition lives outside this header */
39
+
40
+ /***************************************************************************
41
+ *
42
+ * CacheObject
43
+ *
44
+ ***************************************************************************/
45
+
46
+ typedef struct CacheObject {
47
+ HashTable *ref_tab1;
48
+ HashTable *ref_tab2;
49
+ void *ref1;
50
+ void *ref2;
51
+ void *obj;
52
+ void (*destroy)(void *p);
53
+ } CacheObject;
54
+
55
+ extern void cache_destroy(CacheObject *co);
56
+ extern CacheObject *co_create(HashTable *ref_tab1, HashTable *ref_tab2,
57
+ void *ref1, void *ref2, void (*destroy)(void *p), void *obj);
58
+ extern HashTable *co_hash_create();
59
+
60
+ /****************************************************************************
61
+ *
62
+ * FieldInfo
63
+ *
64
+ ****************************************************************************/
65
+ /* StoreValues: type of the `store` argument of fi_new(). Three values: 0, 1 and 2. NOTE(review): the mapping onto the FI_IS_STORED_BM / FI_IS_COMPRESSED_BM bits happens inside fi_new(), which is not visible here. */
66
+ enum StoreValues
67
+ {
68
+ STORE_NO = 0,
69
+ STORE_YES = 1,
70
+ STORE_COMPRESS = 2
71
+ };
72
+ /* IndexValues: type of the `index` argument of fi_new(). The values form a bit pattern: bit 0 (1) is set in every value except INDEX_NO, the INDEX_YES variants add bit 1 (2), and the *_OMIT_NORMS variants add bit 2 (4). NOTE(review): how these translate to FI_*_BM flags is decided in fi_new(), not visible here. */
73
+ enum IndexValues
74
+ {
75
+ INDEX_NO = 0,
76
+ INDEX_UNTOKENIZED = 1,
77
+ INDEX_YES = 3, /* 1 | 2 */
78
+ INDEX_UNTOKENIZED_OMIT_NORMS = 5, /* 1 | 4 */
79
+ INDEX_YES_OMIT_NORMS = 7 /* 1 | 2 | 4 */
80
+ };
81
+ /* TermVectorValues: type of the `term_vector` argument of fi_new(). The values form a bit pattern: bit 0 (1) is set in every value except TERM_VECTOR_NO, the positions variants add bit 1 (2), and the offsets variants add bit 2 (4). NOTE(review): how these translate to FI_STORE_*_BM flags is decided in fi_new(), not visible here. */
82
+ enum TermVectorValues
83
+ {
84
+ TERM_VECTOR_NO = 0,
85
+ TERM_VECTOR_YES = 1,
86
+ TERM_VECTOR_WITH_POSITIONS = 3, /* 1 | 2 */
87
+ TERM_VECTOR_WITH_OFFSETS = 5, /* 1 | 4 */
88
+ TERM_VECTOR_WITH_POSITIONS_OFFSETS = 7 /* 1 | 2 | 4 */
89
+ };
90
+ /* Single-bit masks for FieldInfo.bits. Each mask is a distinct bit (0x001 through 0x080); the fi_*() predicate macros further down test them one at a time. */
91
+ #define FI_IS_STORED_BM 0x001
92
+ #define FI_IS_COMPRESSED_BM 0x002
93
+ #define FI_IS_INDEXED_BM 0x004
94
+ #define FI_IS_TOKENIZED_BM 0x008
95
+ #define FI_OMIT_NORMS_BM 0x010
96
+ #define FI_STORE_TERM_VECTOR_BM 0x020
97
+ #define FI_STORE_POSITIONS_BM 0x040
98
+ #define FI_STORE_OFFSETS_BM 0x080
99
+ /* FieldInfo: descriptor for one named field. `bits` is the flag word tested by the fi_*() macros against the FI_*_BM masks. Instances are created with fi_new() and released with fi_deref(). */
100
+ typedef struct FieldInfo
101
+ {
102
+ char *name;
103
+ float boost;
104
+ unsigned int bits; /* combination of FI_*_BM masks */
105
+ int number; /* NOTE(review): presumably the field's index within FieldInfos.fields -- confirm in index.c */
106
+ int ref_cnt; /* NOTE(review): presumably the count that fi_deref() decrements -- confirm in index.c */
107
+ } FieldInfo;
108
+
109
+ extern FieldInfo *fi_new(const char *name,
110
+ enum StoreValues store,
111
+ enum IndexValues index,
112
+ enum TermVectorValues term_vector);
113
+ extern char *fi_to_s(FieldInfo *fi);
114
+ extern void fi_deref(FieldInfo *fi);
115
+ /* Flag predicates on a FieldInfo pointer. Each one masks fi->bits with a single FI_*_BM bit and yields 1 when the bit is set, 0 otherwise. `fi` is expanded once per macro. */
116
+ #define fi_is_stored(fi) (((fi)->bits & FI_IS_STORED_BM) != 0)
117
+ #define fi_is_compressed(fi) (((fi)->bits & FI_IS_COMPRESSED_BM) != 0)
118
+ #define fi_is_indexed(fi) (((fi)->bits & FI_IS_INDEXED_BM) != 0)
119
+ #define fi_is_tokenized(fi) (((fi)->bits & FI_IS_TOKENIZED_BM) != 0)
120
+ #define fi_omit_norms(fi) (((fi)->bits & FI_OMIT_NORMS_BM) != 0)
121
+ #define fi_store_term_vector(fi) (((fi)->bits & FI_STORE_TERM_VECTOR_BM) != 0)
122
+ #define fi_store_positions(fi) (((fi)->bits & FI_STORE_POSITIONS_BM) != 0)
123
+ #define fi_store_offsets(fi) (((fi)->bits & FI_STORE_OFFSETS_BM) != 0)
124
+ #define fi_has_norms(fi)\
125
+ (((fi)->bits & (FI_OMIT_NORMS_BM|FI_IS_INDEXED_BM)) == FI_IS_INDEXED_BM) /* true only when the indexed bit is set AND the omit-norms bit is clear */
126
+
127
+ /****************************************************************************
128
+ *
129
+ * FieldInfos
130
+ *
131
+ ****************************************************************************/
132
+
133
+ #define FIELD_INFOS_INIT_CAPA 4
134
+ /* carry changes over to dummy_fis in test/test_segments.c */
135
+ typedef struct FieldInfos
136
+ {
137
+ int store;
138
+ int index;
139
+ int term_vector;
140
+ int size;
141
+ int capa;
142
+ FieldInfo **fields;
143
+ HashTable *field_dict;
144
+ int ref_cnt;
145
+ } FieldInfos;
146
+
147
+ extern FieldInfos *fis_new(int store, int index, int term_vector);
148
+ extern FieldInfo *fis_add_field(FieldInfos *fis, FieldInfo *fi);
149
+ extern FieldInfo *fis_get_field(FieldInfos *fis, const char *name);
150
+ extern int fis_get_field_num(FieldInfos *fis, const char *name);
151
+ extern FieldInfo *fis_get_or_add_field(FieldInfos *fis, const char *name);
152
+ extern void fis_write(FieldInfos *fis, OutStream *os);
153
+ extern FieldInfos *fis_read(InStream *is);
154
+ extern char *fis_to_s(FieldInfos *fis);
155
+ extern void fis_deref(FieldInfos *fis);
156
+
157
+ /****************************************************************************
158
+ *
159
+ * SegmentInfo
160
+ *
161
+ ****************************************************************************/
162
+
163
+ #define SEGMENT_NAME_MAX_LENGTH 100
164
+ #define SEGMENTS_FILE_NAME "segments"
165
+
166
+ typedef struct SegmentInfo
167
+ {
168
+ int ref_cnt;
169
+ char *name;
170
+ Store *store;
171
+ int doc_cnt;
172
+ int del_gen;
173
+ int *norm_gens;
174
+ int norm_gens_size;
175
+ bool use_compound_file;
176
+ } SegmentInfo;
177
+
178
+ extern SegmentInfo *si_new(char *name, int doc_cnt, Store *store);
179
+ extern void si_deref(SegmentInfo *si);
180
+ extern bool si_has_deletions(SegmentInfo *si);
181
+ extern bool si_uses_compound_file(SegmentInfo *si);
182
+ extern bool si_has_separate_norms(SegmentInfo *si);
183
+ extern void si_advance_norm_gen(SegmentInfo *si, int field_num);
184
+
185
+ /****************************************************************************
186
+ *
187
+ * SegmentInfos
188
+ *
189
+ ****************************************************************************/
190
+
191
+ typedef struct SegmentInfos
192
+ {
193
+ FieldInfos *fis;
194
+ f_u64 counter;
195
+ f_u64 version;
196
+ f_i64 generation;
197
+ f_i32 format;
198
+ Store *store;
199
+ SegmentInfo **segs;
200
+ int size;
201
+ int capa;
202
+ } SegmentInfos;
203
+
204
+ extern char *fn_for_generation(char *buf, char *base, char *ext, f_i64 gen);
205
+
206
+ extern SegmentInfos *sis_new(FieldInfos *fis);
207
+ extern SegmentInfo *sis_new_segment(SegmentInfos *sis, int dcnt, Store *store);
208
+ extern SegmentInfo *sis_add_si(SegmentInfos *sis, SegmentInfo *si);
209
+ extern void sis_del_at(SegmentInfos *sis, int at);
210
+ extern void sis_del_from_to(SegmentInfos *sis, int from, int to);
211
+ extern void sis_clear(SegmentInfos *sis);
212
+ extern SegmentInfos *sis_read(Store *store);
213
+ extern void sis_write(SegmentInfos *sis, Store *store, Deleter *deleter);
214
+ extern f_u64 sis_read_current_version(Store *store);
215
+ extern void sis_destroy(SegmentInfos *sis);
216
+ extern f_i64 sis_current_segment_generation(Store *store);
217
+ extern char *sis_curr_seg_file_name(char *buf, Store *store);
218
+ extern void sis_put(SegmentInfos *sis, FILE *stream);
219
+
220
+ /****************************************************************************
221
+ *
222
+ * TermInfo
223
+ *
224
+ ****************************************************************************/
225
+ /* TermInfo: one int count plus three off_t values for a term. Filled in one go by the ti_set() macro; stored by value in TermEnum.curr_ti and TermWriter.last_term_info. */
226
+ typedef struct TermInfo
227
+ {
228
+ int doc_freq;
229
+ off_t frq_ptr; /* NOTE(review): presumably an offset into the stream SegmentTermDocEnum calls frq_in -- confirm */
230
+ off_t prx_ptr; /* NOTE(review): presumably an offset into the stream SegmentTermDocEnum calls prx_in -- confirm */
231
+ off_t skip_offset;
232
+ } TermInfo;
233
+ /* ti_set(ti, mdf, mfp, mpp, mso): assign all four TermInfo fields (doc_freq, frq_ptr, prx_ptr, skip_offset, in that order). `ti` must be a TermInfo object, not a pointer (it is accessed with '.'), and it is expanded four times, so pass an expression without side effects. The four value arguments are pasted unparenthesized on the right-hand side of an assignment. The do { } while (0) wrapper makes the expansion a single statement; the caller supplies the trailing ';'. */
234
+ #define ti_set(ti, mdf, mfp, mpp, mso) do {\
235
+ (ti).doc_freq = mdf;\
236
+ (ti).frq_ptr = mfp;\
237
+ (ti).prx_ptr = mpp;\
238
+ (ti).skip_offset = mso;\
239
+ } while (0)
240
+
241
+ /****************************************************************************
242
+ *
243
+ * TermEnum
244
+ *
245
+ ****************************************************************************/
246
+ /* TermEnum: term-enumerator interface. Holds cursor state (two fixed-size MAX_WORD_SIZE term buffers, the current TermInfo, the current term length and a field number) followed by five function pointers, each taking the enumerator itself as first argument. SegmentTermEnum embeds a TermEnum as its first member. */
247
+ typedef struct TermEnum TermEnum;
248
+
249
+ struct TermEnum
250
+ {
251
+ char curr_term[MAX_WORD_SIZE];
252
+ char prev_term[MAX_WORD_SIZE];
253
+ TermInfo curr_ti;
254
+ int curr_term_len;
255
+ int field_num;
256
+ TermEnum *(*set_field)(TermEnum *te, int field_num);
257
+ char *(*next)(TermEnum *te);
258
+ char *(*skip_to)(TermEnum *te, const char *term);
259
+ void (*close)(TermEnum *te);
260
+ TermEnum *(*clone)(TermEnum *te);
261
+ };
262
+
263
+ char *te_get_term(struct TermEnum *te);
264
+ TermInfo *te_get_ti(struct TermEnum *te);
265
+
266
+ /****************************************************************************
267
+ *
268
+ * SegmentTermEnum
269
+ *
270
+ ****************************************************************************/
271
+
272
+ /* * SegmentTermIndex * */
273
+
274
+ typedef struct SegmentTermIndex
275
+ {
276
+ off_t index_ptr;
277
+ off_t ptr;
278
+ int index_size;
279
+ int size;
280
+ char **index_terms;
281
+ int *index_term_lens;
282
+ TermInfo *index_term_infos;
283
+ off_t *index_ptrs;
284
+ } SegmentTermIndex;
285
+
286
+ /* * SegmentFieldIndex * */
287
+
288
+ typedef struct SegmentTermEnum SegmentTermEnum;
289
+
290
+ typedef struct SegmentFieldIndex
291
+ {
292
+ mutex_t mutex;
293
+ int skip_interval;
294
+ int index_interval;
295
+ off_t index_ptr;
296
+ TermEnum *index_te;
297
+ HashTable *field_dict;
298
+ } SegmentFieldIndex;
299
+
300
+ extern SegmentFieldIndex *sfi_open(Store *store, const char *segment);
301
+ extern void sfi_close(SegmentFieldIndex *sfi);
302
+
303
+
304
+ /* * SegmentTermEnum * */
305
+ struct SegmentTermEnum
306
+ {
307
+ TermEnum te;
308
+ InStream *is;
309
+ int size;
310
+ int pos;
311
+ int skip_interval;
312
+ SegmentFieldIndex *sfi;
313
+ };
314
+
315
+ extern void ste_close(TermEnum *te);
316
+ extern TermEnum *ste_clone(TermEnum *te);
317
+ extern TermEnum *ste_new(InStream *is, SegmentFieldIndex *sfi);
318
+
319
+ /* * MultiTermEnum * */
320
+
321
+ extern TermEnum *mte_new(MultiReader *mr, int field_num, const char *term);
322
+
323
+ /****************************************************************************
324
+ *
325
+ * TermInfosReader
326
+ *
327
+ ****************************************************************************/
328
+
329
+ #define TE_BUCKET_INIT_CAPA 1
330
+
331
+ typedef struct TermInfosReader
332
+ {
333
+ thread_key_t thread_te;
334
+ void **te_bucket;
335
+ TermEnum *orig_te;
336
+ int field_num;
337
+ } TermInfosReader;
338
+
339
+ extern TermInfosReader *tir_open(Store *store,
340
+ SegmentFieldIndex *sfi,
341
+ const char *segment);
342
+ extern TermInfosReader *tir_set_field(TermInfosReader *tir, int field_num);
343
+ extern TermInfo *tir_get_ti(TermInfosReader *tir, const char *term);
344
+ extern char *tir_get_term(TermInfosReader *tir, int pos);
345
+ extern void tir_close(TermInfosReader *tir);
346
+
347
+ /****************************************************************************
348
+ *
349
+ * TermInfosWriter
350
+ *
351
+ ****************************************************************************/
352
+
353
+ #define INDEX_INTERVAL 128
354
+ #define SKIP_INTERVAL 16
355
+
356
+ typedef struct TermWriter
357
+ {
358
+ int counter;
359
+ const char *last_term;
360
+ TermInfo last_term_info;
361
+ OutStream *os;
362
+ } TermWriter;
363
+
364
+ typedef struct TermInfosWriter
365
+ {
366
+ int field_count;
367
+ int index_interval;
368
+ int skip_interval;
369
+ off_t last_index_ptr;
370
+ OutStream *tfx_out;
371
+ TermWriter *tix_writer;
372
+ TermWriter *tis_writer;
373
+ } TermInfosWriter;
374
+
375
+ extern TermInfosWriter *tiw_open(Store *store,
376
+ const char *segment,
377
+ int index_interval,
378
+ int skip_interval);
379
+ extern void tiw_start_field(TermInfosWriter *tiw, int field_num);
380
+ extern void tiw_add(TermInfosWriter *tiw,
381
+ const char *term,
382
+ int t_len,
383
+ TermInfo *ti);
384
+ extern void tiw_close(TermInfosWriter *tiw);
385
+
386
+ /****************************************************************************
387
+ *
388
+ * TermDocEnum
389
+ *
390
+ ****************************************************************************/
391
+
392
+ typedef struct TermDocEnum TermDocEnum;
393
+ struct TermDocEnum
394
+ {
395
+ void (*seek)(TermDocEnum *tde, int field_num, const char *term);
396
+ void (*seek_te)(TermDocEnum *tde, TermEnum *te);
397
+ void (*seek_ti)(TermDocEnum *tde, TermInfo *ti);
398
+ int (*doc_num)(TermDocEnum *tde);
399
+ int (*freq)(TermDocEnum *tde);
400
+ bool (*next)(TermDocEnum *tde);
401
+ int (*read)(TermDocEnum *tde, int *docs, int *freqs, int req_num);
402
+ bool (*skip_to)(TermDocEnum *tde, int target);
403
+ int (*next_position)(TermDocEnum *tde);
404
+ void (*close)(TermDocEnum *tde);
405
+ };
406
+
407
+ /* * SegmentTermDocEnum * */
408
+ /* SegmentTermDocEnum: state for a TermDocEnum implementation. The TermDocEnum function table is embedded as the first member (tde), followed by two prox hooks, the input streams and cursor fields. stde_new() and stpe_new() are both declared to return TermDocEnum *. */
409
+ typedef struct SegmentTermDocEnum SegmentTermDocEnum;
410
+ struct SegmentTermDocEnum
411
+ {
412
+ TermDocEnum tde;
413
+ void (*seek_prox)(SegmentTermDocEnum *stde, off_t prx_ptr);
414
+ void (*skip_prox)(SegmentTermDocEnum *stde);
415
+ TermInfosReader *tir;
416
+ InStream *frq_in;
417
+ InStream *prx_in; /* only stpe_new() takes a prx_in argument; stde_new() does not */
418
+ InStream *skip_in;
419
+ BitVector *deleted_docs;
420
+ int count; /* number of docs for this term skipped */
421
+ int doc_freq; /* number of docs this term appears in */
422
+ int doc_num;
423
+ int freq;
424
+ int num_skips;
425
+ int skip_interval;
426
+ int skip_count;
427
+ int skip_doc;
428
+ int prx_cnt;
429
+ int position;
430
+ off_t frq_ptr;
431
+ off_t prx_ptr;
432
+ off_t skip_ptr;
433
+ bool have_skipped : 1; /* one-bit flag */
434
+ };
435
+
436
+ extern TermDocEnum *stde_new(TermInfosReader *tir, InStream *frq_in,
437
+ BitVector *deleted_docs, int skip_interval);
438
+
439
+ /* * SegmentTermDocEnum * */
440
+ extern TermDocEnum *stpe_new(TermInfosReader *tir, InStream *frq_in,
441
+ InStream *prx_in, BitVector *deleted_docs,
442
+ int skip_interval);
443
+
444
+ /****************************************************************************
445
+ * MultipleTermDocPosEnum
446
+ ****************************************************************************/
447
+
448
+ extern TermDocEnum *mtdpe_new(IndexReader *ir, int field_num, char **terms,
449
+ int t_cnt);
450
+
451
+ /****************************************************************************
452
+ *
453
+ * Offset
454
+ *
455
+ ****************************************************************************/
456
+
457
+ typedef struct Offset
458
+ {
459
+ off_t start;
460
+ off_t end;
461
+ } Offset;
462
+
463
+ extern Offset *offset_new(off_t start, off_t end);
464
+
465
+ /****************************************************************************
466
+ *
467
+ * Occurence
468
+ *
469
+ ****************************************************************************/
470
+ /* Occurence: node of a singly linked list (via `next`) carrying one int position. Posting.first_occ and PostingList.last_occ point at nodes of this type. The spelling "Occurence" is the identifier used throughout this header. */
471
+ typedef struct Occurence
472
+ {
473
+ struct Occurence *next;
474
+ int pos;
475
+ } Occurence;
476
+
477
+ /****************************************************************************
478
+ *
479
+ * Posting
480
+ *
481
+ ****************************************************************************/
482
+ /* Posting: per-document entry holding a frequency, a document number and the head of an Occurence list; entries chain to each other through `next`. Created by p_new(), which takes a MemoryPool. */
483
+ typedef struct Posting
484
+ {
485
+ int freq;
486
+ int doc_num;
487
+ Occurence *first_occ; /* head of this posting's Occurence list */
488
+ struct Posting *next;
489
+ } Posting;
490
+
491
+ extern Posting *p_new(MemoryPool *mp, int doc_num, int pos);
492
+
493
+ /****************************************************************************
494
+ *
495
+ * PostingList
496
+ *
497
+ ****************************************************************************/
498
+ /* PostingList: a term (pointer plus explicit length) with pointers to the first and last Posting and to the last Occurence. Created by pl_new() and extended with pl_add_occ(); both take a MemoryPool. */
499
+ typedef struct PostingList
500
+ {
501
+ const char *term;
502
+ int term_len;
503
+ Posting *first;
504
+ Posting *last;
505
+ Occurence *last_occ;
506
+ } PostingList;
507
+
508
+ extern PostingList *pl_new(MemoryPool *mp, const char *term,
509
+ int term_len, Posting *p);
510
+ extern void pl_add_occ(MemoryPool *mp, PostingList *pl, int pos);
511
+
512
+ /****************************************************************************
513
+ *
514
+ * TVField
515
+ *
516
+ ****************************************************************************/
517
+
518
+ typedef struct TVField
519
+ {
520
+ int field_num;
521
+ int size;
522
+ } TVField;
523
+
524
+ /****************************************************************************
525
+ *
526
+ * TVTerm
527
+ *
528
+ ****************************************************************************/
529
+
530
+ typedef struct TVTerm
531
+ {
532
+ char *text;
533
+ int freq;
534
+ int *positions;
535
+ } TVTerm;
536
+
537
+ /****************************************************************************
538
+ *
539
+ * TermVector
540
+ *
541
+ ****************************************************************************/
542
+
543
+ typedef struct TermVector
544
+ {
545
+ int field_num;
546
+ char *field;
547
+ int term_cnt;
548
+ TVTerm *terms;
549
+ int offset_cnt;
550
+ Offset *offsets;
551
+ } TermVector;
552
+
553
+ extern void tv_destroy(TermVector *tv);
554
+ extern int tv_get_tv_term_index(TermVector *tv, const char *term);
555
+ extern TVTerm *tv_get_tv_term(TermVector *tv, const char *term);
556
+
557
+ /****************************************************************************
558
+ *
559
+ * TermVectorsWriter
560
+ *
561
+ ****************************************************************************/
562
+
563
+ #define TV_FIELD_INIT_CAPA 8
564
+
565
+ typedef struct TermVectorsWriter
566
+ {
567
+ OutStream *tvx_out;
568
+ OutStream *tvd_out;
569
+ FieldInfos *fis;
570
+ TVField *fields;
571
+ off_t tvd_ptr;
572
+ } TermVectorsWriter;
573
+
574
+ extern TermVectorsWriter *tvw_open(Store *store,
575
+ const char *segment,
576
+ FieldInfos *fis);
577
+ extern void tvw_open_doc(TermVectorsWriter *tvw);
578
+ extern void tvw_close_doc(TermVectorsWriter *tvw);
579
+ extern void tvw_add_postings(TermVectorsWriter *tvw,
580
+ int field_num,
581
+ PostingList **plists,
582
+ int posting_count,
583
+ Offset *offsets,
584
+ int offset_count);
585
+ extern void tvw_close(TermVectorsWriter *tvw);
586
+
587
+ /****************************************************************************
588
+ *
589
+ * TermVectorsReader
590
+ *
591
+ ****************************************************************************/
592
+
593
+ typedef struct TermVectorsReader
594
+ {
595
+ int size;
596
+ InStream *tvx_in;
597
+ InStream *tvd_in;
598
+ FieldInfos *fis;
599
+ } TermVectorsReader;
600
+
601
+ extern TermVectorsReader *tvr_open(Store *store,
602
+ const char *segment,
603
+ FieldInfos *fis);
604
+ extern TermVectorsReader *tvr_clone(TermVectorsReader *orig);
605
+ extern void tvr_close(TermVectorsReader *tvr);
606
+ extern HashTable *tvr_get_tv(TermVectorsReader *tvr, int doc_num);
607
+ extern TermVector *tvr_get_field_tv(TermVectorsReader *tvr,
608
+ int doc_num,
609
+ int field_num);
610
+
611
+ /****************************************************************************
612
+ *
613
+ * LazyDoc
614
+ *
615
+ ****************************************************************************/
616
+
617
+ /* * * LazyDocField * * */
618
+ typedef struct LazyDocFieldData
619
+ {
620
+ off_t start;
621
+ int length;
622
+ char *text;
623
+ } LazyDocFieldData;
624
+
625
+ typedef struct LazyDoc LazyDoc;
626
+ typedef struct LazyDocField
627
+ {
628
+ char *name;
629
+ int size; /* number of data elements */
630
+ LazyDocFieldData *data;
631
+ int len; /* length of data elements concatenated */
632
+ LazyDoc *doc;
633
+ } LazyDocField;
634
+
635
+ extern char *lazy_df_get_data(LazyDocField *self, int i);
636
+ extern void lazy_df_get_bytes(LazyDocField *self, char *buf,
637
+ int start, int len);
638
+
639
+ /* * * LazyDoc * * */
640
+ struct LazyDoc
641
+ {
642
+ HashTable *field_dict;
643
+ int size;
644
+ LazyDocField **fields;
645
+ InStream *fields_in;
646
+ };
647
+
648
+ extern void lazy_doc_close(LazyDoc *self);
649
+
650
+ /****************************************************************************
651
+ *
652
+ * FieldsReader
653
+ *
654
+ ****************************************************************************/
655
+
656
+ typedef struct FieldsReader
657
+ {
658
+ int size;
659
+ FieldInfos *fis;
660
+ Store *store;
661
+ InStream *fdx_in;
662
+ InStream *fdt_in;
663
+ } FieldsReader;
664
+
665
+ extern FieldsReader *fr_open(Store *store,
666
+ const char *segment, FieldInfos *fis);
667
+ extern FieldsReader *fr_clone(FieldsReader *orig);
668
+ extern void fr_close(FieldsReader *fr);
669
+ extern Document *fr_get_doc(FieldsReader *fr, int doc_num);
670
+ extern LazyDoc *fr_get_lazy_doc(FieldsReader *fr, int doc_num);
671
+ extern HashTable *fr_get_tv(FieldsReader *fr, int doc_num);
672
+ extern TermVector *fr_get_field_tv(FieldsReader *fr, int doc_num,
673
+ int field_num);
674
+
675
+ /****************************************************************************
676
+ *
677
+ * FieldsWriter
678
+ *
679
+ ****************************************************************************/
680
+
681
+ typedef struct FieldsWriter
682
+ {
683
+ FieldInfos *fis;
684
+ OutStream *fdt_out;
685
+ OutStream *fdx_out;
686
+ TVField *tv_fields;
687
+ off_t start_ptr;
688
+ } FieldsWriter;
689
+
690
+ extern FieldsWriter *fw_open(Store *store,
691
+ const char *segment, FieldInfos *fis);
692
+ extern void fw_close(FieldsWriter *fw);
693
+ extern void fw_add_doc(FieldsWriter *fw, Document *doc);
694
+ extern void fw_add_postings(FieldsWriter *fw,
695
+ int field_num,
696
+ PostingList **plists,
697
+ int posting_count,
698
+ Offset *offsets,
699
+ int offset_count);
700
+ extern void fw_write_tv_index(FieldsWriter *fw);
701
+
702
+ /****************************************************************************
703
+ *
704
+ * Deleter
705
+ *
706
+ * A utility class (used by both IndexReader and IndexWriter) to keep track of
707
+ * files that need to be deleted because they are no longer referenced by the
708
+ * index.
709
+ *
710
+ ****************************************************************************/
711
+
712
+ struct Deleter
713
+ {
714
+ Store *store;
715
+ SegmentInfos *sis;
716
+ HashSet *pending;
717
+ };
718
+
719
+ extern Deleter *deleter_new(SegmentInfos *sis, Store *store);
720
+ extern void deleter_destroy(Deleter *dlr);
721
+ extern void deleter_clear_pending_files(Deleter *dlr);
722
+ extern void deleter_delete_file(Deleter *dlr, char *file_name);
723
+ extern void deleter_find_deletable_files(Deleter *dlr);
724
+ extern void deleter_commit_pending_files(Deleter *dlr);
725
+ extern void deleter_delete_files(Deleter *dlr, char **files, int file_cnt);
726
+
727
+ /****************************************************************************
728
+ *
729
+ * IndexReader
730
+ *
731
+ ****************************************************************************/
732
+
733
+ #define WRITE_LOCK_NAME "write"
734
+ #define COMMIT_LOCK_NAME "commit"
735
+ /* IndexReader: function-pointer table followed by reader state. Every function pointer takes the reader itself as first argument. The doc_freq, get_norms, get_norms_into and set_norm_i members identify a field by int field_num, whereas the ir_doc_freq(), ir_get_norms(), ir_get_norms_into() and ir_set_norm() functions declared after this struct take a const char *field name; ir_get_field_num() is declared alongside them. */
736
+ struct IndexReader
737
+ {
738
+ int (*num_docs)(IndexReader *ir);
739
+ int (*max_doc)(IndexReader *ir);
740
+ Document *(*get_doc)(IndexReader *ir, int doc_num);
741
+ LazyDoc *(*get_lazy_doc)(IndexReader *ir, int doc_num);
742
+ uchar *(*get_norms)(IndexReader *ir, int field_num);
743
+ uchar *(*get_norms_into)(IndexReader *ir, int field_num,
744
+ uchar *buf);
745
+ TermEnum *(*terms)(IndexReader *ir, int field_num);
746
+ TermEnum *(*terms_from)(IndexReader *ir, int field_num,
747
+ const char *term);
748
+ int (*doc_freq)(IndexReader *ir, int field_num,
749
+ const char *term);
750
+ TermDocEnum *(*term_docs)(IndexReader *ir);
751
+ TermDocEnum *(*term_positions)(IndexReader *ir);
752
+ TermVector *(*term_vector)(IndexReader *ir, int doc_num,
753
+ const char *field);
754
+ HashTable *(*term_vectors)(IndexReader *ir, int doc_num);
755
+ bool (*is_deleted)(IndexReader *ir, int doc_num);
756
+ bool (*has_deletions)(IndexReader *ir);
757
+ void (*acquire_write_lock)(IndexReader *ir);
758
+ void (*set_norm_i)(IndexReader *ir, int doc_num, int field_num,
759
+ uchar val);
760
+ void (*delete_doc_i)(IndexReader *ir, int doc_num);
761
+ void (*undelete_all_i)(IndexReader *ir);
762
+ void (*set_deleter_i)(IndexReader *ir, Deleter *dlr);
763
+ bool (*is_latest_i)(IndexReader *ir);
764
+ void (*commit_i)(IndexReader *ir);
765
+ void (*close_i)(IndexReader *ir);
766
+ int ref_cnt;
767
+ Deleter *deleter;
768
+ Store *store;
769
+ Lock *write_lock;
770
+ SegmentInfos *sis;
771
+ FieldInfos *fis;
772
+ HashTable *cache;
773
+ HashTable *sort_cache;
774
+ uchar *fake_norms;
775
+ mutex_t mutex;
776
+ bool has_changes : 1; /* the three flags below are one-bit bit-fields */
777
+ bool is_stale : 1;
778
+ bool is_owner : 1; /* ir_create() takes is_owner as a plain int */
779
+ };
780
+
781
+ extern IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner); /* is_owner is an int here but a one-bit bool in the struct */
782
+ extern IndexReader *ir_open(Store *store);
783
+ extern int ir_get_field_num(IndexReader *ir, const char *field); /* field name to field number; NOTE(review): result for an unknown field is not visible here */
784
+ extern bool ir_index_exists(Store *store);
785
+ extern void ir_close(IndexReader *ir);
786
+ extern void ir_commit(IndexReader *ir);
787
+ extern void ir_delete_doc(IndexReader *ir, int doc_num);
788
+ extern void ir_undelete_all(IndexReader *ir);
789
+ extern int ir_doc_freq(IndexReader *ir, const char *field, const char *term); /* takes the field by name; the struct's doc_freq entry takes a field number */
790
+ extern void ir_set_norm(IndexReader *ir, int doc_num, const char *field,
791
+ uchar val);
792
+ extern uchar *ir_get_norms_i(IndexReader *ir, int field_num); /* field-number variant of ir_get_norms */
793
+ extern uchar *ir_get_norms(IndexReader *ir, const char *field); /* NOTE(review): ownership of the returned buffer is not visible here */
794
+ extern uchar *ir_get_norms_into(IndexReader *ir, const char *field, uchar *buf);
795
+ extern void ir_destroy(IndexReader *self); /* NOTE(review): how this differs from ir_close is not visible from this header */
796
+ extern Document *ir_get_doc_with_term(IndexReader *ir, const char *field,
797
+ const char *term);
798
+ extern TermEnum *ir_terms(IndexReader *ir, const char *field);
799
+ extern TermEnum *ir_terms_from(IndexReader *ir, const char *field,
800
+ const char *t);
801
+ extern TermDocEnum *ir_term_docs_for(IndexReader *ir, const char *field,
802
+ const char *term);
803
+ extern TermDocEnum *ir_term_positions_for(IndexReader *ir, const char *fld,
804
+ const char *t);
805
+ extern void ir_add_cache(IndexReader *ir); /* presumably populates ir->cache -- confirm */
806
+ extern bool ir_is_latest(IndexReader *ir);
807
+
808
+ /****************************************************************************
809
+ * MultiReader
810
+ ****************************************************************************/
811
+
812
+ struct MultiReader { /* A reader built from several sub-readers (see mr_open). */
813
+ IndexReader ir; /* embedded by value as the first member; mr_open returns IndexReader *, so this is presumably the handle callers see -- confirm */
814
+ int max_doc;
815
+ int num_docs_cache; /* NOTE(review): the "not yet computed" value, if any, is not visible here */
816
+ int r_cnt; /* presumably the number of entries in sub_readers (cf. mr_open's r_cnt) -- confirm */
817
+ int *starts; /* presumably the first document number of each sub-reader -- confirm */
818
+ IndexReader **sub_readers;
819
+ HashTable *norms_cache;
820
+ bool has_deletions : 1; /* one-bit flag */
821
+ int **field_num_map; /* two-level table; presumably what mr_get_field_num looks up by (ir_num, f_num) -- confirm */
822
+ };
823
+
824
+ extern int mr_get_field_num(MultiReader *mr, int ir_num, int f_num); /* presumably translates field number f_num for sub-reader ir_num -- confirm direction in index.c */
825
+ extern IndexReader *mr_open(IndexReader **sub_readers, const int r_cnt); /* returns IndexReader *, not MultiReader *; NOTE(review): ownership of sub_readers is not visible here */
826
+
827
+
828
+ /****************************************************************************
829
+ *
830
+ * Boost
831
+ *
832
+ ****************************************************************************/
833
+
834
+ typedef struct Boost /* A (value, document number) pair with a link to another Boost, so instances can be chained into a list. */
835
+ {
836
+ float val;
837
+ int doc_num;
838
+ struct Boost *next; /* following node; NOTE(review): end-of-chain convention is not visible here */
839
+ } Boost;
840
+
841
+ /****************************************************************************
842
+ *
843
+ * FieldInverter
844
+ *
845
+ ****************************************************************************/
846
+
847
+ typedef struct FieldInverter /* Per-field state; presumably used while a document's fields are being indexed -- confirm in index.c */
848
+ {
849
+ HashTable *plists; /* NOTE(review): key and value types are not visible from this header */
850
+ uchar *norms;
851
+ FieldInfo *fi;
852
+ int length;
853
+ bool is_tokenized : 1; /* the four flags below are one-bit bitfields */
854
+ bool store_term_vector : 1;
855
+ bool store_offsets : 1;
856
+ bool has_norms : 1;
857
+ } FieldInverter;
858
+
859
+ /****************************************************************************
860
+ *
861
+ * DocWriter
862
+ *
863
+ ****************************************************************************/
864
+
865
+ #define DW_OFFSET_INIT_CAPA 512 /* presumably the initial value of DocWriter.offsets_capa -- confirm */
866
+ typedef struct IndexWriter IndexWriter; /* forward declaration: dw_open below takes an IndexWriter * before struct IndexWriter is defined */
867
+
868
+ typedef struct DocWriter /* State used by the dw_* functions to add documents to a segment. */
869
+ {
870
+ Store *store;
871
+ SegmentInfo *si; /* presumably the segment currently being written (cf. dw_new_segment) -- confirm */
872
+ FieldInfos *fis;
873
+ TermVectorsWriter *tvw;
874
+ FieldsWriter *fw;
875
+ MemoryPool *mp;
876
+ Analyzer *analyzer;
877
+ HashTable *curr_plists;
878
+ HashTable *fields;
879
+ Similarity *similarity;
880
+ Offset *offsets; /* presumably a growable array described by offsets_size/offsets_capa -- confirm */
881
+ int offsets_size;
882
+ int offsets_capa;
883
+ int doc_num;
884
+ int index_interval; /* this and the three settings below presumably come from the IndexWriter's Config -- confirm */
885
+ int skip_interval;
886
+ int max_field_length;
887
+ int max_buffered_docs;
888
+ } DocWriter;
889
+
890
+ extern DocWriter *dw_open(IndexWriter *is, SegmentInfo *si); /* NOTE(review): parameter name "is" looks like a typo for "iw"; harmless in a prototype */
891
+ extern void dw_close(DocWriter *dw);
892
+ extern void dw_add_doc(DocWriter *dw, Document *doc);
893
+ extern void dw_new_segment(DocWriter *dw, SegmentInfo *si); /* presumably switches dw to segment si -- confirm */
894
+
895
+ /****************************************************************************
896
+ *
897
+ * IndexWriter
898
+ *
899
+ ****************************************************************************/
900
+
901
+ typedef struct DelTerm /* A term identified by field number and text; presumably recorded by iw_delete_term -- confirm */
902
+ {
903
+ int field_num;
904
+ char *term; /* NOTE(review): ownership of this string is not visible from this header */
905
+ } DelTerm;
906
+
907
+ struct IndexWriter /* Writer handle operated on by the iw_* functions. */
908
+ {
909
+ Config config; /* held by value; iw_open takes a const Config *, so it is presumably copied -- confirm */
910
+ mutex_t mutex; /* NOTE(review): what this guards is not visible from this header */
911
+ Store *store;
912
+ Analyzer *analyzer;
913
+ SegmentInfos *sis;
914
+ FieldInfos *fis;
915
+ DocWriter *dw;
916
+ Similarity *similarity;
917
+ Lock *write_lock;
918
+ Deleter *deleter;
919
+ };
920
+
921
+ extern void index_create(Store *store, FieldInfos *fis);
921
+ extern bool index_is_locked(Store *store);
922
+ extern IndexWriter *iw_open(Store *store, volatile Analyzer *analyzer,
923
+ const Config *config); /* NOTE(review): the reason analyzer is volatile-qualified is not visible from this header */
924
+ extern void iw_delete_term(IndexWriter *iw, const char *field,
925
+ const char *term); /* takes the field by name, whereas DelTerm stores a field number */
926
+ extern void iw_close(IndexWriter *iw);
927
+ extern void iw_add_doc(IndexWriter *iw, Document *doc);
928
+ extern int iw_doc_count(IndexWriter *iw);
929
+ extern void iw_commit(IndexWriter *iw);
930
+ extern void iw_optimize(IndexWriter *iw);
931
+ extern void iw_add_readers(IndexWriter *iw, IndexReader **readers,
932
+ const int r_cnt); /* presumably readers holds r_cnt entries -- confirm */
934
+
935
+ /****************************************************************************
936
+ *
937
+ * CompoundWriter
938
+ *
939
+ ****************************************************************************/
940
+
941
+ #define CW_INIT_CAPA 16 /* presumably the initial capacity of CompoundWriter.file_entries -- confirm */
942
+ typedef struct CWFileEntry /* One file tracked by a CompoundWriter: its name plus two offsets. */
943
+ {
944
+ char *name;
945
+ off_t dir_offset; /* presumably the position of this file's directory entry in the compound file -- confirm */
946
+ off_t data_offset; /* presumably the position of this file's data in the compound file -- confirm */
947
+ } CWFileEntry;
948
+
949
+ typedef struct CompoundWriter { /* State used by open_cw / cw_add_file / cw_close. */
950
+ Store *store;
951
+ const char *name; /* const here, although open_cw takes a non-const char *name */
952
+ HashSet *ids; /* presumably the ids passed to cw_add_file -- confirm */
953
+ CWFileEntry *file_entries; /* NOTE(review): no count or capacity field is declared in this struct */
954
+ } CompoundWriter;
955
+
956
+ extern CompoundWriter *open_cw(Store *store, char *name); /* constructor; note it is named open_cw rather than cw_open, unlike the cw_* functions below */
957
+ extern void cw_add_file(CompoundWriter *cw, char *id);
958
+ extern void cw_close(CompoundWriter *cw); /* NOTE(review): whether this also frees cw is not visible from this header */
959
+
960
+
961
+ #endif