ferret 0.3.2 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (141) hide show
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/index_rw.c ADDED
@@ -0,0 +1,2543 @@
1
#include <index.h>

#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <array.h>
5
+
6
/* Every file extension that can belong to a segment's index files. */
const char *INDEX_EXTENSIONS[] = {
  "cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
  "tvx", "tvd", "tvf", "tvp"
};

/* Extensions of the files that get packed into a ".cfs" compound file. */
const char *COMPOUND_EXTENSIONS[] = {
  "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
};

/* Extensions of the term-vector files. */
const char *VECTOR_EXTENSIONS[] = {
  "tvx", "tvd", "tvf"
};

/* Global tuning defaults picked up by iw_open for each new IndexWriter. */
FerretConfig config = {
  10,      // default merge_factor
  10,      // default min_merge_docs
  INT_MAX, // default max_merge_docs
  10000,   // default max_field_length
  128      // default term_index_interval
};
26
+
27
+ /***************************************************************************
28
+ *
29
+ * CacheObject
30
+ *
31
+ ***************************************************************************/
32
+
33
/* Hash a CacheObject key by pointer identity.
 * The cast goes through uintptr_t so the pointer-to-integer conversion
 * is well defined even where pointers are wider than unsigned int
 * (the old direct cast truncated with a warning on LP64 platforms). */
unsigned int co_hash(const void *key)
{
  return (unsigned int)(uintptr_t)key;
}
37
+
38
/* Pointer-identity equality predicate for CacheObject keys. */
int co_eq(const void *key1, const void *key2)
{
  return key1 == key2;
}
42
+
43
+ void co_destroy(void *p)
44
+ {
45
+ CacheObject *co = (CacheObject *)p;
46
+ h_rem(co->ref_tab1, co->ref2, false);
47
+ h_rem(co->ref_tab2, co->ref1, false);
48
+ co->destroy(co->obj);
49
+ free(co);
50
+ }
51
+
52
+ CacheObject *co_create(HshTable *ref_tab1, HshTable *ref_tab2,
53
+ void *ref1, void *ref2, void (*destroy)(void *p), void *obj)
54
+ {
55
+ CacheObject *co = ALLOC(CacheObject);
56
+ h_set(ref_tab1, ref2, co);
57
+ h_set(ref_tab2, ref1, co);
58
+ co->ref_tab1 = ref_tab1;
59
+ co->ref_tab2 = ref_tab2;
60
+ co->ref1 = ref1;
61
+ co->ref2 = ref2;
62
+ co->destroy = destroy;
63
+ co->obj = obj;
64
+ return co;
65
+ }
66
+
67
+ HshTable *co_hsh_create()
68
+ {
69
+ return h_new(&co_hash, &co_eq, NULL, &co_destroy);
70
+ }
71
+
72
+ /***************************************************************************
73
+ *
74
+ * Posting
75
+ *
76
+ ***************************************************************************/
77
+
78
+ Posting *p_create(Term *term, int position, TVOffsetInfo *offset)
79
+ {
80
+ Posting *p = ALLOC(Posting);
81
+ p->freq = 1;
82
+ p->size = 1;
83
+ p->term = term;
84
+ p->positions = ALLOC(int);
85
+ p->positions[0] = position;
86
+ p->offsets = ALLOC(TVOffsetInfo *);
87
+ p->offsets[0] = offset;
88
+ return p;
89
+ }
90
+
91
+ void p_destroy(void *p)
92
+ {
93
+ // the positions and offsets will be put in a TVTerm so no need to free
94
+ int i;
95
+ Posting *post = (Posting *)p;
96
+ free(post->positions);
97
+ for (i = 0; i < post->freq; i++)
98
+ tvoi_destroy(post->offsets[i]);
99
+ free(post->offsets);
100
+ free(p);
101
+ }
102
+
103
+ void p_add_occurance(Posting *p, int position, TVOffsetInfo *offset)
104
+ {
105
+ if (p->freq >= p->size) {
106
+ p->size *= 2;
107
+ REALLOC_N(p->positions, int, p->size);
108
+ REALLOC_N(p->offsets, TVOffsetInfo *, p->size);
109
+ }
110
+ p->positions[p->freq] = position;
111
+ p->offsets[p->freq] = offset;
112
+ p->freq++;
113
+ }
114
+
115
+ inline int p_cmp(const void *const p1, const void *const p2)
116
+ {
117
+ Term *t1 = (*(Posting **)p1)->term;
118
+ Term *t2 = (*(Posting **)p2)->term;
119
+ int res = strcmp(t1->field, t2->field);
120
+ if (res != 0) {
121
+ return res;
122
+ } else {
123
+ return strcmp(t1->text, t2->text);
124
+ }
125
+ }
126
+
127
+ DocumentWriter *dw_open(Store *store,
128
+ Analyzer *analyzer,
129
+ Similarity *similarity,
130
+ int max_field_length,
131
+ int term_index_interval)
132
+ {
133
+ DocumentWriter *dw = ALLOC(DocumentWriter);
134
+ dw->store = store;
135
+ dw->analyzer = analyzer;
136
+ dw->similarity = similarity;
137
+ dw->fis = NULL;
138
+ dw->postingtable = h_new(&term_hash, &term_eq, &term_destroy, &p_destroy);
139
+ dw->max_field_length = max_field_length;
140
+ dw->term_index_interval = term_index_interval;
141
+ return dw;
142
+ }
143
+
144
+ void dw_close(DocumentWriter *dw)
145
+ {
146
+ if (dw->fis) fis_destroy(dw->fis);
147
+ h_destroy(dw->postingtable);
148
+ free(dw);
149
+ }
150
+
151
+ void dw_add_position(DocumentWriter *dw, char *field, char *text,
152
+ int position, TVOffsetInfo *offset)
153
+ {
154
+ Term termbuf = {field, text}, *term;
155
+ Posting *p = (Posting *)h_get(dw->postingtable, &termbuf);
156
+
157
+ if (p) { // word seen before
158
+ // double the size of posting to make room for more posts.
159
+ if (p->freq >= p->size) {
160
+ p->size <<= 1;
161
+ REALLOC_N(p->positions, int, p->size);
162
+ p->offsets = REALLOC_N(p->offsets, TVOffsetInfo *, p->size);
163
+ }
164
+ p->positions[p->freq] = position; // add new position
165
+ p->offsets[p->freq] = offset; // add new position
166
+ p->freq++; // update frequency
167
+ } else { // word not seen before
168
+ term = term_create(field, text);
169
+ h_set(dw->postingtable, term, p_create(term, position, offset));
170
+ }
171
+ }
172
+
173
/* Invert a document: for each indexed field, add its terms (with token
 * positions and, optionally, term-vector offsets) to the posting table.
 * Per-field running totals -- token count, character offset, token
 * position -- are read from and written back to the dw->field_* arrays
 * (indexed by field number) so repeated fields accumulate. */
void dw_invert_doc(DocumentWriter *dw, Document *doc)
{
  int i;
  int dfcnt = doc->dfcnt;
  char *field_name, *text;
  int field_number, length, position, offset, slen;
  TokenStream *stream;
  Token *token;
  FieldInfo *fi;

  DocField **fields = doc->df_arr, *field;
  for (i = 0; i < dfcnt; i++) {
    field = fields[i];
    field_name = field->name;
    fi = ((FieldInfo *)ht_get(dw->fis->by_name, field_name));
    field_number = fi->number;

    /* resume this field's running totals (non-zero when the document
     * contains the same field name more than once) */
    length = dw->field_lengths[field_number];
    offset = dw->field_offsets[field_number];
    position = dw->field_positions[field_number];

    if (fi->is_indexed) {
      if (!field->is_tokenized) { // un-tokenized field: the whole value is one term
        text = field->data;
        slen = strlen(text);
        if (fi->store_offset) {
          dw_add_position(dw, field_name, text, position,
            tvoi_create(offset, offset+slen));
        } else {
          dw_add_position(dw, field_name, text, position, NULL);
        }
        // NOTE(review): position is not advanced here, so a second
        // un-tokenized value of the same field lands on the same
        // position -- confirm this is intended
        offset += slen;
        length++;
      } else {

        // Tokenize field and add to posting_table
        stream = a_get_ts(dw->analyzer, field_name, field->data);

        while ((token = ts_next(stream)) != NULL) {
          // pos_inc - 1 here plus the ++ below gives a net advance of pos_inc
          position += (token->pos_inc - 1);

          if (fi->store_offset) {
            dw_add_position(dw,
              field_name,
              token->text,
              position,
              tvoi_create(offset + token->start, offset + token->end));
            position++;
          } else {
            dw_add_position(dw, field_name, token->text, position, NULL);
            position++;
          }

          length++;
          // stop if we reach the max field length
          if (length > dw->max_field_length)
            break;
        }

        // token is non-NULL here only when we bailed out on max_field_length
        if (token)
          offset += token->end + 1;
      }
      /* write the updated running totals back for this field */
      dw->field_lengths[field_number] = length;
      dw->field_offsets[field_number] = offset;
      dw->field_positions[field_number] = position;
      dw->field_boosts[field_number] *= field->boost;
    }
  }
}
242
+
243
+ Posting **dw_sort_posting_table(DocumentWriter *dw)
244
+ {
245
+ HshTable *ht = dw->postingtable;
246
+ int i;
247
+ dw->pcnt = i = ht->used;
248
+ Posting **postings = ALLOC_N(Posting *, i);
249
+ HshEntry *he = ht->table;
250
+ while (i > 0) {
251
+ if (he->value != NULL) {
252
+ i--;
253
+ postings[i] = (Posting *)he->value;
254
+ }
255
+ he++;
256
+ }
257
+ qsort(postings, dw->pcnt, sizeof(Posting *), &p_cmp);
258
+ return postings;
259
+ }
260
+
261
+ void dw_write_postings(DocumentWriter *dw, Posting **postings, char *segment)
262
+ {
263
+ OutStream *freq_out, *prox_out;
264
+ TermInfosWriter *tiw;
265
+ TermVectorsWriter *tvw = NULL;
266
+ FieldInfo *fi;
267
+ Store *store = dw->store;
268
+ TermInfo *ti;
269
+ Posting *posting;
270
+ int i, j, posting_freq, position, last_position;
271
+ char fname[SEGMENT_NAME_MAX_LENGTH], *curr_field = NULL, *term_field;
272
+ strcpy(fname, segment);
273
+
274
+ //open files for inverse index storage
275
+ sprintf(fname, "%s.frq", segment);
276
+ freq_out = store->create_output(store, fname);
277
+ sprintf(fname, "%s.prx", segment);
278
+ prox_out = store->create_output(store, fname);
279
+ tiw = tiw_open(store, segment, dw->fis, dw->term_index_interval);
280
+ ti = ti_create(0, 0, 0, 0);
281
+
282
+ for (i = 0; i < dw->pcnt; i++) {
283
+ posting = postings[i];
284
+
285
+ // add an entry to the dictionary with pointers to prox and freq_out files
286
+ ti_set(ti, 1, os_pos(freq_out), os_pos(prox_out), -1);
287
+ tiw_add(tiw, posting->term, ti);
288
+
289
+ // add an entry to the freq_out file
290
+ posting_freq = posting->freq;
291
+ if (posting_freq == 1) { // optimize freq=1
292
+ os_write_vint(freq_out, 1); // set low bit of doc num.
293
+ } else {
294
+ os_write_vint(freq_out, 0); // the doc number
295
+ os_write_vint(freq_out, posting_freq); // frequency in doc
296
+ }
297
+
298
+ last_position = 0; // write positions
299
+
300
+ for (j = 0; j < posting_freq; j++) {
301
+ position = posting->positions[j];
302
+ os_write_vint(prox_out, position - last_position);
303
+ last_position = position;
304
+ }
305
+
306
+ // check to see if we switched to a new field
307
+ term_field = posting->term->field;
308
+ if (curr_field != term_field) {
309
+ // changing field - see if there is something to save
310
+ curr_field = term_field;
311
+ fi = (FieldInfo *)ht_get(dw->fis->by_name, curr_field);
312
+ if (fi->store_tv) {
313
+ if (tvw == NULL) {
314
+ tvw = tvw_open(store, segment, dw->fis);
315
+ tvw_open_doc(tvw);
316
+ }
317
+ tvw_open_field(tvw, curr_field);
318
+
319
+ } else if (tvw != NULL) {
320
+ tvw_close_field(tvw);
321
+ }
322
+ }
323
+ // tvw->curr_field != NULL implies field is still open
324
+ if (tvw != NULL && tvw->curr_field != NULL) {
325
+ tvw_add_term(tvw, posting->term->text, posting_freq, posting->positions, posting->offsets);
326
+ }
327
+ }
328
+ if (tvw != NULL) {
329
+ tvw_close_doc(tvw);
330
+ tvw_close(tvw);
331
+ }
332
+ // make an effort to close all streams we can but remember and re-raise
333
+ // the last exception encountered in this process
334
+ os_close(freq_out);
335
+ os_close(prox_out);
336
+ tiw_close(tiw);
337
+ ti_destroy(ti);
338
+ }
339
+
340
+ void dw_write_norms(DocumentWriter *dw, char *segment)
341
+ {
342
+ int i;
343
+ float norm;
344
+ OutStream *norms_out;
345
+ char fname[SEGMENT_NAME_MAX_LENGTH];
346
+ FieldInfos *fis = dw->fis;
347
+ FieldInfo *fi;
348
+
349
+ for (i = 0; i < fis->fcnt; i++) {
350
+ fi = fis->by_number[i];
351
+
352
+ if (fi->is_indexed && !fi->omit_norms) {
353
+ norm = dw->field_boosts[i] * sim_length_norm(dw->similarity, fi->name, dw->field_lengths[i]);
354
+ sprintf(fname, "%s.f%d", segment, i);
355
+ norms_out = dw->store->create_output(dw->store, fname);
356
+ os_write_byte(norms_out, sim_encode_norm(dw->similarity, norm));
357
+ os_close(norms_out);
358
+ }
359
+ }
360
+ }
361
+
362
/* Add one complete document as segment `segment`:
 * writes the field infos (.fnm) and stored field values, inverts the
 * document into the posting table, then writes postings and norms.
 * The per-field working arrays are allocated here and freed at the end. */
void dw_add_doc(DocumentWriter *dw, char *segment, Document *doc)
{
  int i;
  // write field names
  dw->fis = fis_create();
  // NOTE(review): dw->fis is assigned without freeing a previous value;
  // fine if dw_add_doc runs once per writer -- confirm with callers
  fis_add_doc(dw->fis, doc);
  fis_write(dw->fis, dw->store, segment, ".fnm");

  // write field values
  FieldsWriter *fw = fw_open(dw->store, segment, dw->fis);
  fw_add_doc(fw, doc);
  fw_close(fw);

  // invert doc into posting_table
  h_clear(dw->postingtable); // clear posting_table

  dw->field_boosts = ALLOC_N(float, dw->fis->fcnt);
  dw->field_lengths = ALLOC_AND_ZERO_N(int, dw->fis->fcnt);
  dw->field_offsets = ALLOC_AND_ZERO_N(int, dw->fis->fcnt);
  dw->field_positions = ALLOC_AND_ZERO_N(int, dw->fis->fcnt);

  // every field's boost starts from the document-level boost
  for (i = 0; i < dw->fis->fcnt; i++)
    dw->field_boosts[i] = doc->boost;

  dw_invert_doc(dw, doc);

  // sort posting_table into an array
  Posting **postings = dw_sort_posting_table(dw);

  // write postings
  dw_write_postings(dw, postings, segment);
  free(postings);

  // write norms of indexed fields
  dw_write_norms(dw, segment);

  free(dw->field_boosts);
  free(dw->field_lengths);
  free(dw->field_offsets);
  free(dw->field_positions);
}
403
+
404
+ /****************************************************************************
405
+ *
406
+ * SegmentInfo
407
+ *
408
+ ****************************************************************************/
409
+
410
+ SegmentInfo *si_create(char *name, int doc_cnt, Store *store)
411
+ {
412
+ SegmentInfo *si = ALLOC(SegmentInfo);
413
+ si->name = name;
414
+ si->doc_cnt = doc_cnt;
415
+ si->store = store;
416
+ return si;
417
+ }
418
+
419
+ void si_destroy(void *p)
420
+ {
421
+ SegmentInfo *si = (SegmentInfo *)p;
422
+ free(si->name);
423
+ free(si);
424
+ }
425
+
426
+ bool si_has_deletions(SegmentInfo *si)
427
+ {
428
+ char del_file_name[SEGMENT_NAME_MAX_LENGTH];
429
+ sprintf(del_file_name, "%s.del", si->name);
430
+ return si->store->exists(si->store, del_file_name);
431
+ }
432
+
433
+ bool si_uses_compound_file(SegmentInfo *si)
434
+ {
435
+ char compound_file_name[SEGMENT_NAME_MAX_LENGTH];
436
+ sprintf(compound_file_name, "%s.cfs", si->name);
437
+ return si->store->exists(si->store, compound_file_name);
438
+ }
439
+
440
+ struct NormTester {
441
+ bool has_norm_file;
442
+ char *segment_name;
443
+ };
444
+ void is_norm_file(char *fname, void *arg)
445
+ {
446
+ struct NormTester *nt = (struct NormTester *)arg;
447
+ char norm_file_pattern[SEGMENT_NAME_MAX_LENGTH];
448
+ sprintf(norm_file_pattern, "%s.s", nt->segment_name);
449
+ if (strncmp(fname, norm_file_pattern, strlen(norm_file_pattern)) == 0) {
450
+ nt->has_norm_file = true;
451
+ }
452
+ }
453
+
454
+ bool si_has_separate_norms(SegmentInfo *si)
455
+ {
456
+ struct NormTester nt;
457
+ nt.segment_name = si->name;
458
+ nt.has_norm_file = false;
459
+ si->store->each(si->store, &is_norm_file, &nt);
460
+
461
+ return nt.has_norm_file;
462
+ }
463
+
464
+
465
+ /****************************************************************************
466
+ *
467
+ * SegmentInfos
468
+ *
469
+ ****************************************************************************/
470
+
471
#include <time.h>
/* segments-file format marker; negative => file carries explicit
 * format/version info (old files started directly with the counter) */
#define FORMAT -1
/* name of the index's segments file, and the temporary name used so a
 * rename can install updates atomically */
#define SEGMENT_FILENAME "segments"
#define TEMPORARY_SEGMENT_FILENAME "segments.new"
475
+
476
+ SegmentInfos *sis_create()
477
+ {
478
+ SegmentInfos *sis = ALLOC(SegmentInfos);
479
+ sis->format = FORMAT;
480
+ sis->version = (unsigned int)time(NULL);
481
+ sis->scnt = 0;
482
+ sis->counter = 0;
483
+ sis->size = 4;
484
+ sis->segs = ALLOC_N(SegmentInfo *, sis->size);
485
+ return sis;
486
+ }
487
+
488
+ void sis_destroy_not_infos(void *p)
489
+ {
490
+ SegmentInfos *sis = (SegmentInfos *)p;
491
+ free(sis->segs);
492
+ free(p);
493
+ }
494
+
495
+ void sis_destroy(void *p)
496
+ {
497
+ int i;
498
+ SegmentInfos *sis = (SegmentInfos *)p;
499
+ for (i = 0; i < sis->scnt; i++)
500
+ si_destroy(sis->segs[i]);
501
+ free(sis->segs);
502
+ free(p);
503
+ }
504
+
505
+ void sis_add_si(SegmentInfos *sis, SegmentInfo *si)
506
+ {
507
+ if (sis->scnt >= sis->size) {
508
+ sis->size = sis->scnt * 2;
509
+ REALLOC_N(sis->segs, SegmentInfo *, sis->size);
510
+ }
511
+ sis->segs[sis->scnt] = si;
512
+ sis->scnt++;
513
+ }
514
+
515
+ void sis_del_at(SegmentInfos *sis, int at)
516
+ {
517
+ int i;
518
+ si_destroy(sis->segs[at]);
519
+ sis->scnt--;
520
+ for (i = at; i < sis->scnt; i++)
521
+ sis->segs[i] = sis->segs[i+1];
522
+ }
523
+
524
+ void sis_del_from_to(SegmentInfos *sis, int from, int to)
525
+ {
526
+ int i, num_to_del = to - from;
527
+ sis->scnt -= num_to_del;
528
+ for (i = from; i < to; i++) {
529
+ si_destroy(sis->segs[i]);
530
+ }
531
+ for (i = from; i < sis->scnt; i++) {
532
+ sis->segs[i] = sis->segs[i+num_to_del];
533
+ }
534
+ }
535
+
536
+ void sis_clear(SegmentInfos *sis)
537
+ {
538
+ int i;
539
+ for (i = 0; i < sis->scnt; i++) {
540
+ si_destroy(sis->segs[i]);
541
+ }
542
+ sis->scnt = 0;
543
+ }
544
+
545
+ void sis_read(SegmentInfos *sis, Store *store)
546
+ {
547
+ int doc_cnt;
548
+ char *name;
549
+ InStream *is = store->open_input(store, SEGMENT_FILENAME);
550
+ sis->format = is_read_int(is);
551
+
552
+ if (sis->format < 0) { // file contains explicit format info
553
+ // check that it is a format we can understand
554
+ if (sis->format < FORMAT)
555
+ eprintf(ERROR, "Unknown format version: %ld", sis->format);
556
+ sis->version = is_read_long(is);
557
+ sis->counter = is_read_int(is);
558
+ } else { // file is in old format without explicit format info
559
+ sis->counter = sis->format;
560
+ }
561
+
562
+ int seg_count = is_read_int(is);
563
+ int i;
564
+ for (i = 0; i < seg_count; i++) {
565
+ name = is_read_string(is);
566
+ doc_cnt = is_read_int(is);
567
+ sis_add_si(sis, si_create(name, doc_cnt, store));
568
+ }
569
+
570
+ if (sis->format >= 0) {
571
+ // in old format the version number may be at the end of the file
572
+ if (is_pos(is) >= is_length(is))
573
+ sis->version = 0; // old file format without version number
574
+ else
575
+ sis->version = is_read_long(is); // read version
576
+ }
577
+ is_close(is);
578
+ }
579
+
580
+ void sis_write(SegmentInfos *sis, Store *store)
581
+ {
582
+ int i;
583
+ SegmentInfo *si;
584
+ OutStream *os = store->create_output(store, TEMPORARY_SEGMENT_FILENAME);
585
+ os_write_int(os, FORMAT);
586
+ os_write_long(os, ++(sis->version)); // every write changes the index
587
+ os_write_int(os, sis->counter);
588
+ os_write_int(os, sis->scnt);
589
+ for (i = 0; i < sis->scnt; i++) {
590
+ si = sis->segs[i];
591
+ os_write_string(os, si->name);
592
+ os_write_int(os, si->doc_cnt);
593
+ }
594
+
595
+ os_close(os);
596
+
597
+ //install new segment info
598
+ store->rename(store, TEMPORARY_SEGMENT_FILENAME, SEGMENT_FILENAME);
599
+ }
600
+
601
+ int sis_read_current_version(Store *store)
602
+ {
603
+ if (!store->exists(store, SEGMENT_FILENAME))
604
+ return 0;
605
+ InStream *is = store->open_input(store, SEGMENT_FILENAME);
606
+ int format = 0;
607
+ int version = 0;
608
+ format = is_read_int(is);
609
+ if (format < 0) {
610
+ if (format < FORMAT)
611
+ eprintf(ERROR, "Unknown format version: %ld", format);
612
+ version = is_read_long(is);
613
+ }
614
+ is_close(is);
615
+
616
+ if (format < 0)
617
+ return version;
618
+
619
+ // We cannot be sure about the format of the file.
620
+ // Therefore we have to read the whole file and cannot simply
621
+ // seek to the version entry.
622
+
623
+ SegmentInfos *sis = sis_create();
624
+ sis_read(sis, store);
625
+ version = sis->version;
626
+ sis_destroy(sis);
627
+ return version;
628
+ }
629
+
630
+ /****************************************************************************
631
+ *
632
+ * IndexWriter
633
+ *
634
+ ****************************************************************************/
635
+
636
/* Open an IndexWriter on `store`.
 * When `create` is true the store is wiped and a fresh, empty segments
 * file is committed; otherwise the existing segments file is read.
 * The store's write lock is acquired here and held until the writer is
 * closed.  close_store / close_analyzer record whether the writer owns
 * those objects and should close them on shutdown. */
IndexWriter *iw_open(Store *store, Analyzer *analyzer,
    bool create, bool close_store, bool close_analyzer)
{
  IndexWriter *iw = ALLOC(IndexWriter);
  if (create)
    store->clear_all(store);
  mutex_init(&iw->mutex, NULL);
  /* tuning parameters come from the global config defaults */
  iw->merge_factor = config.merge_factor;
  iw->min_merge_docs = config.min_merge_docs;
  iw->max_merge_docs = config.max_merge_docs;
  iw->max_field_length = config.max_field_length;
  iw->term_index_interval = config.term_index_interval;
  iw->use_compound_file = true;
  iw->store = store;
  iw->close_store = close_store;
  iw->close_analyzer = close_analyzer;
  iw->analyzer = analyzer;
  iw->sis = sis_create();
  iw->similarity = sim_create_default();
  /* new documents are buffered in a RAM store and merged out later */
  iw->ram_store = open_ram_store();

  mutex_lock(&store->mutex);
  // keep the write_lock obtained until the IndexWriter is closed.
  iw->write_lock = store->open_lock(store, WRITE_LOCK_NAME);
  if (!iw->write_lock->obtain(iw->write_lock)) {
    eprintf(STATE_ERROR,
      "Could not obtain write lock when trying to write index");
  }

  if (create) {
    Lock *commit_lock = store->open_lock(store, COMMIT_LOCK_NAME);
    if (!commit_lock->obtain(commit_lock)) {
      eprintf(STATE_ERROR,
        "Could not obtain commit lock when trying to commit index");
    }
    // commit the empty index under the commit lock
    store->clear(store);
    sis_write(iw->sis, store);
    commit_lock->release(commit_lock);
    store->close_lock(commit_lock);
  } else {
    sis_read(iw->sis, store);
  }
  mutex_unlock(&store->mutex);
  return iw;
}
683
+
684
+ const char base36_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz";
685
+
686
+ char *new_segment_name(int counter)
687
+ {
688
+ char buf[SEGMENT_NAME_MAX_LENGTH];
689
+ buf[SEGMENT_NAME_MAX_LENGTH - 1] = '\0';
690
+ int i;
691
+ for (i = SEGMENT_NAME_MAX_LENGTH - 2; ; i--) {
692
+ buf[i] = base36_digitmap[counter%36];
693
+ counter /= 36;
694
+ if (counter == 0) break;
695
+ }
696
+ i--;
697
+ buf[i] = '_';
698
+ return estrdup(&buf[i]);
699
+ }
700
+
701
+ int iw_doc_count(IndexWriter *iw)
702
+ {
703
+ int i, doc_cnt = 0;
704
+ mutex_lock(&iw->mutex);
705
+ for (i = 0; i < iw->sis->scnt; i++)
706
+ doc_cnt += iw->sis->segs[i]->doc_cnt;
707
+ mutex_unlock(&iw->mutex);
708
+ return doc_cnt;
709
+ }
710
+
711
+ void delete_files(Array *file_names, Store *store)
712
+ {
713
+ int i;
714
+ for (i = 0; i < file_names->size; i++) {
715
+ store->remove(store, (char *)file_names->elems[i]);
716
+ }
717
+ ary_destroy(file_names);
718
+ }
719
+
720
+
721
+ Array *sr_file_names(IndexReader *ir);
722
+ void iw_delete_segments(IndexWriter *iw, IndexReader **segment_readers, int del_cnt)
723
+ {
724
+ // The java version keeps a record of files that it couldn't delete. This
725
+ // shouldn't be a problem on linux I hope.
726
+ IndexReader *ir;
727
+ int i;
728
+ for (i = 0; i < del_cnt; i++) {
729
+ ir = segment_readers[i];
730
+ delete_files(sr_file_names(ir), ir->store);
731
+ }
732
+ }
733
+
734
/* Pack the merged segment's files into a single ".cfs" compound file.
 * The compound file is built under a temporary name and renamed into
 * place while holding the commit lock, so readers never observe a
 * half-written file; the now-redundant per-extension files are deleted
 * afterwards. */
void make_compound_file(IndexWriter *iw, char *merged_name, SegmentMerger *merger)
{
  char merged_tmp[SEGMENT_NAME_MAX_LENGTH], merged_cfs[SEGMENT_NAME_MAX_LENGTH];

  mutex_lock(&iw->store->mutex);
  sprintf(merged_tmp, "%s.tmp", merged_name);
  sprintf(merged_cfs, "%s.cfs", merged_name);

  Array *files_to_delete = sm_create_compound_file(merger, merged_tmp);
  Lock *commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);

  if (!commit_lock->obtain(commit_lock)) {
    eprintf(STATE_ERROR,
      "Could not obtain commit lock when trying to commit index");
  }

  // make compound file visible for SegmentReaders
  iw->store->rename(iw->store, merged_tmp, merged_cfs);
  // delete now unused files of segment
  delete_files(files_to_delete, iw->store);

  commit_lock->release(commit_lock);
  iw->store->close_lock(commit_lock);
  mutex_unlock(&iw->store->mutex);
}
759
+
760
/* Merge segments [min_segment, max_segment) into one new segment.
 * Segments living in the writer's own store (or its RAM buffer) are
 * queued for deletion once the merge is committed.  The updated
 * segments file is written under the commit lock, the obsolete
 * segment files are removed, and the merged segment is optionally
 * converted into a compound file. */
void iw_merge_segments_with_max(IndexWriter *iw, int min_segment, int max_segment)
{
  int i;
  // VLA: one slot per merged segment that might need deleting
  IndexReader *segments_to_delete[max_segment - min_segment];
  int del_cnt = 0;

  char *merged_name = new_segment_name(iw->sis->counter++);

  SegmentMerger *merger = sm_create(iw->store, merged_name, iw->term_index_interval);
  IndexReader *reader;

  for (i = min_segment; i < max_segment; i++) {
    reader = sr_open(iw->sis, i, false, false);
    sm_add(merger, reader);
    if ((reader->store == iw->store) || // if we own the directory
        (reader->store == iw->ram_store)) {
      segments_to_delete[del_cnt++] = reader; // queue segment for deletion
    }
  }

  int merged_doc_count = sm_merge(merger);

  // replace the merged segments' entries with one entry for the new segment
  sis_del_from_to(iw->sis, min_segment, max_segment);

  sis_add_si(iw->sis, si_create(merged_name, merged_doc_count, iw->store));

  // close readers before we attempt to delete now-obsolete segments

  mutex_lock(&iw->store->mutex);
  Lock *commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
  if (!commit_lock->obtain(commit_lock)) {
    eprintf(STATE_ERROR,
      "Could not obtain commit lock when trying to commit index");
  }
  // commit the index, then clean up the replaced segments
  sis_write(iw->sis, iw->store);
  iw_delete_segments(iw, segments_to_delete, del_cnt);

  commit_lock->release(commit_lock);
  iw->store->close_lock(commit_lock);
  mutex_unlock(&iw->store->mutex);

  if (iw->use_compound_file) {
    make_compound_file(iw, merged_name, merger);
  }

  sm_destroy(merger);
}
809
+
810
+ void iw_merge_segments(IndexWriter *iw, int min_segment)
811
+ {
812
+ iw_merge_segments_with_max(iw, min_segment, iw->sis->scnt);
813
+ }
814
+
815
+ void iw_maybe_merge_segments(IndexWriter *iw)
816
+ {
817
+ int target_merge_docs = iw->min_merge_docs;
818
+ int min_segment, merge_docs;
819
+ SegmentInfo *si;
820
+
821
+ while (target_merge_docs <= iw->max_merge_docs) {
822
+ // find segments smaller than current target size
823
+ min_segment = iw->sis->scnt - 1;
824
+ merge_docs = 0;
825
+ while (min_segment >= 0) {
826
+ si = iw->sis->segs[min_segment];
827
+ if (si->doc_cnt >= target_merge_docs)
828
+ break;
829
+ merge_docs += si->doc_cnt;
830
+ min_segment -= 1;
831
+ }
832
+
833
+ if (merge_docs >= target_merge_docs) // found a merge to do
834
+ iw_merge_segments(iw, min_segment + 1);
835
+ else
836
+ break;
837
+
838
+ target_merge_docs *= iw->merge_factor; // increase target size
839
+ }
840
+ }
841
+
842
/* Merge the contiguous run of RAM-buffered segments at the tail of the
 * segment list out into the real store. */
void iw_flush_ram_segments(IndexWriter *iw)
{
  int min_segment = iw->sis->scnt-1;
  int doc_count = 0;
  SegmentInfo **segs = iw->sis->segs;
  // scan back over the trailing RAM segments, summing their doc counts
  while ((min_segment >= 0) &&
      (segs[min_segment]->store == iw->ram_store)) {
    doc_count += segs[min_segment]->doc_cnt;
    min_segment--;
  }
  /* the following if statement is actually incrementing for different
   * reasons. If min_segment < 0 then we must increment as we searched
   * off the end. If the top segment is not ram_store there are no
   * ram segments to flush so we increment so the next check will return
   * us from this function. Lastly, the min_segment stopped at a segment
   * that wasn't a ram segment. But if it fits in with the merge
   * factor, why not merge it. Otherwise we leave it and increment min_seg
   */
  if (min_segment < 0 || // add one FS segment?
      (doc_count + segs[min_segment]->doc_cnt) > iw->merge_factor ||
      (segs[iw->sis->scnt-1]->store != iw->ram_store))
    min_segment++;
  if (min_segment >= iw->sis->scnt)
    return;  // nothing to flush
  iw_merge_segments(iw, min_segment);
}
868
+
869
+ void iw_add_doc(IndexWriter *iw, Document *doc)
870
+ {
871
+ DocumentWriter *dw;
872
+ char *segment_name;
873
+
874
+ mutex_lock(&iw->mutex);
875
+ dw = dw_open(iw->ram_store,
876
+ iw->analyzer,
877
+ iw->similarity,
878
+ iw->max_field_length,
879
+ iw->term_index_interval);
880
+ segment_name = new_segment_name(iw->sis->counter++);
881
+ dw_add_doc(dw, segment_name, doc);
882
+ dw_close(dw);
883
+ sis_add_si(iw->sis, si_create(segment_name, 1, iw->ram_store));
884
+ iw_maybe_merge_segments(iw);
885
+ mutex_unlock(&iw->mutex);
886
+ }
887
+
888
+ static inline void iw_optimize_internal(IndexWriter *iw)
889
+ {
890
+ int min_segment;
891
+ iw_flush_ram_segments(iw);
892
+ while (iw->sis->scnt > 1 ||
893
+ (iw->sis->scnt == 1 &&
894
+ ( si_has_deletions(iw->sis->segs[0]) ||
895
+ (iw->sis->segs[0]->store != iw->store) ||
896
+ (iw->use_compound_file &&
897
+ (!si_uses_compound_file(iw->sis->segs[0]) ||
898
+ si_has_separate_norms(iw->sis->segs[0])))))) {
899
+ min_segment = iw->sis->scnt - iw->merge_factor;
900
+ iw_merge_segments(iw, min_segment < 0 ? 0 : min_segment);
901
+ }
902
+ }
903
+ void iw_optimize(IndexWriter *iw)
904
+ {
905
+ mutex_lock(&iw->mutex);
906
+ iw_optimize_internal(iw);
907
+ mutex_unlock(&iw->mutex);
908
+ }
909
+
910
+ void iw_close(IndexWriter *iw)
911
+ {
912
+ mutex_lock(&iw->mutex);
913
+ iw_flush_ram_segments(iw);
914
+ ram_close(iw->ram_store);
915
+ sis_destroy(iw->sis);
916
+
917
+ sim_destroy(iw->similarity);
918
+ if (iw->close_analyzer) a_destroy(iw->analyzer);
919
+
920
+ iw->write_lock->release(iw->write_lock);
921
+ iw->store->close_lock(iw->write_lock);
922
+
923
+ if (iw->close_store)
924
+ store_close(iw->store);
925
+ mutex_destroy(&iw->mutex);
926
+ free(iw);
927
+ }
928
+
929
+ void iw_add_indexes(IndexWriter *iw, Store **stores, int cnt)
930
+ {
931
+ int i, j, end, start;
932
+
933
+ mutex_lock(&iw->mutex);
934
+ iw_optimize_internal(iw); // start with zero or 1 seg
935
+
936
+ start = iw->sis->scnt;
937
+
938
+ for (i = 0; i < cnt; i++) {
939
+ Store *store = stores[i];
940
+ SegmentInfos *sis = sis_create(); // read infos from dir
941
+ sis_read(sis, store);
942
+
943
+ for (j = 0; j < sis->scnt; j++) {
944
+ SegmentInfo *si = sis->segs[j];
945
+ sis_add_si(iw->sis, si);
946
+ }
947
+ sis_destroy_not_infos(sis);
948
+ }
949
+
950
+ // merge newly added segments in log(n) passes
951
+ while (iw->sis->scnt > start + iw->merge_factor) {
952
+ for (i = start + 1; i < iw->sis->scnt; i++) {
953
+ end = MIN(iw->sis->scnt, i + iw->merge_factor);
954
+ if (end - i > 1) {
955
+ iw_merge_segments_with_max(iw, i, end);
956
+ }
957
+ }
958
+ }
959
+
960
+ // final cleanup
961
+ iw_optimize_internal(iw);
962
+ mutex_unlock(&iw->mutex);
963
+ }
964
+
965
+
966
+ /**
967
+ * This adds an array of readers to the index leaving the added readers open.
968
+ */
969
+ void iw_add_readers(IndexWriter *iw, IndexReader **irs, int cnt)
970
+ {
971
+ IndexReader *ir = NULL;
972
+ int i, del_cnt = 0;
973
+
974
+ mutex_lock(&iw->mutex);
975
+ iw_optimize_internal(iw); // start with zero or 1 seg
976
+
977
+ char *merged_name = new_segment_name(iw->sis->counter++);
978
+
979
+ SegmentMerger *merger = sm_create(iw->store, merged_name, iw->term_index_interval);
980
+ merger->readers->free_elem = NULL; // don't close readers
981
+
982
+ if (iw->sis->scnt == 1) {// add existing index, if any
983
+ ir = sr_open_si(iw->sis->segs[0]);
984
+ sm_add(merger, ir);
985
+ del_cnt = 1;
986
+ }
987
+
988
+ for (i = 0; i < cnt; i++) {
989
+ sm_add(merger, irs[i]);
990
+ }
991
+
992
+ int doc_count = sm_merge(merger); // merge 'em
993
+
994
+ // pop old infos and add new ones.
995
+ sis_clear(iw->sis);
996
+ sis_add_si(iw->sis, si_create(merged_name, doc_count, iw->store));
997
+
998
+
999
+ Lock *commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
1000
+ if (!commit_lock->obtain(commit_lock)) // obtain write lock
1001
+ eprintf(STATE_ERROR, "Index locked for commit: %s", COMMIT_LOCK_NAME);
1002
+
1003
+ sis_write(iw->sis, iw->store); // commit changes
1004
+ iw_delete_segments(iw, &ir, del_cnt);
1005
+ if (ir) ir_close(ir);
1006
+
1007
+ commit_lock->release(commit_lock);
1008
+ iw->store->close_lock(commit_lock);
1009
+
1010
+ if (iw->use_compound_file) {
1011
+ make_compound_file(iw, merged_name, merger);
1012
+ }
1013
+
1014
+ iw_optimize_internal(iw);
1015
+ sm_destroy(merger);
1016
+
1017
+ mutex_unlock(&iw->mutex);
1018
+ }
1019
+
1020
+ /****************************************************************************
1021
+ *
1022
+ * Norm
1023
+ *
1024
+ ****************************************************************************/
1025
+
1026
+ Norm *norm_create(InStream *is, int field_num)
1027
+ {
1028
+ Norm *norm = ALLOC(Norm);
1029
+ norm->is = is;
1030
+ norm->field_num = field_num;
1031
+ norm->bytes = NULL;
1032
+ norm->is_dirty = false;
1033
+ return norm;
1034
+ }
1035
+
1036
+ void norm_destroy(void *p)
1037
+ {
1038
+ Norm *norm = (Norm *)p;
1039
+ is_close(norm->is);
1040
+ if (norm->bytes != NULL) free(norm->bytes);
1041
+ free(norm);
1042
+ }
1043
+
1044
+ void norm_rewrite(Norm *norm, Store *store, char *segment,
1045
+ int doc_count, Store *cfs_store)
1046
+ {
1047
+ if (norm->bytes == NULL)
1048
+ return; // These norms do not need to be rewritten
1049
+
1050
+ char tmp_fname[SEGMENT_NAME_MAX_LENGTH];
1051
+ char norm_fname[SEGMENT_NAME_MAX_LENGTH];
1052
+ sprintf(tmp_fname, "%s.tmp", segment);
1053
+ OutStream *os = store->create_output(store, tmp_fname);
1054
+ os_write_bytes(os, norm->bytes, doc_count);
1055
+ os_close(os);
1056
+ if (cfs_store) {
1057
+ sprintf(norm_fname, "%s.s%d", segment, norm->field_num);
1058
+ } else {
1059
+ sprintf(norm_fname, "%s.f%d", segment, norm->field_num);
1060
+ }
1061
+ store->rename(store, tmp_fname, norm_fname);
1062
+ norm->is_dirty = false;
1063
+ }
1064
+
1065
+ /****************************************************************************
1066
+ *
1067
+ * SegmentReader
1068
+ *
1069
+ ****************************************************************************/
1070
+
1071
+ #define GET_SR SegmentReader *sr = (SegmentReader *)ir->data;
1072
+
1073
+ int sr_max_doc(IndexReader *ir)
1074
+ {
1075
+ return ((SegmentReader *)ir->data)->fr->len;
1076
+ }
1077
+
1078
+ static inline void sr_close_norms(SegmentReader *sr)
1079
+ {
1080
+ h_destroy(sr->norms);
1081
+ }
1082
+
1083
+ static inline TermVectorsReader *sr_tvr(SegmentReader *sr)
1084
+ {
1085
+ TermVectorsReader *tvr;
1086
+ if ((tvr = thread_getspecific(sr->thread_tvr)) == NULL) {
1087
+ tvr = tvr_clone(sr->orig_tvr);
1088
+ if (tvr == NULL) printf("scuk\n");
1089
+ ary_append(sr->tvr_bucket, tvr);
1090
+ thread_setspecific(sr->thread_tvr, tvr);
1091
+ }
1092
+ return tvr;
1093
+ }
1094
+
1095
+ void sr_close(IndexReader *ir)
1096
+ {
1097
+ GET_SR;
1098
+ fr_close(sr->fr);
1099
+ tir_close(sr->tir);
1100
+
1101
+ if (sr->freq_in) is_close(sr->freq_in);
1102
+ if (sr->prox_in) is_close(sr->prox_in);
1103
+ fis_destroy(sr->fis);
1104
+
1105
+ sr_close_norms(sr);
1106
+
1107
+ if (sr->orig_tvr) {
1108
+ tvr_close(sr->orig_tvr);
1109
+ thread_key_delete(sr->thread_tvr);
1110
+ ary_destroy(sr->tvr_bucket);
1111
+ }
1112
+ if (sr->deleted_docs) bv_destroy(sr->deleted_docs);
1113
+ if (sr->cfs_store) sr->cfs_store->close(sr->cfs_store);
1114
+ if (sr->fake_norms) free(sr->fake_norms);
1115
+ free(sr->segment);
1116
+ free(sr);
1117
+ }
1118
+
1119
+ void sr_delete_doc(IndexReader *ir, int doc_num)
1120
+ {
1121
+ GET_SR;
1122
+ if (sr->deleted_docs == NULL)
1123
+ sr->deleted_docs = bv_create();
1124
+
1125
+ sr->deleted_docs_dirty = true;
1126
+ sr->undelete_all = false;
1127
+ bv_set(sr->deleted_docs, doc_num);
1128
+ }
1129
+
1130
+ static inline bool sr_is_deleted_internal(IndexReader *ir, int doc_num)
1131
+ {
1132
+ GET_SR;
1133
+ return (sr->deleted_docs != NULL && bv_get(sr->deleted_docs, doc_num));
1134
+ }
1135
+
1136
+ bool sr_is_deleted(IndexReader *ir, int doc_num)
1137
+ {
1138
+ bool is_del;
1139
+
1140
+ mutex_lock(&ir->mutex);
1141
+ is_del = sr_is_deleted_internal(ir, doc_num);
1142
+ mutex_unlock(&ir->mutex);
1143
+
1144
+ return is_del;
1145
+ }
1146
+
1147
+ bool sr_has_norms(IndexReader *ir, char *field)
1148
+ {
1149
+ bool has_norms;
1150
+ GET_SR;
1151
+ mutex_lock(&ir->mutex);
1152
+ has_norms = h_has_key(sr->norms, field);
1153
+ mutex_unlock(&ir->mutex);
1154
+
1155
+ return has_norms;
1156
+ }
1157
+
1158
+ bool sr_has_deletions(IndexReader *ir)
1159
+ {
1160
+ GET_SR;
1161
+ return (sr->deleted_docs != NULL);
1162
+ }
1163
+
1164
+ void sr_undelete_all(IndexReader *ir)
1165
+ {
1166
+ GET_SR;
1167
+ sr->undelete_all = true;
1168
+ sr->deleted_docs_dirty = false;
1169
+ if (sr->deleted_docs != NULL) bv_destroy(sr->deleted_docs);
1170
+ sr->deleted_docs = NULL;
1171
+ }
1172
+
1173
+ TermEnum *sr_terms(IndexReader *ir)
1174
+ {
1175
+ TermEnum *te = ((SegmentReader *)ir->data)->tir->orig_te;
1176
+ return te->clone(te);
1177
+ }
1178
+
1179
+ TermEnum *sr_terms_from(IndexReader *ir, Term *term)
1180
+ {
1181
+ TermEnum *te = ((SegmentReader *)ir->data)->tir->orig_te;
1182
+ TermEnum *ret_te = te->clone(te);
1183
+ te_skip_to(ret_te, term);
1184
+ return ret_te;
1185
+ }
1186
+
1187
+ Document *sr_get_doc(IndexReader *ir, int doc_num)
1188
+ {
1189
+ Document *doc;
1190
+ mutex_lock(&ir->mutex);
1191
+ if (sr_is_deleted_internal(ir, doc_num)) {
1192
+ mutex_unlock(&ir->mutex);
1193
+ eprintf(STATE_ERROR,
1194
+ "Tried to get doc <%ld> that has already been deleted", doc_num);
1195
+ }
1196
+ GET_SR;
1197
+ doc = fr_get_doc(sr->fr, doc_num);
1198
+ mutex_unlock(&ir->mutex);
1199
+ return doc;
1200
+ }
1201
+
1202
+ static inline void
1203
+ sr_get_norms_into_internal(IndexReader *ir, char *field, uchar *buf, int offset)
1204
+ {
1205
+ GET_SR;
1206
+ Norm *norm = h_get(sr->norms, field);
1207
+ if (norm == NULL) {
1208
+ memset(buf + offset*sizeof(uchar), 0, sr_max_doc(ir)*sizeof(uchar));
1209
+ } else if (norm->bytes != NULL) { // can copy from cache
1210
+ memcpy(buf + offset*sizeof(uchar), norm->bytes, sr_max_doc(ir)*sizeof(uchar));
1211
+ } else {
1212
+ InStream *norm_in = is_clone(norm->is);
1213
+ // read from disk
1214
+ is_seek(norm_in, 0);
1215
+ is_read_bytes(norm_in, buf, offset, sr_max_doc(ir));
1216
+ is_close(norm_in);
1217
+ }
1218
+ }
1219
+
1220
+ void sr_get_norms_into(IndexReader *ir, char *field, uchar *buf, int offset)
1221
+ {
1222
+ mutex_lock(&ir->mutex);
1223
+ sr_get_norms_into_internal(ir, field, buf, offset);
1224
+ mutex_unlock(&ir->mutex);
1225
+ }
1226
+
1227
+ static inline uchar *sr_get_norms_internal(IndexReader *ir, char *field)
1228
+ {
1229
+ GET_SR;
1230
+ Norm *norm = h_get(sr->norms, field);
1231
+ if (norm == NULL) // not an indexed field
1232
+ return NULL;
1233
+
1234
+ if (norm->bytes == NULL) { // value not yet read
1235
+ uchar *bytes = ALLOC_N(uchar, ir->max_doc(ir));
1236
+ sr_get_norms_into_internal(ir, field, bytes, 0);
1237
+ norm->bytes = bytes; // cache it
1238
+ }
1239
+ return norm->bytes;
1240
+ }
1241
+
1242
+ uchar *sr_get_norms(IndexReader *ir, char *field)
1243
+ {
1244
+ uchar *norms;
1245
+ mutex_lock(&ir->mutex);
1246
+ norms = sr_get_norms_internal(ir, field);
1247
+ mutex_unlock(&ir->mutex);
1248
+ return norms;
1249
+ }
1250
+
1251
+ static inline uchar *sr_get_norms_always(IndexReader *ir, char *field)
1252
+ {
1253
+ uchar *bytes;
1254
+ GET_SR;
1255
+ mutex_lock(&ir->mutex);
1256
+
1257
+ bytes = sr_get_norms_internal(ir, field);
1258
+ if (bytes == NULL) {
1259
+ if (sr->fake_norms) {
1260
+ bytes = sr->fake_norms;
1261
+ } else {
1262
+ int len = ir->max_doc(ir);
1263
+ sr->fake_norms = bytes = ALLOC_N(uchar, len);
1264
+ memset(bytes, 0, len);
1265
+ }
1266
+ }
1267
+ mutex_unlock(&ir->mutex);
1268
+ return bytes;
1269
+ }
1270
+
1271
+ void sr_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
1272
+ {
1273
+ GET_SR;
1274
+ Norm *norm;
1275
+
1276
+ norm = h_get(sr->norms, field);
1277
+ if (norm != NULL) { /* an indexed field */
1278
+ norm->is_dirty = true; // mark it dirty
1279
+ sr->norms_dirty = true;
1280
+
1281
+ sr_get_norms_internal(ir, field)[doc_num] = val;
1282
+ }
1283
+ }
1284
+
1285
+ int sr_doc_freq(IndexReader *ir, Term *t)
1286
+ {
1287
+ GET_SR;
1288
+ TermInfo *ti = tir_get_ti(sr->tir, t);
1289
+ if (ti != NULL) {
1290
+ int df = ti->doc_freq;
1291
+ ti_destroy(ti);
1292
+ return df;
1293
+ } else return 0;
1294
+ }
1295
+
1296
+ Array *sr_file_names(IndexReader *ir)
1297
+ {
1298
+ GET_SR;
1299
+ Array *file_names = ary_create(0, &efree);
1300
+ FieldInfo *fi;
1301
+ int i;
1302
+ char fname[SEGMENT_NAME_MAX_LENGTH];
1303
+
1304
+ for (i = 0; i < NELEMS(INDEX_EXTENSIONS); i++) {
1305
+ sprintf(fname, "%s.%s", sr->segment, INDEX_EXTENSIONS[i]);
1306
+ if (ir->store->exists(ir->store, fname))
1307
+ ary_append(file_names, estrdup(fname));
1308
+ }
1309
+
1310
+ for (i = 0; i < sr->fis->fcnt; i++) {
1311
+ fi = sr->fis->by_number[i];
1312
+ if (fi->is_indexed && !fi->omit_norms) {
1313
+ if (sr->cfs_store) {
1314
+ sprintf(fname, "%s.s%d", sr->segment, i);
1315
+ } else {
1316
+ sprintf(fname, "%s.f%d", sr->segment, i);
1317
+ }
1318
+ if (ir->store->exists(ir->store, fname))
1319
+ ary_append(file_names, estrdup(fname));
1320
+ }
1321
+ }
1322
+ return file_names;
1323
+ }
1324
+
1325
+ HashSet *sr_get_field_names(IndexReader *ir, int field_type)
1326
+ {
1327
+ int i;
1328
+ GET_SR;
1329
+ HashSet *field_set = hs_str_create(NULL);
1330
+ FieldInfo *fi;
1331
+ for (i = 0; i < sr->fis->fcnt; i++) {
1332
+ fi = sr->fis->by_number[i];
1333
+ switch(field_type) {
1334
+ case IR_ALL:
1335
+ hs_add(field_set, fi->name);
1336
+ break;
1337
+ case IR_UNINDEXED:
1338
+ if (!fi->is_indexed) hs_add(field_set, fi->name);
1339
+ break;
1340
+ case IR_INDEXED:
1341
+ if (fi->is_indexed) hs_add(field_set, fi->name);
1342
+ break;
1343
+ case IR_INDEXED_NO_TERM_VECTOR:
1344
+ if (fi->is_indexed && !fi->store_tv) hs_add(field_set, fi->name);
1345
+ break;
1346
+ case IR_TERM_VECTOR:
1347
+ if (fi->store_tv && !fi->store_pos && !fi->store_offset)
1348
+ hs_add(field_set, fi->name);
1349
+ break;
1350
+ case IR_INDEXED_WITH_TERM_VECTOR:
1351
+ if (fi->is_indexed && fi->store_tv) hs_add(field_set, fi->name);
1352
+ break;
1353
+ case IR_TERM_VECTOR_WITH_POSITION:
1354
+ if (fi->store_pos && !fi->store_offset) hs_add(field_set, fi->name);
1355
+ break;
1356
+ case IR_TERM_VECTOR_WITH_OFFSET:
1357
+ if (!fi->store_pos && fi->store_offset) hs_add(field_set, fi->name);
1358
+ case IR_TERM_VECTOR_WITH_POSITION_OFFSET:
1359
+ if (fi->store_pos && fi->store_offset) hs_add(field_set, fi->name);
1360
+ break;
1361
+ default:
1362
+ eprintf(ARG_ERROR, "Invalid field_type <%ld>.", field_type);
1363
+ }
1364
+ }
1365
+ return field_set;
1366
+ }
1367
+
1368
+ int sr_num_docs(IndexReader *ir)
1369
+ {
1370
+ GET_SR;
1371
+
1372
+ mutex_lock(&ir->mutex);
1373
+ int num_docs = sr_max_doc(ir);
1374
+ if (sr->deleted_docs != NULL)
1375
+ num_docs -= sr->deleted_docs->count;
1376
+ mutex_unlock(&ir->mutex);
1377
+ return num_docs;
1378
+ }
1379
+
1380
+ TermDocEnum *sr_term_docs(IndexReader *ir)
1381
+ {
1382
+ return stde_create(ir);
1383
+ }
1384
+
1385
+ TermDocEnum *sr_term_positions(IndexReader *ir)
1386
+ {
1387
+ return stpe_create(ir);
1388
+ }
1389
+
1390
+ void sr_open_norms(IndexReader *ir, Store *cfs_store)
1391
+ {
1392
+ GET_SR;
1393
+ int i;
1394
+ FieldInfo *fi;
1395
+ Store *tmp_store;
1396
+ char fname[SEGMENT_NAME_MAX_LENGTH];
1397
+ for (i = 0; i < sr->fis->fcnt; i++) {
1398
+ tmp_store = ir->store;
1399
+ fi = sr->fis->by_number[i];
1400
+ if (fi->is_indexed && !fi->omit_norms) {
1401
+ sprintf(fname, "%s.s%d", sr->segment, fi->number);
1402
+ if (! tmp_store->exists(tmp_store, fname)) {
1403
+ sprintf(fname, "%s.f%d", sr->segment, fi->number);
1404
+ tmp_store = cfs_store;
1405
+ }
1406
+ h_set(sr->norms, fi->name,
1407
+ norm_create(tmp_store->open_input(tmp_store, fname), fi->number));
1408
+ }
1409
+ }
1410
+ sr->norms_dirty = false;
1411
+ }
1412
+
1413
+ TermVector *sr_get_term_vector(IndexReader *ir, int doc_num, char *field)
1414
+ {
1415
+ GET_SR;
1416
+ FieldInfo *fi = (FieldInfo *)ht_get(sr->fis->by_name, field);
1417
+ TermVectorsReader *tvr;
1418
+
1419
+ if (fi == NULL || !fi->store_tv || !sr->orig_tvr || !(tvr = sr_tvr(sr)))
1420
+ return NULL;
1421
+
1422
+ return tvr_get_field_tv(tvr, doc_num, field);
1423
+ }
1424
+
1425
+ Array *sr_get_term_vectors(IndexReader *ir, int doc_num)
1426
+ {
1427
+ GET_SR;
1428
+ TermVectorsReader *tvr;
1429
+ if (sr->orig_tvr == NULL || (tvr = sr_tvr(sr)) == NULL)
1430
+ return NULL;
1431
+
1432
+ return tvr_get_tv(tvr, doc_num);
1433
+ }
1434
+
1435
+ void sr_commit(IndexReader *ir)
1436
+ {
1437
+ GET_SR;
1438
+ char tmp_fname[SEGMENT_NAME_MAX_LENGTH];
1439
+ char del_fname[SEGMENT_NAME_MAX_LENGTH];
1440
+ sprintf(del_fname, "%s.del", sr->segment);
1441
+
1442
+ if (sr->deleted_docs_dirty) { // re-write deleted
1443
+ sprintf(tmp_fname, "%s.tmp", sr->segment);
1444
+ bv_write(sr->deleted_docs, ir->store, tmp_fname);
1445
+ ir->store->rename(ir->store, tmp_fname, del_fname);
1446
+ }
1447
+ if (sr->undelete_all && ir->store->exists(ir->store, del_fname))
1448
+ ir->store->remove(ir->store, del_fname);
1449
+ if (sr->norms_dirty) {// re-write norms
1450
+ int i;
1451
+ FieldInfo *fi;
1452
+ for (i = 0; i < sr->fis->fcnt; i++) {
1453
+ fi = sr->fis->by_number[i];
1454
+ if (fi->is_indexed) {
1455
+ norm_rewrite((Norm *)h_get(sr->norms, fi->name), ir->store,
1456
+ sr->segment, sr_max_doc(ir), sr->cfs_store);
1457
+ }
1458
+ }
1459
+ }
1460
+ sr->deleted_docs_dirty = false;
1461
+ sr->norms_dirty = false;
1462
+ sr->undelete_all = false;
1463
+ }
1464
+
1465
+ IndexReader *sr_open_internal(IndexReader *ir, SegmentInfo *si)
1466
+ {
1467
+ Store *store = si->store;
1468
+ SegmentReader *sr = ALLOC(SegmentReader);
1469
+ ir->get_term_vector = &sr_get_term_vector;
1470
+ ir->get_term_vectors = &sr_get_term_vectors;
1471
+ ir->num_docs = &sr_num_docs;
1472
+ ir->max_doc = &sr_max_doc;
1473
+ ir->get_doc = &sr_get_doc;
1474
+ ir->get_norms_into = &sr_get_norms_into;
1475
+ ir->get_norms = &sr_get_norms;
1476
+ ir->get_norms_always = &sr_get_norms_always;
1477
+ ir->do_set_norm = &sr_set_norm;
1478
+ ir->terms = &sr_terms;
1479
+ ir->terms_from = &sr_terms_from;
1480
+ ir->doc_freq = &sr_doc_freq;
1481
+ ir->term_docs = &sr_term_docs;
1482
+ ir->term_positions = &sr_term_positions;
1483
+ ir->do_delete_doc = &sr_delete_doc;
1484
+ ir->is_deleted = &sr_is_deleted;
1485
+ ir->has_norms = &sr_has_norms;
1486
+ ir->has_deletions = &sr_has_deletions;
1487
+ ir->do_undelete_all = &sr_undelete_all;
1488
+ ir->get_field_names = &sr_get_field_names;
1489
+ ir->do_commit = &sr_commit;
1490
+ ir->do_close = &sr_close;
1491
+ ir->data = sr;
1492
+ sr->segment = estrdup(si->name);
1493
+ char fname[SEGMENT_NAME_MAX_LENGTH];
1494
+ sr->cfs_store = NULL;
1495
+ sr->fake_norms = NULL;
1496
+ sprintf(fname, "%s.cfs", sr->segment);
1497
+ if (store->exists(store, fname)) {
1498
+ sr->cfs_store = open_cmpd_store(store, fname);
1499
+ store = sr->cfs_store;
1500
+ }
1501
+
1502
+ sprintf(fname, "%s.fnm", sr->segment);
1503
+ sr->fis = fis_open(store, fname);
1504
+ sr->fr = fr_open(store, sr->segment, sr->fis);
1505
+
1506
+ sr->tir = tir_open(store, sr->segment, sr->fis);
1507
+ sr->deleted_docs = NULL;
1508
+ sr->deleted_docs_dirty = false;
1509
+ sr->undelete_all = false;
1510
+ if (si_has_deletions(si)) {
1511
+ sprintf(fname, "%s.del", sr->segment);
1512
+ sr->deleted_docs = bv_read(si->store, fname);
1513
+ }
1514
+
1515
+ sprintf(fname, "%s.frq", sr->segment);
1516
+ sr->freq_in = store->open_input(store, fname);
1517
+ sprintf(fname, "%s.prx", sr->segment);
1518
+ sr->prox_in = store->open_input(store, fname);
1519
+ sr->norms = h_new_str(NULL, &norm_destroy);
1520
+ sr_open_norms(ir, store);
1521
+
1522
+ if (fis_has_vectors(sr->fis)) {
1523
+ sr->orig_tvr = tvr_open(store, sr->segment, sr->fis);
1524
+ thread_key_create(&sr->thread_tvr, NULL);
1525
+ sr->tvr_bucket = ary_create(1, (destroy_func_t)&tvr_close);
1526
+ } else {
1527
+ sr->orig_tvr = NULL;
1528
+ }
1529
+ return ir;
1530
+ }
1531
+
1532
+ IndexReader *sr_open_si(SegmentInfo *si)
1533
+ {
1534
+ IndexReader *ir = ir_create(si->store, NULL, false, false);
1535
+ return sr_open_internal(ir, si);
1536
+ }
1537
+
1538
+ IndexReader *sr_open(SegmentInfos *sis, int si_num, int is_owner, int close_store)
1539
+ {
1540
+ SegmentInfo *si = sis->segs[si_num];
1541
+ IndexReader *ir = ir_create(si->store, sis, is_owner, close_store);
1542
+ return sr_open_internal(ir, si);
1543
+ }
1544
+ /****************************************************************************
1545
+ *
1546
+ * MultiReader
1547
+ *
1548
+ ****************************************************************************/
1549
+
1550
+ #define GET_MR MultiReader *mr = (MultiReader *)ir->data
1551
+ #define GET_READER(doc_num) MultiReader *mr = (MultiReader *)ir->data;\
1552
+ int i = mr_reader_index(mr, doc_num);\
1553
+ IndexReader *reader = mr->sub_readers[i];
1554
+
1555
+
1556
+
1557
+ int mr_reader_index(MultiReader *mr, int doc_num)
1558
+ {
1559
+ int lo = 0; // search @starts array
1560
+ int hi = mr->rcnt - 1; // for first element less
1561
+ int mid;
1562
+ int mid_value;
1563
+
1564
+ while (hi >= lo) {
1565
+ mid = (lo + hi) >> 1;
1566
+ mid_value = mr->starts[mid];
1567
+ if (doc_num < mid_value) {
1568
+ hi = mid - 1;
1569
+ } else if (doc_num > mid_value) {
1570
+ lo = mid + 1;
1571
+ } else { // found a match
1572
+ while ((mid+1 < mr->rcnt) && (mr->starts[mid+1] == mid_value))
1573
+ mid += 1; // scan to last match in case we have empty segments
1574
+ return mid;
1575
+ }
1576
+ }
1577
+ return hi;
1578
+ }
1579
+
1580
+ TermVector *mr_get_term_vector(IndexReader *ir, int doc_num, char *field)
1581
+ {
1582
+ GET_READER(doc_num);
1583
+ return reader->get_term_vector(reader, doc_num - mr->starts[i], field);
1584
+ }
1585
+
1586
+ Array *mr_get_term_vectors(IndexReader *ir, int doc_num)
1587
+ {
1588
+ GET_READER(doc_num);
1589
+ return reader->get_term_vectors(reader, doc_num - mr->starts[i]);
1590
+ }
1591
+
1592
+ int mr_num_docs(IndexReader *ir)
1593
+ {
1594
+ int i, num_docs;
1595
+ GET_MR;
1596
+ mutex_lock(&ir->mutex);
1597
+ if (mr->num_docs_cache == -1) {
1598
+ IndexReader *reader;
1599
+ mr->num_docs_cache = 0;
1600
+ for (i = 0; i < mr->rcnt; i++) {
1601
+ reader = mr->sub_readers[i];
1602
+ mr->num_docs_cache += reader->num_docs(reader);
1603
+ }
1604
+ }
1605
+ num_docs = mr->num_docs_cache;
1606
+ mutex_unlock(&ir->mutex);
1607
+
1608
+ return num_docs;
1609
+ }
1610
+
1611
+ int mr_max_doc(IndexReader *ir)
1612
+ {
1613
+ GET_MR;
1614
+ return mr->max_doc;
1615
+ }
1616
+
1617
+ Document *mr_get_doc(IndexReader *ir, int doc_num)
1618
+ {
1619
+ GET_READER(doc_num);
1620
+ return reader->get_doc(reader, doc_num - mr->starts[i]);
1621
+ }
1622
+
1623
+ void mr_get_norms_into(IndexReader *ir, char *field, uchar *buf, int offset)
1624
+ {
1625
+ int i;
1626
+ GET_MR;
1627
+
1628
+ mutex_lock(&ir->mutex);
1629
+ uchar *bytes = h_get(mr->norms_cache, field);
1630
+ if (bytes != NULL) {
1631
+ memcpy(buf + offset, bytes, mr->max_doc);
1632
+ } else {
1633
+ IndexReader *reader;
1634
+ for (i = 0; i < mr->rcnt; i++) {
1635
+ reader = mr->sub_readers[i];
1636
+ reader->get_norms_into(reader, field, buf, offset + mr->starts[i]);
1637
+ }
1638
+ }
1639
+ mutex_unlock(&ir->mutex);
1640
+ }
1641
+
1642
+ uchar *mr_get_norms(IndexReader *ir, char *field)
1643
+ {
1644
+ int i;
1645
+ GET_MR;
1646
+ uchar *bytes;
1647
+ IndexReader *reader;
1648
+
1649
+ mutex_lock(&ir->mutex);
1650
+ bytes = h_get(mr->norms_cache, field);
1651
+ if (bytes == NULL) {
1652
+ bytes = ALLOC_N(uchar, mr->max_doc);
1653
+
1654
+ for (i = 0; i < mr->rcnt; i++) {
1655
+ reader = mr->sub_readers[i];
1656
+ reader->get_norms_into(reader, field, bytes, mr->starts[i]);
1657
+ }
1658
+ h_set(mr->norms_cache, field, bytes); // update cache
1659
+ }
1660
+ mutex_unlock(&ir->mutex);
1661
+
1662
+ return bytes;
1663
+ }
1664
+
1665
+ void mr_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
1666
+ {
1667
+ GET_READER(doc_num);
1668
+ h_del(mr->norms_cache, field); // clear cache
1669
+ ir_set_norm(reader, doc_num - mr->starts[i], field, val);
1670
+ }
1671
+
1672
+ TermEnum *mr_terms(IndexReader *ir)
1673
+ {
1674
+ GET_MR;
1675
+ return mte_create(mr->sub_readers, mr->starts, mr->rcnt, NULL);
1676
+ }
1677
+
1678
+ TermEnum *mr_terms_from(IndexReader *ir, Term *term)
1679
+ {
1680
+ GET_MR;
1681
+ return mte_create(mr->sub_readers, mr->starts, mr->rcnt, term);
1682
+ }
1683
+
1684
+ int mr_doc_freq(IndexReader *ir, Term *t)
1685
+ {
1686
+ int total = 0, i; // sum freqs in segments
1687
+ GET_MR;
1688
+
1689
+ IndexReader *reader;
1690
+ for (i = 0; i < mr->rcnt; i++) {
1691
+ reader = mr->sub_readers[i];
1692
+ total += reader->doc_freq(reader, t);
1693
+ }
1694
+ return total;
1695
+ }
1696
+
1697
+ TermDocEnum *mr_term_docs(IndexReader *ir)
1698
+ {
1699
+ GET_MR;
1700
+ return mtde_create(mr->sub_readers, mr->starts, mr->rcnt);
1701
+ }
1702
+
1703
+ TermDocEnum *mr_term_positions(IndexReader *ir)
1704
+ {
1705
+ GET_MR;
1706
+ return mtpe_create(mr->sub_readers, mr->starts, mr->rcnt);
1707
+ }
1708
+
1709
+ void mr_delete_doc(IndexReader *ir, int doc_num)
1710
+ {
1711
+ GET_READER(doc_num);
1712
+ mr->num_docs_cache = -1; // invalidate cache
1713
+
1714
+ reader->do_delete_doc(reader, doc_num - mr->starts[i]); // dispatch to segment reader
1715
+ mr->has_deletions = true;
1716
+ }
1717
+
1718
+ bool mr_is_deleted(IndexReader *ir, int doc_num)
1719
+ {
1720
+ GET_READER(doc_num);
1721
+ return reader->is_deleted(reader, doc_num - mr->starts[i]);
1722
+ }
1723
+
1724
+ bool mr_has_norms(IndexReader *ir, char *field)
1725
+ {
1726
+ bool has_norms = false;
1727
+ int i;
1728
+ GET_MR;
1729
+
1730
+ IndexReader *reader;
1731
+ for (i = 0; i < mr->rcnt; i++) {
1732
+ reader = mr->sub_readers[i];
1733
+ if (reader->has_norms(reader, field)) {
1734
+ has_norms = true;
1735
+ break;
1736
+ }
1737
+ }
1738
+
1739
+ return has_norms;
1740
+ }
1741
+
1742
+ bool mr_has_deletions(IndexReader *ir)
1743
+ {
1744
+ GET_MR;
1745
+ return mr->has_deletions;
1746
+ }
1747
+
1748
+ void mr_undelete_all(IndexReader *ir)
1749
+ {
1750
+ int i;
1751
+ GET_MR;
1752
+ mr->num_docs_cache = -1; // invalidate cache
1753
+ IndexReader *reader;
1754
+ for (i = 0; i < mr->rcnt; i++) {
1755
+ reader = mr->sub_readers[i];
1756
+ reader->do_undelete_all(reader);
1757
+ }
1758
+ mr->has_deletions = false;
1759
+ }
1760
+
1761
+ HashSet *mr_get_field_names(IndexReader *ir, int field_type)
1762
+ {
1763
+ int i;
1764
+ GET_MR;
1765
+ HashSet *field_set = hs_str_create(NULL);
1766
+ IndexReader *reader;
1767
+ for (i = 0; i < mr->rcnt; i++) {
1768
+ reader = mr->sub_readers[i];
1769
+ hs_merge(field_set, reader->get_field_names(reader, field_type));
1770
+ }
1771
+ return field_set;
1772
+ }
1773
+
1774
+ void mr_commit(IndexReader *ir)
1775
+ {
1776
+ GET_MR;
1777
+ int i;
1778
+ IndexReader *reader;
1779
+ for (i = 0; i < mr->rcnt; i++) {
1780
+ reader = mr->sub_readers[i];
1781
+ reader->do_commit(reader);
1782
+ }
1783
+ }
1784
+
1785
+ void mr_close(IndexReader *ir)
1786
+ {
1787
+ GET_MR;
1788
+ int i;
1789
+ IndexReader *reader;
1790
+ for (i = 0; i < mr->rcnt; i++) {
1791
+ reader = mr->sub_readers[i];
1792
+ ir_close(reader);
1793
+ }
1794
+ free(mr->sub_readers);
1795
+ h_destroy(mr->norms_cache);
1796
+ free(mr->starts);
1797
+ free(mr);
1798
+ }
1799
+
1800
+ IndexReader *mr_open(Store *store,
1801
+ SegmentInfos *sis,
1802
+ IndexReader **sub_readers,
1803
+ int rcnt,
1804
+ int close_store)
1805
+ {
1806
+ int i;
1807
+ MultiReader *mr = ALLOC(MultiReader);
1808
+ IndexReader *sub_reader;
1809
+ mr->sub_readers = sub_readers;
1810
+ mr->rcnt = rcnt;
1811
+
1812
+ mr->max_doc = 0;
1813
+ mr->num_docs_cache = -1;
1814
+ mr->has_deletions = false;
1815
+
1816
+ mr->starts = ALLOC_N(int, (rcnt+1));
1817
+ for (i = 0; i < rcnt; i++) {
1818
+ sub_reader = sub_readers[i];
1819
+ mr->starts[i] = mr->max_doc;
1820
+ mr->max_doc += sub_reader->max_doc(sub_reader); // compute max_docs
1821
+
1822
+ if (sub_reader->has_deletions(sub_reader))
1823
+ mr->has_deletions = true;
1824
+ }
1825
+ mr->starts[rcnt] = mr->max_doc;
1826
+ mr->norms_cache = h_new_str(NULL, &efree);
1827
+
1828
+ IndexReader *ir = ir_create(store, sis, true, close_store);
1829
+ ir->get_term_vector = &mr_get_term_vector;
1830
+ ir->get_term_vectors = &mr_get_term_vectors;
1831
+ ir->num_docs = &mr_num_docs;
1832
+ ir->max_doc = &mr_max_doc;
1833
+ ir->get_doc = &mr_get_doc;
1834
+ ir->get_norms_into = &mr_get_norms_into;
1835
+ ir->get_norms = &mr_get_norms;
1836
+ ir->get_norms_always = &mr_get_norms;
1837
+ ir->do_set_norm = &mr_set_norm;
1838
+ ir->terms = &mr_terms;
1839
+ ir->terms_from = &mr_terms_from;
1840
+ ir->doc_freq = &mr_doc_freq;
1841
+ ir->term_docs = &mr_term_docs;
1842
+ ir->term_positions = &mr_term_positions;
1843
+ ir->do_delete_doc = &mr_delete_doc;
1844
+ ir->is_deleted = &mr_is_deleted;
1845
+ ir->has_norms = &mr_has_norms;
1846
+ ir->has_deletions = &mr_has_deletions;
1847
+ ir->do_undelete_all = &mr_undelete_all;
1848
+ ir->get_field_names = &mr_get_field_names;
1849
+ ir->do_commit = &mr_commit;
1850
+ ir->do_close = &mr_close;
1851
+ ir->data = mr;
1852
+
1853
+ return ir;
1854
+ }
1855
+
1856
+ /****************************************************************************
1857
+ *
1858
+ * SegmentMergeInfo
1859
+ *
1860
+ ****************************************************************************/
1861
+
1862
+ bool smi_lt(void *p1, void *p2)
1863
+ {
1864
+ SegmentMergeInfo *smi1 = (SegmentMergeInfo *)p1;
1865
+ SegmentMergeInfo *smi2 = (SegmentMergeInfo *)p2;
1866
+
1867
+ int cmpres = tb_cmp(smi1->tb, smi2->tb);
1868
+ if (cmpres == 0) {
1869
+ return smi1->base < smi2->base;
1870
+ } else {
1871
+ return cmpres < 0;
1872
+ }
1873
+ }
1874
+
1875
+ int *smi_load_doc_map(SegmentMergeInfo *smi)
1876
+ {
1877
+ IndexReader *ir = smi->ir;
1878
+ if (ir->has_deletions(ir) && (smi->doc_map == NULL)) {
1879
+ int max_doc = ir->max_doc(ir);
1880
+ smi->doc_map = ALLOC_N(int, max_doc);
1881
+ int j = 0, i;
1882
+ for (i = 0; i < max_doc; i++) {
1883
+ if (ir->is_deleted(ir, i)) {
1884
+ smi->doc_map[i] = -1;
1885
+ } else {
1886
+ smi->doc_map[i] = j++;
1887
+ }
1888
+ }
1889
+ }
1890
+ return smi->doc_map;
1891
+ }
1892
+
1893
+ SegmentMergeInfo *smi_create(int base, TermEnum *te, IndexReader *ir)
1894
+ {
1895
+ SegmentMergeInfo *smi = ALLOC(SegmentMergeInfo);
1896
+ smi->base = base;
1897
+ smi->ir = ir;
1898
+ smi->te = te;
1899
+ smi->tb = te->tb_curr;
1900
+ smi->postings = ir->term_positions(ir);
1901
+ smi->doc_map = NULL;
1902
+ return smi;
1903
+ }
1904
+
1905
+ void smi_destroy(void *p)
1906
+ {
1907
+ SegmentMergeInfo *smi = (SegmentMergeInfo *)p;
1908
+ smi->postings->close(smi->postings);
1909
+ smi->te->close(smi->te);
1910
+ if (smi->doc_map != NULL)
1911
+ free(smi->doc_map);
1912
+ free(smi);
1913
+ }
1914
+
1915
+ TermBuffer *smi_next(SegmentMergeInfo *smi)
1916
+ {
1917
+ return (smi->tb = smi->te->next(smi->te));
1918
+ }
1919
+
1920
+ /****************************************************************************
1921
+ *
1922
+ * SegmentMerger
1923
+ *
1924
+ ****************************************************************************/
1925
+
1926
+ SegmentMerger *sm_create(Store *store, char *name, int term_index_interval)
1927
+ {
1928
+ SegmentMerger *sm = ALLOC(SegmentMerger);
1929
+ sm->store = store;
1930
+ sm->name = estrdup(name);
1931
+ sm->readers = ary_create(config.merge_factor, &ir_destroy);
1932
+ sm->fis = NULL;
1933
+ sm->freq_out = NULL;
1934
+ sm->prox_out = NULL;
1935
+ sm->tiw = NULL;
1936
+ sm->queue = NULL;
1937
+ sm->ti = ti_create(0, 0, 0, 0);
1938
+ sm->term_index_interval = term_index_interval;
1939
+ sm->skip_buffer = ram_create_buffer();
1940
+ sm->skip_interval = -1;
1941
+ return sm;
1942
+ }
1943
+
1944
+ void sm_close(SegmentMerger *sm)
1945
+ {
1946
+ int i;
1947
+ if (sm->freq_out != NULL) os_close(sm->freq_out);
1948
+ if (sm->prox_out != NULL) os_close(sm->prox_out);
1949
+ if (sm->tiw != NULL) {
1950
+ for (i = 0; i < sm->terms_buf_size; i++)
1951
+ free(sm->terms_buf[i].text);
1952
+ free(sm->terms_buf);
1953
+ tiw_close(sm->tiw);
1954
+ }
1955
+ if (sm->queue != NULL) pq_destroy(sm->queue);
1956
+ sm->freq_out = NULL;
1957
+ sm->prox_out = NULL;
1958
+ sm->tiw = NULL;
1959
+ sm->queue = NULL;
1960
+ }
1961
+
1962
/* Destroy a SegmentMerger and everything it owns.  ary_destroy closes
 * the added readers (readers array was created with &ir_destroy as its
 * element destructor); sm_close runs before freeing name/ti because it
 * may still reference them through the term-infos writer. */
void sm_destroy(void *p)
{
  SegmentMerger *sm = (SegmentMerger *)p;
  if (sm->fis != NULL) fis_destroy(sm->fis);
  ary_destroy(sm->readers);
  sm_close(sm);
  free(sm->name);
  ti_destroy(sm->ti);
  ram_destroy_buffer(sm->skip_buffer);
  free(sm);
}
1973
+
1974
+ void sm_add(SegmentMerger *sm, IndexReader *ir)
1975
+ {
1976
+ ary_append(sm->readers, ir);
1977
+ }
1978
+
1979
+ static inline void sm_add_indexed(IndexReader *ir,
1980
+ FieldInfos *fis,
1981
+ HashSet *fields,
1982
+ bool store_tv,
1983
+ bool store_pos,
1984
+ bool store_offset)
1985
+ {
1986
+ int i;
1987
+ char *field;
1988
+ for (i = 0; i < fields->size; i++) {
1989
+ field = (char *)fields->elems[i];
1990
+ fis_add(fis, field, true, store_tv, store_pos, store_offset,
1991
+ !ir->has_norms(ir, field));
1992
+ }
1993
+ hs_destroy(fields);
1994
+ }
1995
+
1996
/* Merge field metadata and stored field values from all added readers
 * into the new segment.
 *
 * Pass 1 builds the merged FieldInfos: each reader's field names are
 * folded in from most- to least-specific term-vector flavour (position
 * +offset, position, offset, plain, indexed-only, unindexed), then the
 * result is written as the segment's .fnm file.
 *
 * Pass 2 copies every non-deleted document's stored fields through a
 * FieldsWriter.  Returns the number of documents written (the merged
 * segment's doc count). */
int sm_merge_fields(SegmentMerger *sm)
{
  int i, j, maxdoc;
  FieldInfos *fis = sm->fis = fis_create();
  int doc_count = 0;
  Document *doc;
  for (i = 0; i < sm->readers->size; i++) {
    IndexReader *ir = sm->readers->elems[i];

    sm_add_indexed(ir, fis,
        ir->get_field_names(ir, IR_TERM_VECTOR_WITH_POSITION_OFFSET),
        true, true, true);
    sm_add_indexed(ir, fis,
        ir->get_field_names(ir, IR_TERM_VECTOR_WITH_POSITION),
        true, true, false);
    sm_add_indexed(ir, fis,
        ir->get_field_names(ir, IR_TERM_VECTOR_WITH_OFFSET),
        true, false, true);
    sm_add_indexed(ir, fis, ir->get_field_names(ir, IR_TERM_VECTOR),
        true, false, false);
    sm_add_indexed(ir, fis, ir->get_field_names(ir, IR_INDEXED),
        false, false, false);
    fis_add_fields(fis, ir->get_field_names(ir, IR_UNINDEXED),
        false, false, false, false, false);
  }
  fis_write(fis, sm->store, sm->name, ".fnm");

  // merge field values
  FieldsWriter *fw = fw_open(sm->store, sm->name, fis);

  for (i = 0; i < sm->readers->size; i++) {
    IndexReader *ir = sm->readers->elems[i];
    maxdoc = ir->max_doc(ir);
    for (j = 0; j < maxdoc; j++) {
      if (!ir->is_deleted(ir, j)) { // skip deleted docs
        doc = ir->get_doc(ir, j);
        fw_add_doc(fw, doc);
        doc_destroy(doc); // fw_add_doc copies; we own the fetched doc
        doc_count++;
      }
    }
  }
  fw_close(fw);
  return doc_count;
}
2041
+
2042
+ void sm_reset_skip(SegmentMerger *sm)
2043
+ {
2044
+ ramo_reset(sm->skip_buffer);
2045
+ sm->last_skip_doc = 0;
2046
+ sm->last_skip_freq_pointer = os_pos(sm->freq_out);
2047
+ sm->last_skip_prox_pointer = os_pos(sm->prox_out);
2048
+ }
2049
+
2050
/* Append one skip entry for @doc to the in-memory skip buffer: the
 * doc-number delta and the freq/prox file-position deltas since the
 * previous skip point, all as vints.  Flushed to the freq file by
 * sm_write_skip.
 * NOTE(review): plain `inline` (not `static inline`) on a definition
 * has tricky linkage semantics in C99 vs gnu89 -- presumably this
 * builds because the compiler always inlines or a matching extern
 * declaration exists elsewhere; confirm against the build flags. */
inline void sm_buffer_skip(SegmentMerger *sm, int doc)
{
  int freq_pointer = os_pos(sm->freq_out);
  int prox_pointer = os_pos(sm->prox_out);

  os_write_vint(sm->skip_buffer, doc - sm->last_skip_doc);
  os_write_vint(sm->skip_buffer, freq_pointer - sm->last_skip_freq_pointer);
  os_write_vint(sm->skip_buffer, prox_pointer - sm->last_skip_prox_pointer);

  sm->last_skip_doc = doc;
  sm->last_skip_freq_pointer = freq_pointer;
  sm->last_skip_prox_pointer = prox_pointer;
}
2063
+
2064
+ int sm_append_postings(SegmentMerger *sm, SegmentMergeInfo **smis, int cnt)
2065
+ {
2066
+ int i, j;
2067
+ int last_doc = 0, base, doc, doc_code, freq, last_position, position;
2068
+ int *doc_map = NULL;
2069
+ int df = 0; // number of docs w/ term
2070
+ TermDocEnum *postings;
2071
+ SegmentMergeInfo *smi;
2072
+ sm_reset_skip(sm);
2073
+ for (i = 0; i < cnt; i++) {
2074
+ smi = smis[i];
2075
+ postings = smi->postings;
2076
+ base = smi->base;
2077
+ doc_map = smi_load_doc_map(smi);
2078
+
2079
+ stde_seek_ti(postings, smi->te->ti_curr);
2080
+ while (postings->next(postings)) {
2081
+ doc = postings->doc_num(postings);
2082
+ if (doc_map != NULL)
2083
+ doc = doc_map[doc]; // work around deletions
2084
+ doc += base; // convert to merged space
2085
+
2086
+ if (doc < last_doc)
2087
+ eprintf(STATE_ERROR,
2088
+ "docs out of order curent doc = %ld and previous doc = %ld",
2089
+ doc, last_doc);
2090
+
2091
+ df++;
2092
+
2093
+ if ((df % sm->skip_interval) == 0)
2094
+ sm_buffer_skip(sm, last_doc);
2095
+
2096
+ doc_code = (doc - last_doc) << 1; // use low bit to flag freq=1
2097
+ last_doc = doc;
2098
+
2099
+ freq = postings->freq(postings);
2100
+ if (freq == 1) {
2101
+ os_write_vint(sm->freq_out, doc_code | 1); // write doc & freq=1
2102
+ } else {
2103
+ os_write_vint(sm->freq_out, doc_code); // write doc
2104
+ os_write_vint(sm->freq_out, freq); // write freqency in doc
2105
+ }
2106
+
2107
+
2108
+ last_position = 0; // write position deltas
2109
+ for (j = 0; j < freq; j++) {
2110
+ position = postings->next_position(postings);
2111
+ os_write_vint(sm->prox_out, position - last_position);
2112
+ last_position = position;
2113
+ }
2114
+ }
2115
+ }
2116
+ return df;
2117
+ }
2118
+
2119
+ int sm_write_skip(SegmentMerger *sm)
2120
+ {
2121
+ int skip_pointer = os_pos(sm->freq_out);
2122
+ ramo_write_to(sm->skip_buffer, sm->freq_out);
2123
+ return skip_pointer;
2124
+ }
2125
+
2126
/* Copy the term buffer @tb into the next slot of the merger's Term
 * ring buffer and return a pointer to it.  The ring (terms_buf_size =
 * index_interval + 2 slots, see sm_merge_terms) keeps recently
 * returned Terms alive long enough for the TermInfosWriter to compare
 * against the previous index entry.
 * NOTE(review): unbounded strcpy -- each slot's text is MAX_WORD_SIZE
 * bytes; assumes tb->text is always shorter than that. TODO confirm
 * the term-buffer invariant upstream. */
Term *sm_tb_to_term(SegmentMerger *sm, TermBuffer *tb)
{
  int index = sm->terms_buf_pointer % sm->terms_buf_size;
  sm->terms_buf_pointer++;
  sm->terms_buf[index].field = tb->field;
  strcpy(sm->terms_buf[index].text, tb->text);
  return &(sm->terms_buf[index]);
}
2134
+
2135
+ void sm_merge_term_info(SegmentMerger *sm, SegmentMergeInfo **smis, int cnt)
2136
+ {
2137
+ int freq_pointer = os_pos(sm->freq_out);
2138
+ int prox_pointer = os_pos(sm->prox_out);
2139
+
2140
+ int df = sm_append_postings(sm, smis, cnt); // append posting data
2141
+
2142
+ int skip_pointer = sm_write_skip(sm);
2143
+
2144
+ if (df > 0) {
2145
+ // add an entry to the dictionary with pointers to prox and freq files
2146
+ ti_set(sm->ti, df, freq_pointer, prox_pointer, (skip_pointer - freq_pointer));
2147
+ tiw_add(sm->tiw, sm_tb_to_term(sm, smis[0]->tb), sm->ti);
2148
+ }
2149
+ }
2150
+
2151
/* K-way merge of all segments' term dictionaries.
 *
 * Each reader contributes a SegmentMergeInfo holding its term enum and
 * its doc-number base (the running sum of preceding segments' live doc
 * counts).  The infos sit in a priority queue ordered by smi_lt
 * (term, then base); on each round all infos whose current term equals
 * the queue head are popped, their postings merged via
 * sm_merge_term_info, and each is re-pushed after advancing -- or
 * destroyed once its enum is exhausted. */
void sm_merge_term_infos(SegmentMerger *sm)
{
  int base = 0;
  int i, match_size;
  IndexReader *ir;
  TermEnum *te;
  SegmentMergeInfo *smi, *top;
  TermBuffer *tb;

  for (i = 0; i < sm->readers->size; i++) {
    ir = sm->readers->elems[i];
    te = ir->terms(ir);
    smi = smi_create(base, te, ir);
    base += ir->num_docs(ir); // live docs only: deletions are squeezed out
    if (smi_next(smi) != NULL)
      pq_push(sm->queue, smi); // initialize queue
    else
      smi_destroy(smi); // segment had no terms
  }

  SegmentMergeInfo **match = ALLOC_N(SegmentMergeInfo *, sm->readers->size);

  while (sm->queue->count > 0) {
    match_size = 0; // pop matching terms
    match[match_size] = pq_pop(sm->queue);
    match_size++;
    tb = match[0]->tb;
    top = pq_top(sm->queue);
    // gather every other segment positioned on the same term
    while ((top != NULL) && (tb_cmp(tb, top->tb) == 0)) {
      match[match_size] = pq_pop(sm->queue);
      match_size++;
      top = pq_top(sm->queue);
    }

    sm_merge_term_info(sm, match, match_size); // add new TermInfo

    while (match_size > 0) {
      match_size--;
      smi = match[match_size];
      if (smi_next(smi) != NULL)
        pq_push(sm->queue, smi); // restore queue
      else
        smi_destroy(smi); // done with a segment
    }
  }
  free(match);
}
2203
+
2204
/* Merge the term dictionaries and postings of all added readers into
 * the new segment's .frq and .prx files and its term-infos index.
 * Creates the output streams, the TermInfosWriter, the Term ring
 * buffer and the merge queue, runs sm_merge_term_infos, then releases
 * everything via sm_close. */
void sm_merge_terms(SegmentMerger *sm)
{
  int i;
  char fname[SEGMENT_NAME_MAX_LENGTH];
  sprintf(fname, "%s.frq", sm->name);
  sm->freq_out = sm->store->create_output(sm->store, fname);
  sprintf(fname, "%s.prx", sm->name);
  sm->prox_out = sm->store->create_output(sm->store, fname);
  sm->tiw = tiw_open(sm->store, sm->name, sm->fis, sm->term_index_interval);
  // terms_buf_pointer holds a buffer of terms since the TermInfosWriter needs
  // to keep the last index_interval terms so that it can compare the last term
  // put in the index with the next one. So the size of the buffer must by
  // index_interval + 2.
  sm->terms_buf_pointer = 0;
  sm->terms_buf_size = sm->tiw->index_interval + 2;
  sm->terms_buf = ALLOC_N(Term, sm->terms_buf_size);
  for (i = 0; i < sm->terms_buf_size; i++) {
    sm->terms_buf[i].field = NULL;
    sm->terms_buf[i].text = ALLOC_N(char, MAX_WORD_SIZE);
  }
  sm->skip_interval = sm->tiw->skip_interval; // replaces the -1 placeholder
  sm->queue = pq_create(sm->readers->size, &smi_lt);

  sm_merge_term_infos(sm);

  sm_close(sm);
}
2231
+
2232
/* Merge per-document norms.  For every indexed field that keeps norms,
 * writes a .f<field_number> file containing each reader's norms in
 * order, skipping deleted documents so the bytes line up with the
 * merged doc numbering. */
void sm_merge_norms(SegmentMerger *sm)
{
  int i, j, k, max_doc;
  uchar *norm_buf;
  FieldInfo *fi;
  OutStream *os;
  char fname[SEGMENT_NAME_MAX_LENGTH];
  IndexReader *ir;
  for (i = 0; i < sm->fis->fcnt; i++) {
    fi = sm->fis->by_number[i];
    if (fi->is_indexed && !fi->omit_norms) {
      sprintf(fname, "%s.f%d", sm->name, i);
      os = sm->store->create_output(sm->store, fname);
      for (j = 0; j < sm->readers->size; j++) {
        ir = sm->readers->elems[j];
        max_doc = ir->max_doc(ir);
        norm_buf = ALLOC_N(uchar, max_doc);
        // zero first so fields missing from this reader norm to 0
        memset(norm_buf, 0, sizeof(uchar) * max_doc);
        ir->get_norms_into(ir, fi->name, norm_buf, 0);
        for (k = 0; k < max_doc; k++) {
          if (!ir->is_deleted(ir, k)) {
            os_write_byte(os, norm_buf[k]);
          }
        }
        free(norm_buf);
      }
      os_close(os);
    }
  }
}
2262
+
2263
+ void sm_merge_vectors(SegmentMerger *sm)
2264
+ {
2265
+ int i, j, max_doc;
2266
+ TermVectorsWriter *tvw = tvw_open(sm->store, sm->name, sm->fis);
2267
+ IndexReader *ir;
2268
+ Array *tvs;
2269
+ for (i = 0; i < sm->readers->size; i++) {
2270
+ ir = sm->readers->elems[i];
2271
+ max_doc = ir->max_doc(ir);
2272
+ for (j = 0; j < max_doc; j++) {
2273
+ // skip deleted docs
2274
+ if (! ir->is_deleted(ir, j)) {
2275
+ tvs = ir->get_term_vectors(ir, j);
2276
+ tvw_add_all_doc_vectors(tvw, tvs);
2277
+ ary_destroy(tvs);
2278
+ }
2279
+ }
2280
+ }
2281
+ tvw_close(tvw);
2282
+ }
2283
+
2284
+ int sm_merge(SegmentMerger *sm)
2285
+ {
2286
+ int doc_count = sm_merge_fields(sm);
2287
+ sm_merge_terms(sm);
2288
+ sm_merge_norms(sm);
2289
+ if (fis_has_vectors(sm->fis))
2290
+ sm_merge_vectors(sm);
2291
+ return doc_count;
2292
+ }
2293
+
2294
/* Pack the merged segment's files into a single compound file
 * @file_name.  Collects the standard extensions, per-field norm files
 * and (when present) vector files, feeds them all to a CompoundWriter,
 * and returns the list of packed file names (caller owns the Array;
 * its elements are freed by the array's efree destructor). */
Array *sm_create_compound_file(SegmentMerger *sm, char *file_name)
{
  Array *files = ary_create(0, &efree);
  CompoundWriter *cw = open_cw(sm->store, file_name);
  FieldInfo *fi;
  char fname[SEGMENT_NAME_MAX_LENGTH];

  int i;
  for (i = 0; i < NELEMS(COMPOUND_EXTENSIONS); i++) {
    sprintf(fname, "%s.%s", sm->name, COMPOUND_EXTENSIONS[i]);
    ary_append(files, estrdup(fname));
  }

  // Field norm files
  for (i = 0; i < sm->fis->fcnt; i++) {
    fi = sm->fis->by_number[i];
    if (fi->is_indexed && !fi->omit_norms) {
      sprintf(fname, "%s.f%d", sm->name, i);
      ary_append(files, estrdup(fname));
    }
  }

  // Vector files
  if (fis_has_vectors(sm->fis)) {
    for (i = 0; i < NELEMS(VECTOR_EXTENSIONS); i++) {
      sprintf(fname, "%s.%s", sm->name, VECTOR_EXTENSIONS[i]);
      ary_append(files, estrdup(fname));
    }
  }

  // Now merge all added files
  for (i = 0; i < files->size; i++) {
    cw_add_file(cw, (char *)files->elems[i]);
  }

  // Perform the merge
  cw_close(cw);

  return files;
}
2334
+
2335
+ /****************************************************************************
2336
+ *
2337
+ * IndexReader
2338
+ *
2339
+ ****************************************************************************/
2340
+
2341
+ void ir_acquire_not_necessary(IndexReader *ir) {}
2342
/* acquire_write_lock implementation for owner readers.  Raises if the
 * reader is already known stale; otherwise obtains the index write
 * lock (idempotent: a held lock is kept).  After locking, re-checks
 * the on-disk segments version -- if the index changed since this
 * reader was opened, the lock is released and the reader is marked
 * stale, making it invalid for delete/undelete/set_norm. */
void ir_acquire_write_lock(IndexReader *ir)
{
  if (ir->is_stale)
    eprintf(STATE_ERROR, "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations");

  if (ir->write_lock == NULL) {
    ir->write_lock = ir->store->open_lock(ir->store, WRITE_LOCK_NAME);
    if (!ir->write_lock->obtain(ir->write_lock)) // obtain write lock
      eprintf(STATE_ERROR, "Index locked for write: %s", WRITE_LOCK_NAME);

    // we have to check whether index has changed since this reader was opened.
    // if so, this reader is no longer valid for deletion
    if (sis_read_current_version(ir->store) > ir->sis->version) {
      ir->is_stale = true;
      ir->write_lock->release(ir->write_lock);
      ir->store->close_lock(ir->write_lock);
      ir->write_lock = NULL;
      eprintf(STATE_ERROR, "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations");
    }
  }
}
2363
+
2364
+ IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner, int close_store)
2365
+ {
2366
+ IndexReader *ir = ALLOC(IndexReader);
2367
+
2368
+ mutex_init(&ir->mutex, NULL);
2369
+ ir->is_owner = is_owner;
2370
+ if (is_owner) {
2371
+ ir->acquire_write_lock = &ir_acquire_write_lock;
2372
+ } else {
2373
+ ir->acquire_write_lock = &ir_acquire_not_necessary;
2374
+ }
2375
+
2376
+ ir->store = store;
2377
+ ir->close_store = close_store;
2378
+ ir->sis = sis;
2379
+ ir->has_changes = false;
2380
+ ir->is_stale = false;
2381
+ ir->write_lock = NULL;
2382
+ ir->cache = NULL;
2383
+ ir->sort_cache = NULL;
2384
+ return ir;
2385
+ }
2386
+
2387
/* Open an IndexReader over the index in @store.  Reads the segments
 * file under the store mutex; a single-segment index yields one owning
 * SegmentReader, while a multi-segment index yields non-owning segment
 * readers wrapped in an owning MultiReader.  The returned reader owns
 * the SegmentInfos.  @close_store: close @store when the reader is
 * closed. */
IndexReader *ir_open(Store *store, int close_store)
{
  int i;
  IndexReader *ir;
  SegmentInfos *sis;

  mutex_lock(&store->mutex);
  sis = sis_create();
  sis_read(sis, store);
  if (sis->scnt == 1) {
    ir = sr_open(sis, 0, true, close_store);
  } else {
    IndexReader **readers = ALLOC_N(IndexReader *, sis->scnt);
    for (i = 0; i < sis->scnt; i++) {
      // sub-readers are not owners and never close the shared store
      readers[i] = sr_open(sis, i, false, false);
    }
    ir = mr_open(store, sis, readers, sis->scnt, close_store);
  }
  mutex_unlock(&store->mutex);
  return ir;
}
2408
+
2409
+ bool ir_index_exists(Store *store)
2410
+ {
2411
+ return store->exists(store, "segments");
2412
+ }
2413
+
2414
+ void ir_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
2415
+ {
2416
+ mutex_lock(&ir->mutex);
2417
+ ir->acquire_write_lock(ir);
2418
+ ir->do_set_norm(ir, doc_num, field, val);
2419
+ ir->has_changes = true;
2420
+ mutex_unlock(&ir->mutex);
2421
+ }
2422
+
2423
+ void ir_undelete_all(IndexReader *ir)
2424
+ {
2425
+ mutex_lock(&ir->mutex);
2426
+ ir->acquire_write_lock(ir);
2427
+ ir->do_undelete_all(ir);
2428
+ ir->has_changes = true;
2429
+ mutex_unlock(&ir->mutex);
2430
+ }
2431
+
2432
+ void ir_delete_doc(IndexReader *ir, int doc_num)
2433
+ {
2434
+ mutex_lock(&ir->mutex);
2435
+ ir->acquire_write_lock(ir);
2436
+ ir->do_delete_doc(ir, doc_num);
2437
+ ir->has_changes = true;
2438
+ mutex_unlock(&ir->mutex);
2439
+ }
2440
+
2441
+ Document *ir_get_doc_with_term(IndexReader *ir, Term *term)
2442
+ {
2443
+ TermDocEnum *tde = ir_term_docs_for(ir, term);
2444
+ if (!tde) return NULL;
2445
+
2446
+ Document *doc = NULL;
2447
+ if (tde->next(tde))
2448
+ doc = ir->get_doc(ir, tde->doc_num(tde));
2449
+ tde->close(tde);
2450
+ return doc;
2451
+ }
2452
+
2453
+ TermDocEnum *ir_term_docs_for(IndexReader *ir, Term *term)
2454
+ {
2455
+ TermDocEnum *tde = ir->term_docs(ir);
2456
+ tde->seek(tde, term);
2457
+ return tde;
2458
+ }
2459
+
2460
+ TermDocEnum *ir_term_positions_for(IndexReader *ir, Term *term)
2461
+ {
2462
+ TermDocEnum *tde = ir->term_positions(ir);
2463
+ tde->seek(tde, term);
2464
+ return tde;
2465
+ }
2466
+
2467
/* Commit pending changes without taking the reader mutex (callers hold
 * it).  No-op when nothing changed.  Owner readers take the store
 * mutex and the commit lock, run the concrete do_commit, rewrite the
 * segments file, then release the commit lock and any held write lock.
 * Non-owner readers just run do_commit. */
void ir_commit_internal(IndexReader *ir)
{
  if (ir->has_changes) {
    if (ir->is_owner) {

      mutex_lock(&ir->store->mutex);
      Lock *commit_lock = ir->store->open_lock(ir->store, COMMIT_LOCK_NAME);
      if (!commit_lock->obtain(commit_lock)) // obtain commit lock
        eprintf(STATE_ERROR, "Index locked for commit: %s", COMMIT_LOCK_NAME);

      ir->do_commit(ir);
      sis_write(ir->sis, ir->store);

      commit_lock->release(commit_lock);
      ir->store->close_lock(commit_lock);
      mutex_unlock(&ir->store->mutex);

      if (ir->write_lock != NULL) {
        ir->write_lock->release(ir->write_lock); // release write lock
        ir->store->close_lock(ir->write_lock);
        ir->write_lock = NULL;
      }
    } else {
      ir->do_commit(ir);
    }
    ir->has_changes = false;
  }
}
2495
+
2496
+ void ir_commit(IndexReader *ir)
2497
+ {
2498
+ mutex_lock(&ir->mutex);
2499
+ ir_commit_internal(ir);
2500
+ mutex_unlock(&ir->mutex);
2501
+ }
2502
+
2503
+ void ir_close(IndexReader *ir)
2504
+ {
2505
+ mutex_lock(&ir->mutex);
2506
+ ir_commit_internal(ir);
2507
+ ir->do_close(ir);
2508
+ if (ir->close_store) {
2509
+ ir->store->close(ir->store);
2510
+ }
2511
+ if (ir->is_owner) {
2512
+ sis_destroy(ir->sis);
2513
+ }
2514
+ if (ir->cache) {
2515
+ h_destroy(ir->cache);
2516
+ }
2517
+ if (ir->sort_cache) {
2518
+ h_destroy(ir->sort_cache);
2519
+ }
2520
+
2521
+ mutex_destroy(&ir->mutex);
2522
+ free(ir);
2523
+ }
2524
+
2525
+ void ir_destroy(void *p)
2526
+ {
2527
+ IndexReader *ir = (IndexReader *)p;
2528
+ ir_close(ir);
2529
+ }
2530
+
2531
+ /**
2532
+ * Don't call this method if the cache already exists
2533
+ **/
2534
+ void ir_add_cache(IndexReader *ir)
2535
+ {
2536
+ ir->cache = co_hsh_create();
2537
+ }
2538
+
2539
+ bool ir_is_latest(IndexReader *ir)
2540
+ {
2541
+ return sis_read_current_version(ir->store) == ir->sis->version;
2542
+ }
2543
+