ferret 0.3.2 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/index_rw.c ADDED
@@ -0,0 +1,2543 @@
1
#include <index.h>

#include <limits.h> /* INT_MAX (used by the default config below) */
#include <stdint.h> /* uintptr_t */
#include <stdlib.h>
#include <string.h>

#include <array.h>
5
+
6
+ const char *INDEX_EXTENSIONS[] = {
7
+ "cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
8
+ "tvx", "tvd", "tvf", "tvp"
9
+ };
10
+
11
+ const char *COMPOUND_EXTENSIONS[] = {
12
+ "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
13
+ };
14
+
15
+ const char *VECTOR_EXTENSIONS[] = {
16
+ "tvx", "tvd", "tvf"
17
+ };
18
+
19
+ FerretConfig config = {
20
+ 10, // default merge_factor
21
+ 10, // default min_merge_docs
22
+ INT_MAX, // default max_merge_docs
23
+ 10000, // default max_field_length
24
+ 128 // default term_index_interval
25
+ };
26
+
27
+ /***************************************************************************
28
+ *
29
+ * CacheObject
30
+ *
31
+ ***************************************************************************/
32
+
33
/* Hash a CacheObject key by its pointer identity.
 * The previous direct cast of the pointer to unsigned int discarded the
 * high bits on LP64 platforms and drew implementation-defined behavior;
 * converting through uintptr_t makes the truncation explicit and
 * well-defined. */
unsigned int co_hash(const void *key)
{
  return (unsigned int)(uintptr_t)key;
}
37
+
38
/* CacheObject keys compare by raw pointer identity; returns 1/0. */
int co_eq(const void *key1, const void *key2)
{
  return (key1 == key2) ? 1 : 0;
}
42
+
43
+ void co_destroy(void *p)
44
+ {
45
+ CacheObject *co = (CacheObject *)p;
46
+ h_rem(co->ref_tab1, co->ref2, false);
47
+ h_rem(co->ref_tab2, co->ref1, false);
48
+ co->destroy(co->obj);
49
+ free(co);
50
+ }
51
+
52
+ CacheObject *co_create(HshTable *ref_tab1, HshTable *ref_tab2,
53
+ void *ref1, void *ref2, void (*destroy)(void *p), void *obj)
54
+ {
55
+ CacheObject *co = ALLOC(CacheObject);
56
+ h_set(ref_tab1, ref2, co);
57
+ h_set(ref_tab2, ref1, co);
58
+ co->ref_tab1 = ref_tab1;
59
+ co->ref_tab2 = ref_tab2;
60
+ co->ref1 = ref1;
61
+ co->ref2 = ref2;
62
+ co->destroy = destroy;
63
+ co->obj = obj;
64
+ return co;
65
+ }
66
+
67
+ HshTable *co_hsh_create()
68
+ {
69
+ return h_new(&co_hash, &co_eq, NULL, &co_destroy);
70
+ }
71
+
72
+ /***************************************************************************
73
+ *
74
+ * Posting
75
+ *
76
+ ***************************************************************************/
77
+
78
+ Posting *p_create(Term *term, int position, TVOffsetInfo *offset)
79
+ {
80
+ Posting *p = ALLOC(Posting);
81
+ p->freq = 1;
82
+ p->size = 1;
83
+ p->term = term;
84
+ p->positions = ALLOC(int);
85
+ p->positions[0] = position;
86
+ p->offsets = ALLOC(TVOffsetInfo *);
87
+ p->offsets[0] = offset;
88
+ return p;
89
+ }
90
+
91
+ void p_destroy(void *p)
92
+ {
93
+ // the positions and offsets will be put in a TVTerm so no need to free
94
+ int i;
95
+ Posting *post = (Posting *)p;
96
+ free(post->positions);
97
+ for (i = 0; i < post->freq; i++)
98
+ tvoi_destroy(post->offsets[i]);
99
+ free(post->offsets);
100
+ free(p);
101
+ }
102
+
103
+ void p_add_occurance(Posting *p, int position, TVOffsetInfo *offset)
104
+ {
105
+ if (p->freq >= p->size) {
106
+ p->size *= 2;
107
+ REALLOC_N(p->positions, int, p->size);
108
+ REALLOC_N(p->offsets, TVOffsetInfo *, p->size);
109
+ }
110
+ p->positions[p->freq] = position;
111
+ p->offsets[p->freq] = offset;
112
+ p->freq++;
113
+ }
114
+
115
+ inline int p_cmp(const void *const p1, const void *const p2)
116
+ {
117
+ Term *t1 = (*(Posting **)p1)->term;
118
+ Term *t2 = (*(Posting **)p2)->term;
119
+ int res = strcmp(t1->field, t2->field);
120
+ if (res != 0) {
121
+ return res;
122
+ } else {
123
+ return strcmp(t1->text, t2->text);
124
+ }
125
+ }
126
+
127
+ DocumentWriter *dw_open(Store *store,
128
+ Analyzer *analyzer,
129
+ Similarity *similarity,
130
+ int max_field_length,
131
+ int term_index_interval)
132
+ {
133
+ DocumentWriter *dw = ALLOC(DocumentWriter);
134
+ dw->store = store;
135
+ dw->analyzer = analyzer;
136
+ dw->similarity = similarity;
137
+ dw->fis = NULL;
138
+ dw->postingtable = h_new(&term_hash, &term_eq, &term_destroy, &p_destroy);
139
+ dw->max_field_length = max_field_length;
140
+ dw->term_index_interval = term_index_interval;
141
+ return dw;
142
+ }
143
+
144
+ void dw_close(DocumentWriter *dw)
145
+ {
146
+ if (dw->fis) fis_destroy(dw->fis);
147
+ h_destroy(dw->postingtable);
148
+ free(dw);
149
+ }
150
+
151
+ void dw_add_position(DocumentWriter *dw, char *field, char *text,
152
+ int position, TVOffsetInfo *offset)
153
+ {
154
+ Term termbuf = {field, text}, *term;
155
+ Posting *p = (Posting *)h_get(dw->postingtable, &termbuf);
156
+
157
+ if (p) { // word seen before
158
+ // double the size of posting to make room for more posts.
159
+ if (p->freq >= p->size) {
160
+ p->size <<= 1;
161
+ REALLOC_N(p->positions, int, p->size);
162
+ p->offsets = REALLOC_N(p->offsets, TVOffsetInfo *, p->size);
163
+ }
164
+ p->positions[p->freq] = position; // add new position
165
+ p->offsets[p->freq] = offset; // add new position
166
+ p->freq++; // update frequency
167
+ } else { // word not seen before
168
+ term = term_create(field, text);
169
+ h_set(dw->postingtable, term, p_create(term, position, offset));
170
+ }
171
+ }
172
+
173
/* Walk every field of +doc+ and feed each term occurrence into the
 * posting table, maintaining the per-field running totals (token count,
 * character offset, token position, boost) in the dw->field_* arrays,
 * which are indexed by field number. Assumes dw->fis and the
 * dw->field_* arrays were prepared by dw_add_doc. */
void dw_invert_doc(DocumentWriter *dw, Document *doc)
{
  int i;
  int dfcnt = doc->dfcnt;
  char *field_name, *text;
  int field_number, length, position, offset, slen;
  TokenStream *stream;
  Token *token;
  FieldInfo *fi;

  DocField **fields = doc->df_arr, *field;
  for (i = 0; i < dfcnt; i++) {
    field = fields[i];
    field_name = field->name;
    fi = ((FieldInfo *)ht_get(dw->fis->by_name, field_name));
    field_number = fi->number;

    // resume from the running totals so multi-valued fields accumulate
    length = dw->field_lengths[field_number];
    offset = dw->field_offsets[field_number];
    position = dw->field_positions[field_number];

    if (fi->is_indexed) {
      if (!field->is_tokenized) {// un-tokenized field: index the raw value as one token
        text = field->data;
        slen = strlen(text);
        if (fi->store_offset) {
          dw_add_position(dw, field_name, text, position,
              tvoi_create(offset, offset+slen));
        } else {
          dw_add_position(dw, field_name, text, position, NULL);
        }
        offset += slen;
        length++;
      } else {

        // Tokenize field and add to posting_table
        stream = a_get_ts(dw->analyzer, field_name, field->data);

        while ((token = ts_next(stream)) != NULL) {
          // pos_inc > 1 leaves position gaps (e.g. removed stop words);
          // pos_inc == 0 stacks this token on the previous position
          position += (token->pos_inc - 1);

          if (fi->store_offset) {
            dw_add_position(dw,
                field_name,
                token->text,
                position,
                tvoi_create(offset + token->start, offset + token->end));
            position++;
          } else {
            dw_add_position(dw, field_name, token->text, position, NULL);
            position++;
          }

          length++;
          // stop if we reach the max field length
          if (length > dw->max_field_length)
            break;
        }

        // token is the last token seen (NULL when the stream ran dry);
        // advance the character offset past the text consumed
        if (token)
          offset += token->end + 1;
      }
      dw->field_lengths[field_number] = length;
      dw->field_offsets[field_number] = offset;
      dw->field_positions[field_number] = position;
      dw->field_boosts[field_number] *= field->boost;
    }
  }
}
242
+
243
+ Posting **dw_sort_posting_table(DocumentWriter *dw)
244
+ {
245
+ HshTable *ht = dw->postingtable;
246
+ int i;
247
+ dw->pcnt = i = ht->used;
248
+ Posting **postings = ALLOC_N(Posting *, i);
249
+ HshEntry *he = ht->table;
250
+ while (i > 0) {
251
+ if (he->value != NULL) {
252
+ i--;
253
+ postings[i] = (Posting *)he->value;
254
+ }
255
+ he++;
256
+ }
257
+ qsort(postings, dw->pcnt, sizeof(Posting *), &p_cmp);
258
+ return postings;
259
+ }
260
+
261
/* Write the sorted +postings+ of a single-document segment to the
 * .frq/.prx files and the term dictionary, emitting term vectors for
 * the fields that request them. Every posting belongs to document 0 of
 * +segment+, which is why the doc number written below is constant. */
void dw_write_postings(DocumentWriter *dw, Posting **postings, char *segment)
{
  OutStream *freq_out, *prox_out;
  TermInfosWriter *tiw;
  TermVectorsWriter *tvw = NULL;
  FieldInfo *fi;
  Store *store = dw->store;
  TermInfo *ti;
  Posting *posting;
  int i, j, posting_freq, position, last_position;
  char fname[SEGMENT_NAME_MAX_LENGTH], *curr_field = NULL, *term_field;
  strcpy(fname, segment);

  //open files for inverse index storage
  sprintf(fname, "%s.frq", segment);
  freq_out = store->create_output(store, fname);
  sprintf(fname, "%s.prx", segment);
  prox_out = store->create_output(store, fname);
  tiw = tiw_open(store, segment, dw->fis, dw->term_index_interval);
  ti = ti_create(0, 0, 0, 0);

  for (i = 0; i < dw->pcnt; i++) {
    posting = postings[i];

    // add an entry to the dictionary with pointers to prox and freq_out files
    ti_set(ti, 1, os_pos(freq_out), os_pos(prox_out), -1);
    tiw_add(tiw, posting->term, ti);

    // add an entry to the freq_out file
    posting_freq = posting->freq;
    if (posting_freq == 1) { // optimize freq=1
      os_write_vint(freq_out, 1); // set low bit of doc num.
    } else {
      os_write_vint(freq_out, 0); // the doc number
      os_write_vint(freq_out, posting_freq); // frequency in doc
    }

    last_position = 0; // write positions as deltas from the previous one

    for (j = 0; j < posting_freq; j++) {
      position = posting->positions[j];
      os_write_vint(prox_out, position - last_position);
      last_position = position;
    }

    // check to see if we switched to a new field
    // NOTE(review): this is a pointer comparison, which assumes terms in
    // the same field share the same field string -- confirm terms are
    // built with interned/shared field names.
    term_field = posting->term->field;
    if (curr_field != term_field) {
      // changing field - see if there is something to save
      curr_field = term_field;
      fi = (FieldInfo *)ht_get(dw->fis->by_name, curr_field);
      if (fi->store_tv) {
        if (tvw == NULL) {
          // lazily open the term-vectors writer on the first TV field
          tvw = tvw_open(store, segment, dw->fis);
          tvw_open_doc(tvw);
        }
        tvw_open_field(tvw, curr_field);

      } else if (tvw != NULL) {
        tvw_close_field(tvw);
      }
    }
    // tvw->curr_field != NULL implies field is still open
    if (tvw != NULL && tvw->curr_field != NULL) {
      tvw_add_term(tvw, posting->term->text, posting_freq, posting->positions, posting->offsets);
    }
  }
  if (tvw != NULL) {
    tvw_close_doc(tvw);
    tvw_close(tvw);
  }
  // make an effort to close all streams we can but remember and re-raise
  // the last exception encountered in this process
  os_close(freq_out);
  os_close(prox_out);
  tiw_close(tiw);
  ti_destroy(ti);
}
339
+
340
+ void dw_write_norms(DocumentWriter *dw, char *segment)
341
+ {
342
+ int i;
343
+ float norm;
344
+ OutStream *norms_out;
345
+ char fname[SEGMENT_NAME_MAX_LENGTH];
346
+ FieldInfos *fis = dw->fis;
347
+ FieldInfo *fi;
348
+
349
+ for (i = 0; i < fis->fcnt; i++) {
350
+ fi = fis->by_number[i];
351
+
352
+ if (fi->is_indexed && !fi->omit_norms) {
353
+ norm = dw->field_boosts[i] * sim_length_norm(dw->similarity, fi->name, dw->field_lengths[i]);
354
+ sprintf(fname, "%s.f%d", segment, i);
355
+ norms_out = dw->store->create_output(dw->store, fname);
356
+ os_write_byte(norms_out, sim_encode_norm(dw->similarity, norm));
357
+ os_close(norms_out);
358
+ }
359
+ }
360
+ }
361
+
362
/* Write +doc+ as the single document of the new segment +segment+:
 * field names (.fnm), stored field values, inverted postings and norms.
 * NOTE(review): dw->fis is overwritten without freeing any previous
 * value, so adding a second document through the same DocumentWriter
 * would leak the first FieldInfos -- looks like writers are one-shot
 * (see iw_add_doc); confirm. */
void dw_add_doc(DocumentWriter *dw, char *segment, Document *doc)
{
  int i;
  // write field names
  dw->fis = fis_create();
  fis_add_doc(dw->fis, doc);
  fis_write(dw->fis, dw->store, segment, ".fnm");

  // write field values
  FieldsWriter *fw = fw_open(dw->store, segment, dw->fis);
  fw_add_doc(fw, doc);
  fw_close(fw);

  // invert doc into posting_table
  h_clear(dw->postingtable); // clear posting_table

  // per-field running totals used by dw_invert_doc, indexed by field number
  dw->field_boosts = ALLOC_N(float, dw->fis->fcnt);
  dw->field_lengths = ALLOC_AND_ZERO_N(int, dw->fis->fcnt);
  dw->field_offsets = ALLOC_AND_ZERO_N(int, dw->fis->fcnt);
  dw->field_positions = ALLOC_AND_ZERO_N(int, dw->fis->fcnt);

  // every field's boost starts from the document-level boost
  for (i = 0; i < dw->fis->fcnt; i++)
    dw->field_boosts[i] = doc->boost;

  dw_invert_doc(dw, doc);

  // sort posting_table into an array
  Posting **postings = dw_sort_posting_table(dw);

  // write postings
  dw_write_postings(dw, postings, segment);
  free(postings);

  // write norms of indexed fields
  dw_write_norms(dw, segment);

  free(dw->field_boosts);
  free(dw->field_lengths);
  free(dw->field_offsets);
  free(dw->field_positions);
}
403
+
404
+ /****************************************************************************
405
+ *
406
+ * SegmentInfo
407
+ *
408
+ ****************************************************************************/
409
+
410
+ SegmentInfo *si_create(char *name, int doc_cnt, Store *store)
411
+ {
412
+ SegmentInfo *si = ALLOC(SegmentInfo);
413
+ si->name = name;
414
+ si->doc_cnt = doc_cnt;
415
+ si->store = store;
416
+ return si;
417
+ }
418
+
419
+ void si_destroy(void *p)
420
+ {
421
+ SegmentInfo *si = (SegmentInfo *)p;
422
+ free(si->name);
423
+ free(si);
424
+ }
425
+
426
+ bool si_has_deletions(SegmentInfo *si)
427
+ {
428
+ char del_file_name[SEGMENT_NAME_MAX_LENGTH];
429
+ sprintf(del_file_name, "%s.del", si->name);
430
+ return si->store->exists(si->store, del_file_name);
431
+ }
432
+
433
+ bool si_uses_compound_file(SegmentInfo *si)
434
+ {
435
+ char compound_file_name[SEGMENT_NAME_MAX_LENGTH];
436
+ sprintf(compound_file_name, "%s.cfs", si->name);
437
+ return si->store->exists(si->store, compound_file_name);
438
+ }
439
+
440
+ struct NormTester {
441
+ bool has_norm_file;
442
+ char *segment_name;
443
+ };
444
+ void is_norm_file(char *fname, void *arg)
445
+ {
446
+ struct NormTester *nt = (struct NormTester *)arg;
447
+ char norm_file_pattern[SEGMENT_NAME_MAX_LENGTH];
448
+ sprintf(norm_file_pattern, "%s.s", nt->segment_name);
449
+ if (strncmp(fname, norm_file_pattern, strlen(norm_file_pattern)) == 0) {
450
+ nt->has_norm_file = true;
451
+ }
452
+ }
453
+
454
+ bool si_has_separate_norms(SegmentInfo *si)
455
+ {
456
+ struct NormTester nt;
457
+ nt.segment_name = si->name;
458
+ nt.has_norm_file = false;
459
+ si->store->each(si->store, &is_norm_file, &nt);
460
+
461
+ return nt.has_norm_file;
462
+ }
463
+
464
+
465
+ /****************************************************************************
466
+ *
467
+ * SegmentInfos
468
+ *
469
+ ****************************************************************************/
470
+
471
#include <time.h>
// Current segments-file format tag. Negative so readers can tell it
// apart from the legacy format, whose first int was the counter.
#define FORMAT -1
#define SEGMENT_FILENAME "segments"
// New segments files are written here first, then renamed into place.
#define TEMPORARY_SEGMENT_FILENAME "segments.new"
475
+
476
+ SegmentInfos *sis_create()
477
+ {
478
+ SegmentInfos *sis = ALLOC(SegmentInfos);
479
+ sis->format = FORMAT;
480
+ sis->version = (unsigned int)time(NULL);
481
+ sis->scnt = 0;
482
+ sis->counter = 0;
483
+ sis->size = 4;
484
+ sis->segs = ALLOC_N(SegmentInfo *, sis->size);
485
+ return sis;
486
+ }
487
+
488
+ void sis_destroy_not_infos(void *p)
489
+ {
490
+ SegmentInfos *sis = (SegmentInfos *)p;
491
+ free(sis->segs);
492
+ free(p);
493
+ }
494
+
495
+ void sis_destroy(void *p)
496
+ {
497
+ int i;
498
+ SegmentInfos *sis = (SegmentInfos *)p;
499
+ for (i = 0; i < sis->scnt; i++)
500
+ si_destroy(sis->segs[i]);
501
+ free(sis->segs);
502
+ free(p);
503
+ }
504
+
505
+ void sis_add_si(SegmentInfos *sis, SegmentInfo *si)
506
+ {
507
+ if (sis->scnt >= sis->size) {
508
+ sis->size = sis->scnt * 2;
509
+ REALLOC_N(sis->segs, SegmentInfo *, sis->size);
510
+ }
511
+ sis->segs[sis->scnt] = si;
512
+ sis->scnt++;
513
+ }
514
+
515
+ void sis_del_at(SegmentInfos *sis, int at)
516
+ {
517
+ int i;
518
+ si_destroy(sis->segs[at]);
519
+ sis->scnt--;
520
+ for (i = at; i < sis->scnt; i++)
521
+ sis->segs[i] = sis->segs[i+1];
522
+ }
523
+
524
+ void sis_del_from_to(SegmentInfos *sis, int from, int to)
525
+ {
526
+ int i, num_to_del = to - from;
527
+ sis->scnt -= num_to_del;
528
+ for (i = from; i < to; i++) {
529
+ si_destroy(sis->segs[i]);
530
+ }
531
+ for (i = from; i < sis->scnt; i++) {
532
+ sis->segs[i] = sis->segs[i+num_to_del];
533
+ }
534
+ }
535
+
536
+ void sis_clear(SegmentInfos *sis)
537
+ {
538
+ int i;
539
+ for (i = 0; i < sis->scnt; i++) {
540
+ si_destroy(sis->segs[i]);
541
+ }
542
+ sis->scnt = 0;
543
+ }
544
+
545
/* Load segment infos from the "segments" file in +store+. Handles both
 * the current format (negative format tag followed by explicit
 * version/counter) and the legacy format (the first int is the counter,
 * and the version may trail the segment list).
 * NOTE(review): the "%ld" specifier looks wrong -- sis->format is read
 * via is_read_int(), so "%d" would match unless the struct field is
 * declared long in index.h; verify. */
void sis_read(SegmentInfos *sis, Store *store)
{
  int doc_cnt;
  char *name;
  InStream *is = store->open_input(store, SEGMENT_FILENAME);
  sis->format = is_read_int(is);

  if (sis->format < 0) { // file contains explicit format info
    // check that it is a format we can understand
    if (sis->format < FORMAT)
      eprintf(ERROR, "Unknown format version: %ld", sis->format);
    sis->version = is_read_long(is);
    sis->counter = is_read_int(is);
  } else { // file is in old format without explicit format info
    sis->counter = sis->format;
  }

  int seg_count = is_read_int(is);
  int i;
  for (i = 0; i < seg_count; i++) {
    // si_create takes ownership of the name string read here
    name = is_read_string(is);
    doc_cnt = is_read_int(is);
    sis_add_si(sis, si_create(name, doc_cnt, store));
  }

  if (sis->format >= 0) {
    // in old format the version number may be at the end of the file
    if (is_pos(is) >= is_length(is))
      sis->version = 0; // old file format without version number
    else
      sis->version = is_read_long(is); // read version
  }
  is_close(is);
}
579
+
580
+ void sis_write(SegmentInfos *sis, Store *store)
581
+ {
582
+ int i;
583
+ SegmentInfo *si;
584
+ OutStream *os = store->create_output(store, TEMPORARY_SEGMENT_FILENAME);
585
+ os_write_int(os, FORMAT);
586
+ os_write_long(os, ++(sis->version)); // every write changes the index
587
+ os_write_int(os, sis->counter);
588
+ os_write_int(os, sis->scnt);
589
+ for (i = 0; i < sis->scnt; i++) {
590
+ si = sis->segs[i];
591
+ os_write_string(os, si->name);
592
+ os_write_int(os, si->doc_cnt);
593
+ }
594
+
595
+ os_close(os);
596
+
597
+ //install new segment info
598
+ store->rename(store, TEMPORARY_SEGMENT_FILENAME, SEGMENT_FILENAME);
599
+ }
600
+
601
+ int sis_read_current_version(Store *store)
602
+ {
603
+ if (!store->exists(store, SEGMENT_FILENAME))
604
+ return 0;
605
+ InStream *is = store->open_input(store, SEGMENT_FILENAME);
606
+ int format = 0;
607
+ int version = 0;
608
+ format = is_read_int(is);
609
+ if (format < 0) {
610
+ if (format < FORMAT)
611
+ eprintf(ERROR, "Unknown format version: %ld", format);
612
+ version = is_read_long(is);
613
+ }
614
+ is_close(is);
615
+
616
+ if (format < 0)
617
+ return version;
618
+
619
+ // We cannot be sure about the format of the file.
620
+ // Therefore we have to read the whole file and cannot simply
621
+ // seek to the version entry.
622
+
623
+ SegmentInfos *sis = sis_create();
624
+ sis_read(sis, store);
625
+ version = sis->version;
626
+ sis_destroy(sis);
627
+ return version;
628
+ }
629
+
630
+ /****************************************************************************
631
+ *
632
+ * IndexWriter
633
+ *
634
+ ****************************************************************************/
635
+
636
/* Open an IndexWriter on +store+. When +create+ is set, the store is
 * wiped and a fresh empty segments file is committed; otherwise the
 * existing segment infos are read. The write lock is obtained here and
 * held for the writer's entire lifetime. close_store/close_analyzer
 * record whether the writer owns those objects and should close them. */
IndexWriter *iw_open(Store *store, Analyzer *analyzer,
    bool create, bool close_store, bool close_analyzer)
{
  IndexWriter *iw = ALLOC(IndexWriter);
  if (create)
    store->clear_all(store);
  mutex_init(&iw->mutex, NULL);
  // start from the global defaults; callers may tune these afterwards
  iw->merge_factor = config.merge_factor;
  iw->min_merge_docs = config.min_merge_docs;
  iw->max_merge_docs = config.max_merge_docs;
  iw->max_field_length = config.max_field_length;
  iw->term_index_interval = config.term_index_interval;
  iw->use_compound_file = true;
  iw->store = store;
  iw->close_store = close_store;
  iw->close_analyzer = close_analyzer;
  iw->analyzer = analyzer;
  iw->sis = sis_create();
  iw->similarity = sim_create_default();
  // new documents are buffered in RAM until merged out to +store+
  iw->ram_store = open_ram_store();

  mutex_lock(&store->mutex);
  // keep the write_lock obtained until the IndexWriter is closed.
  iw->write_lock = store->open_lock(store, WRITE_LOCK_NAME);
  if (!iw->write_lock->obtain(iw->write_lock)) {
    eprintf(STATE_ERROR,
        "Could not obtain write lock when trying to write index");
  }

  if (create) {
    Lock *commit_lock = store->open_lock(store, COMMIT_LOCK_NAME);
    if (!commit_lock->obtain(commit_lock)) {
      eprintf(STATE_ERROR,
          "Could not obtain commit lock when trying to commit index");
    }
    // commit the index
    store->clear(store);
    sis_write(iw->sis, store);
    //
    commit_lock->release(commit_lock);
    store->close_lock(commit_lock);
  } else {
    sis_read(iw->sis, store);
  }
  mutex_unlock(&store->mutex);
  return iw;
}
683
+
684
const char base36_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz";

/* Build a segment name of the form "_<counter in base 36>".
 * Returns a freshly allocated string the caller must free.
 * NOTE(review): assumes counter >= 0 -- a negative counter would index
 * base36_digitmap with a negative subscript. Callers pass the
 * monotonically increasing sis->counter, which starts at 0. */
char *new_segment_name(int counter)
{
  char buf[SEGMENT_NAME_MAX_LENGTH];
  buf[SEGMENT_NAME_MAX_LENGTH - 1] = '\0';
  int i;
  // emit base-36 digits right-to-left, just before the terminating NUL
  for (i = SEGMENT_NAME_MAX_LENGTH - 2; ; i--) {
    buf[i] = base36_digitmap[counter%36];
    counter /= 36;
    if (counter == 0) break;
  }
  i--;
  buf[i] = '_'; // leading underscore marks generated segment names
  return estrdup(&buf[i]);
}
700
+
701
+ int iw_doc_count(IndexWriter *iw)
702
+ {
703
+ int i, doc_cnt = 0;
704
+ mutex_lock(&iw->mutex);
705
+ for (i = 0; i < iw->sis->scnt; i++)
706
+ doc_cnt += iw->sis->segs[i]->doc_cnt;
707
+ mutex_unlock(&iw->mutex);
708
+ return doc_cnt;
709
+ }
710
+
711
+ void delete_files(Array *file_names, Store *store)
712
+ {
713
+ int i;
714
+ for (i = 0; i < file_names->size; i++) {
715
+ store->remove(store, (char *)file_names->elems[i]);
716
+ }
717
+ ary_destroy(file_names);
718
+ }
719
+
720
+
721
+ Array *sr_file_names(IndexReader *ir);
722
+ void iw_delete_segments(IndexWriter *iw, IndexReader **segment_readers, int del_cnt)
723
+ {
724
+ // The java version keeps a record of files that it couldn't delete. This
725
+ // shouldn't be a problem on linux I hope.
726
+ IndexReader *ir;
727
+ int i;
728
+ for (i = 0; i < del_cnt; i++) {
729
+ ir = segment_readers[i];
730
+ delete_files(sr_file_names(ir), ir->store);
731
+ }
732
+ }
733
+
734
/* Pack the merged segment's loose files into a compound "<name>.cfs"
 * file: build "<name>.tmp", then under the commit lock rename it into
 * place and delete the now-redundant per-extension files. */
void make_compound_file(IndexWriter *iw, char *merged_name, SegmentMerger *merger)
{
  char merged_tmp[SEGMENT_NAME_MAX_LENGTH], merged_cfs[SEGMENT_NAME_MAX_LENGTH];

  mutex_lock(&iw->store->mutex);
  sprintf(merged_tmp, "%s.tmp", merged_name);
  sprintf(merged_cfs, "%s.cfs", merged_name);

  // returns the list of loose files subsumed by the compound file
  Array *files_to_delete = sm_create_compound_file(merger, merged_tmp);
  Lock *commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);

  if (!commit_lock->obtain(commit_lock)) {
    eprintf(STATE_ERROR,
        "Could not obtain commit lock when trying to commit index");
  }

  // make compound file visible for SegmentReaders
  iw->store->rename(iw->store, merged_tmp, merged_cfs);
  // delete now unused files of segment
  delete_files(files_to_delete, iw->store);

  commit_lock->release(commit_lock);
  iw->store->close_lock(commit_lock);
  mutex_unlock(&iw->store->mutex);
}
759
+
760
/* Merge the segments in [min_segment, max_segment) into one new
 * segment. Readers over segments we own (our store or our RAM store)
 * are queued so their files can be deleted once the merged segment is
 * committed under the commit lock.
 * NOTE(review): the readers opened via sr_open do not appear to be
 * closed here -- verify that sm_destroy / iw_delete_segments take care
 * of them (the comment below suggests they should be closed first). */
void iw_merge_segments_with_max(IndexWriter *iw, int min_segment, int max_segment)
{
  int i;
  // VLA sized by the number of segments being merged
  IndexReader *segments_to_delete[max_segment - min_segment];
  int del_cnt = 0;

  char *merged_name = new_segment_name(iw->sis->counter++);

  SegmentMerger *merger = sm_create(iw->store, merged_name, iw->term_index_interval);
  IndexReader *reader;


  for (i = min_segment; i < max_segment; i++) {
    reader = sr_open(iw->sis, i, false, false);
    sm_add(merger, reader);
    if ((reader->store == iw->store) || // if we own the directory
        (reader->store == iw->ram_store)) {
      segments_to_delete[del_cnt++] = reader; // queue segment for deletion
    }
  }

  int merged_doc_count = sm_merge(merger);

  // replace the merged range with the single new segment
  sis_del_from_to(iw->sis, min_segment, max_segment);

  sis_add_si(iw->sis, si_create(merged_name, merged_doc_count, iw->store));

  // close readers before we attempt to delete now-obsolete segments

  mutex_lock(&iw->store->mutex);
  Lock *commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
  if (!commit_lock->obtain(commit_lock)) {
    eprintf(STATE_ERROR,
        "Could not obtain commit lock when trying to commit index");
  }
  // commit the index
  sis_write(iw->sis, iw->store);
  iw_delete_segments(iw, segments_to_delete, del_cnt);
  //
  commit_lock->release(commit_lock);
  iw->store->close_lock(commit_lock);
  mutex_unlock(&iw->store->mutex);

  if (iw->use_compound_file) {
    make_compound_file(iw, merged_name, merger);
  }

  sm_destroy(merger);
}
809
+
810
/* Merge every segment from +min_segment+ through the end of the list. */
void iw_merge_segments(IndexWriter *iw, int min_segment)
{
  iw_merge_segments_with_max(iw, min_segment, iw->sis->scnt);
}
814
+
815
+ void iw_maybe_merge_segments(IndexWriter *iw)
816
+ {
817
+ int target_merge_docs = iw->min_merge_docs;
818
+ int min_segment, merge_docs;
819
+ SegmentInfo *si;
820
+
821
+ while (target_merge_docs <= iw->max_merge_docs) {
822
+ // find segments smaller than current target size
823
+ min_segment = iw->sis->scnt - 1;
824
+ merge_docs = 0;
825
+ while (min_segment >= 0) {
826
+ si = iw->sis->segs[min_segment];
827
+ if (si->doc_cnt >= target_merge_docs)
828
+ break;
829
+ merge_docs += si->doc_cnt;
830
+ min_segment -= 1;
831
+ }
832
+
833
+ if (merge_docs >= target_merge_docs) // found a merge to do
834
+ iw_merge_segments(iw, min_segment + 1);
835
+ else
836
+ break;
837
+
838
+ target_merge_docs *= iw->merge_factor; // increase target size
839
+ }
840
+ }
841
+
842
/* Flush the trailing run of RAM-resident segments (plus, possibly, one
 * preceding on-disk segment when merging it in fits the merge factor)
 * down to the index's main store. */
void iw_flush_ram_segments(IndexWriter *iw)
{
  int min_segment = iw->sis->scnt-1;
  int doc_count = 0;
  SegmentInfo **segs = iw->sis->segs;
  // scan back over the trailing segments that still live in RAM
  while ((min_segment >= 0) &&
      (segs[min_segment]->store == iw->ram_store)) {
    doc_count += segs[min_segment]->doc_cnt;
    min_segment--;
  }
  /* the following if statement is actually incrementing for different
   * reasons. If min_segment < 0 then we must increment as we searched
   * off the end. If the top segment is not ram_store there are no
   * ram segments to flush so we increment so the next check will return
   * us from this function. Lastly, the min_segment stopped at a segment
   * that wasn't the ram segment. But if it fit's in with the merge
   * factor, why not merge it. Otherwise we leave it and increment min_seg
   */
  // NOTE(review): doc_count (documents) is compared against merge_factor
  // (a segment-count-style knob) -- looks intentional in the original
  // design but worth confirming.
  if (min_segment < 0 || // add one FS segment?
      (doc_count + segs[min_segment]->doc_cnt) > iw->merge_factor ||
      (segs[iw->sis->scnt-1]->store != iw->ram_store))
    min_segment++;
  if (min_segment >= iw->sis->scnt)
    return;
  iw_merge_segments(iw, min_segment);
}
868
+
869
/* Add a document to the index. The doc is first written as a single-document
 * segment in the writer's RAM store; the merge policy then decides whether
 * segments should be merged (and thereby flushed to disk). Thread-safe via
 * the writer mutex. */
void iw_add_doc(IndexWriter *iw, Document *doc)
{
  DocumentWriter *dw;
  char *segment_name;

  mutex_lock(&iw->mutex);
  dw = dw_open(iw->ram_store,
               iw->analyzer,
               iw->similarity,
               iw->max_field_length,
               iw->term_index_interval);
  segment_name = new_segment_name(iw->sis->counter++);
  dw_add_doc(dw, segment_name, doc);
  dw_close(dw);
  // register the new one-doc RAM segment, then let the merge policy run
  sis_add_si(iw->sis, si_create(segment_name, 1, iw->ram_store));
  iw_maybe_merge_segments(iw);
  mutex_unlock(&iw->mutex);
}
887
+
888
/* Merge the whole index down to a single, up-to-date segment. Keeps merging
 * while there is more than one segment, or the single remaining segment has
 * deletions, lives outside the primary store, or does not match the
 * compound-file settings. Caller must hold iw->mutex. */
static inline void iw_optimize_internal(IndexWriter *iw)
{
  int min_segment;
  iw_flush_ram_segments(iw);
  while (iw->sis->scnt > 1 ||
         (iw->sis->scnt == 1 &&
          ( si_has_deletions(iw->sis->segs[0]) ||
            (iw->sis->segs[0]->store != iw->store) ||
            (iw->use_compound_file &&
             (!si_uses_compound_file(iw->sis->segs[0]) ||
              si_has_separate_norms(iw->sis->segs[0])))))) {
    // merge at most merge_factor trailing segments per pass
    min_segment = iw->sis->scnt - iw->merge_factor;
    iw_merge_segments(iw, min_segment < 0 ? 0 : min_segment);
  }
}

/* Public, locked wrapper around iw_optimize_internal. */
void iw_optimize(IndexWriter *iw)
{
  mutex_lock(&iw->mutex);
  iw_optimize_internal(iw);
  mutex_unlock(&iw->mutex);
}
909
+
910
/* Flush any buffered RAM segments, release the write lock and free all
 * writer-owned resources. The analyzer and store are only destroyed when
 * the writer owns them (close_analyzer / close_store flags). */
void iw_close(IndexWriter *iw)
{
  mutex_lock(&iw->mutex);
  iw_flush_ram_segments(iw);
  ram_close(iw->ram_store);
  sis_destroy(iw->sis);

  sim_destroy(iw->similarity);
  if (iw->close_analyzer) a_destroy(iw->analyzer);

  // give back the index-wide write lock before tearing down the store
  iw->write_lock->release(iw->write_lock);
  iw->store->close_lock(iw->write_lock);

  if (iw->close_store)
    store_close(iw->store);
  mutex_destroy(&iw->mutex);
  free(iw);
}
928
+
929
/* Append the segments of `cnt` other indexes (given by their stores) to this
 * index, then merge the newly-added segments down in log(n) passes and
 * finish with a full optimize. The SegmentInfo objects are transferred to
 * this writer's infos (only the containers of the source infos are freed). */
void iw_add_indexes(IndexWriter *iw, Store **stores, int cnt)
{
  int i, j, end, start;

  mutex_lock(&iw->mutex);
  iw_optimize_internal(iw); // start with zero or 1 seg

  start = iw->sis->scnt;

  for (i = 0; i < cnt; i++) {
    Store *store = stores[i];
    SegmentInfos *sis = sis_create(); // read infos from dir
    sis_read(sis, store);

    for (j = 0; j < sis->scnt; j++) {
      SegmentInfo *si = sis->segs[j];
      sis_add_si(iw->sis, si); // ownership moves to iw->sis
    }
    sis_destroy_not_infos(sis); // free the container, keep the infos
  }

  // merge newly added segments in log(n) passes
  while (iw->sis->scnt > start + iw->merge_factor) {
    for (i = start + 1; i < iw->sis->scnt; i++) {
      end = MIN(iw->sis->scnt, i + iw->merge_factor);
      if (end - i > 1) {
        iw_merge_segments_with_max(iw, i, end);
      }
    }
  }

  // final cleanup
  iw_optimize_internal(iw);
  mutex_unlock(&iw->mutex);
}
964
+
965
+
966
/**
 * This adds an array of readers to the index leaving the added readers open.
 * The current index contents (at most one segment after the initial
 * optimize) and all given readers are merged into a single new segment which
 * is committed under the index commit lock.
 */
void iw_add_readers(IndexWriter *iw, IndexReader **irs, int cnt)
{
  IndexReader *ir = NULL;
  int i, del_cnt = 0;

  mutex_lock(&iw->mutex);
  iw_optimize_internal(iw); // start with zero or 1 seg

  char *merged_name = new_segment_name(iw->sis->counter++);

  SegmentMerger *merger = sm_create(iw->store, merged_name, iw->term_index_interval);
  merger->readers->free_elem = NULL; // don't close readers

  if (iw->sis->scnt == 1) {// add existing index, if any
    ir = sr_open_si(iw->sis->segs[0]);
    sm_add(merger, ir);
    del_cnt = 1; // the old segment must be deleted after the commit
  }

  for (i = 0; i < cnt; i++) {
    sm_add(merger, irs[i]);
  }

  int doc_count = sm_merge(merger); // merge 'em

  // pop old infos and add new ones.
  sis_clear(iw->sis);
  sis_add_si(iw->sis, si_create(merged_name, doc_count, iw->store));


  Lock *commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
  if (!commit_lock->obtain(commit_lock)) // obtain write lock
    eprintf(STATE_ERROR, "Index locked for commit: %s", COMMIT_LOCK_NAME);

  sis_write(iw->sis, iw->store); // commit changes
  iw_delete_segments(iw, &ir, del_cnt); // remove the superseded segment (if any)
  if (ir) ir_close(ir);

  commit_lock->release(commit_lock);
  iw->store->close_lock(commit_lock);

  if (iw->use_compound_file) {
    make_compound_file(iw, merged_name, merger);
  }

  iw_optimize_internal(iw);
  sm_destroy(merger);

  mutex_unlock(&iw->mutex);
}
1019
+
1020
+ /****************************************************************************
1021
+ *
1022
+ * Norm
1023
+ *
1024
+ ****************************************************************************/
1025
+
1026
/* Create a Norm wrapper around an open input stream for one field's norms.
 * Bytes are loaded lazily (norm->bytes stays NULL until first read). Takes
 * ownership of `is`. */
Norm *norm_create(InStream *is, int field_num)
{
  Norm *norm = ALLOC(Norm);
  norm->is = is;
  norm->field_num = field_num;
  norm->bytes = NULL;
  norm->is_dirty = false;
  return norm;
}

/* Destructor (hash-table compatible): closes the stream and frees the
 * cached byte array, if any. */
void norm_destroy(void *p)
{
  Norm *norm = (Norm *)p;
  is_close(norm->is);
  if (norm->bytes != NULL) free(norm->bytes);
  free(norm);
}

/* Write the cached norm bytes back to disk via a temp file + rename.
 * Separate-norm files are named "<segment>.sN" when the segment uses a
 * compound store, "<segment>.fN" otherwise. No-op if the norms were never
 * loaded into memory. */
void norm_rewrite(Norm *norm, Store *store, char *segment,
                  int doc_count, Store *cfs_store)
{
  if (norm->bytes == NULL)
    return; // These norms do not need to be rewritten

  char tmp_fname[SEGMENT_NAME_MAX_LENGTH];
  char norm_fname[SEGMENT_NAME_MAX_LENGTH];
  sprintf(tmp_fname, "%s.tmp", segment);
  OutStream *os = store->create_output(store, tmp_fname);
  os_write_bytes(os, norm->bytes, doc_count);
  os_close(os);
  if (cfs_store) {
    sprintf(norm_fname, "%s.s%d", segment, norm->field_num);
  } else {
    sprintf(norm_fname, "%s.f%d", segment, norm->field_num);
  }
  store->rename(store, tmp_fname, norm_fname); // atomic-ish swap into place
  norm->is_dirty = false;
}
1064
+
1065
+ /****************************************************************************
1066
+ *
1067
+ * SegmentReader
1068
+ *
1069
+ ****************************************************************************/
1070
+
1071
/* Convenience: fetch the SegmentReader hidden behind the IndexReader vtable. */
#define GET_SR SegmentReader *sr = (SegmentReader *)ir->data;

/* Highest document number + 1 in this segment (taken from the FieldsReader
 * length; includes deleted docs). */
int sr_max_doc(IndexReader *ir)
{
  return ((SegmentReader *)ir->data)->fr->len;
}

/* Free all per-field Norm objects (the hash owns them via norm_destroy). */
static inline void sr_close_norms(SegmentReader *sr)
{
  h_destroy(sr->norms);
}
1082
+
1083
+ static inline TermVectorsReader *sr_tvr(SegmentReader *sr)
1084
+ {
1085
+ TermVectorsReader *tvr;
1086
+ if ((tvr = thread_getspecific(sr->thread_tvr)) == NULL) {
1087
+ tvr = tvr_clone(sr->orig_tvr);
1088
+ if (tvr == NULL) printf("scuk\n");
1089
+ ary_append(sr->tvr_bucket, tvr);
1090
+ thread_setspecific(sr->thread_tvr, tvr);
1091
+ }
1092
+ return tvr;
1093
+ }
1094
+
1095
/* Close a segment reader: release field/term readers, posting streams,
 * norms, thread-local term-vector clones, the deletion bitvector and the
 * compound store (if the segment was compound). */
void sr_close(IndexReader *ir)
{
  GET_SR;
  fr_close(sr->fr);
  tir_close(sr->tir);

  if (sr->freq_in) is_close(sr->freq_in);
  if (sr->prox_in) is_close(sr->prox_in);
  fis_destroy(sr->fis);

  sr_close_norms(sr);

  if (sr->orig_tvr) {
    tvr_close(sr->orig_tvr);
    thread_key_delete(sr->thread_tvr);
    ary_destroy(sr->tvr_bucket); // closes each per-thread clone (tvr_close)
  }
  if (sr->deleted_docs) bv_destroy(sr->deleted_docs);
  if (sr->cfs_store) sr->cfs_store->close(sr->cfs_store);
  if (sr->fake_norms) free(sr->fake_norms);
  free(sr->segment);
  free(sr);
}
1118
+
1119
/* Mark a document deleted in the segment's bitvector (created lazily).
 * The change is only persisted on the next commit. */
void sr_delete_doc(IndexReader *ir, int doc_num)
{
  GET_SR;
  if (sr->deleted_docs == NULL)
    sr->deleted_docs = bv_create();

  sr->deleted_docs_dirty = true;
  sr->undelete_all = false; // a new deletion cancels a pending undelete-all
  bv_set(sr->deleted_docs, doc_num);
}

/* Unlocked deletion check; callers must hold ir->mutex. */
static inline bool sr_is_deleted_internal(IndexReader *ir, int doc_num)
{
  GET_SR;
  return (sr->deleted_docs != NULL && bv_get(sr->deleted_docs, doc_num));
}

/* Thread-safe deletion check. */
bool sr_is_deleted(IndexReader *ir, int doc_num)
{
  bool is_del;

  mutex_lock(&ir->mutex);
  is_del = sr_is_deleted_internal(ir, doc_num);
  mutex_unlock(&ir->mutex);

  return is_del;
}
1146
+
1147
/* True if the segment stores norms for `field` (i.e. the field is indexed
 * with norms). Thread-safe. */
bool sr_has_norms(IndexReader *ir, char *field)
{
  bool has_norms;
  GET_SR;
  mutex_lock(&ir->mutex);
  has_norms = h_has_key(sr->norms, field);
  mutex_unlock(&ir->mutex);

  return has_norms;
}

/* True if any document in this segment has been deleted. */
bool sr_has_deletions(IndexReader *ir)
{
  GET_SR;
  return (sr->deleted_docs != NULL);
}

/* Drop all deletions: discard the in-memory bitvector and flag the on-disk
 * .del file for removal at the next commit. */
void sr_undelete_all(IndexReader *ir)
{
  GET_SR;
  sr->undelete_all = true;
  sr->deleted_docs_dirty = false;
  if (sr->deleted_docs != NULL) bv_destroy(sr->deleted_docs);
  sr->deleted_docs = NULL;
}
1172
+
1173
/* Return a fresh term enumerator positioned at the start of the segment's
 * term dictionary (a clone of the TermInfosReader's original enum; caller
 * closes it). */
TermEnum *sr_terms(IndexReader *ir)
{
  TermEnum *te = ((SegmentReader *)ir->data)->tir->orig_te;
  return te->clone(te);
}

/* Like sr_terms but pre-positioned at the first term >= `term`. */
TermEnum *sr_terms_from(IndexReader *ir, Term *term)
{
  TermEnum *te = ((SegmentReader *)ir->data)->tir->orig_te;
  TermEnum *ret_te = te->clone(te);
  te_skip_to(ret_te, term);
  return ret_te;
}
1186
+
1187
+ Document *sr_get_doc(IndexReader *ir, int doc_num)
1188
+ {
1189
+ Document *doc;
1190
+ mutex_lock(&ir->mutex);
1191
+ if (sr_is_deleted_internal(ir, doc_num)) {
1192
+ mutex_unlock(&ir->mutex);
1193
+ eprintf(STATE_ERROR,
1194
+ "Tried to get doc <%ld> that has already been deleted", doc_num);
1195
+ }
1196
+ GET_SR;
1197
+ doc = fr_get_doc(sr->fr, doc_num);
1198
+ mutex_unlock(&ir->mutex);
1199
+ return doc;
1200
+ }
1201
+
1202
/* Copy this segment's norms for `field` into buf[offset..offset+max_doc).
 * Unindexed field -> zero-fill; cached -> memcpy; otherwise read straight
 * from a private clone of the norm stream. Caller holds ir->mutex. */
static inline void
sr_get_norms_into_internal(IndexReader *ir, char *field, uchar *buf, int offset)
{
  GET_SR;
  Norm *norm = h_get(sr->norms, field);
  if (norm == NULL) {
    memset(buf + offset*sizeof(uchar), 0, sr_max_doc(ir)*sizeof(uchar));
  } else if (norm->bytes != NULL) { // can copy from cache
    memcpy(buf + offset*sizeof(uchar), norm->bytes, sr_max_doc(ir)*sizeof(uchar));
  } else {
    // clone so concurrent readers don't fight over the stream position
    InStream *norm_in = is_clone(norm->is);
    // read from disk
    is_seek(norm_in, 0);
    is_read_bytes(norm_in, buf, offset, sr_max_doc(ir));
    is_close(norm_in);
  }
}

/* Thread-safe wrapper around sr_get_norms_into_internal. */
void sr_get_norms_into(IndexReader *ir, char *field, uchar *buf, int offset)
{
  mutex_lock(&ir->mutex);
  sr_get_norms_into_internal(ir, field, buf, offset);
  mutex_unlock(&ir->mutex);
}
1226
+
1227
/* Return the (lazily loaded and cached) norm byte array for `field`, or
 * NULL if the field is not indexed. The returned buffer is owned by the
 * Norm object. Caller holds ir->mutex. */
static inline uchar *sr_get_norms_internal(IndexReader *ir, char *field)
{
  GET_SR;
  Norm *norm = h_get(sr->norms, field);
  if (norm == NULL) // not an indexed field
    return NULL;

  if (norm->bytes == NULL) { // value not yet read
    uchar *bytes = ALLOC_N(uchar, ir->max_doc(ir));
    sr_get_norms_into_internal(ir, field, bytes, 0);
    norm->bytes = bytes; // cache it
  }
  return norm->bytes;
}

/* Thread-safe wrapper around sr_get_norms_internal. */
uchar *sr_get_norms(IndexReader *ir, char *field)
{
  uchar *norms;
  mutex_lock(&ir->mutex);
  norms = sr_get_norms_internal(ir, field);
  mutex_unlock(&ir->mutex);
  return norms;
}
1250
+
1251
/* Like sr_get_norms but never returns NULL: for unindexed fields a shared
 * zero-filled "fake norms" buffer (allocated once per reader) is returned
 * instead. */
static inline uchar *sr_get_norms_always(IndexReader *ir, char *field)
{
  uchar *bytes;
  GET_SR;
  mutex_lock(&ir->mutex);

  bytes = sr_get_norms_internal(ir, field);
  if (bytes == NULL) {
    if (sr->fake_norms) {
      bytes = sr->fake_norms; // reuse the previously built zero buffer
    } else {
      int len = ir->max_doc(ir);
      sr->fake_norms = bytes = ALLOC_N(uchar, len);
      memset(bytes, 0, len);
    }
  }
  mutex_unlock(&ir->mutex);
  return bytes;
}
1270
+
1271
/* Set the norm byte for one document of an indexed field and mark both the
 * field's Norm and the reader dirty so the norms get rewritten on commit.
 * Silently ignored for unindexed fields. */
void sr_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
{
  GET_SR;
  Norm *norm;

  norm = h_get(sr->norms, field);
  if (norm != NULL) { /* an indexed field */
    norm->is_dirty = true; // mark it dirty
    sr->norms_dirty = true;

    // ensure the bytes are loaded/cached, then patch in place
    sr_get_norms_internal(ir, field)[doc_num] = val;
  }
}

/* Number of documents in this segment containing term `t` (0 if the term
 * does not occur). */
int sr_doc_freq(IndexReader *ir, Term *t)
{
  GET_SR;
  TermInfo *ti = tir_get_ti(sr->tir, t);
  if (ti != NULL) {
    int df = ti->doc_freq;
    ti_destroy(ti); // tir_get_ti returns an owned copy
    return df;
  } else return 0;
}
1295
+
1296
/* List all files belonging to this segment that exist in the reader's store:
 * the standard extensions plus one norm file per indexed field (".sN" for
 * compound segments, ".fN" otherwise). Returns an array of freshly strdup'd
 * names; the array frees them. */
Array *sr_file_names(IndexReader *ir)
{
  GET_SR;
  Array *file_names = ary_create(0, &efree);
  FieldInfo *fi;
  int i;
  char fname[SEGMENT_NAME_MAX_LENGTH];

  for (i = 0; i < NELEMS(INDEX_EXTENSIONS); i++) {
    sprintf(fname, "%s.%s", sr->segment, INDEX_EXTENSIONS[i]);
    if (ir->store->exists(ir->store, fname))
      ary_append(file_names, estrdup(fname));
  }

  // per-field norm files
  for (i = 0; i < sr->fis->fcnt; i++) {
    fi = sr->fis->by_number[i];
    if (fi->is_indexed && !fi->omit_norms) {
      if (sr->cfs_store) {
        sprintf(fname, "%s.s%d", sr->segment, i);
      } else {
        sprintf(fname, "%s.f%d", sr->segment, i);
      }
      if (ir->store->exists(ir->store, fname))
        ary_append(file_names, estrdup(fname));
    }
  }
  return file_names;
}
1324
+
1325
+ HashSet *sr_get_field_names(IndexReader *ir, int field_type)
1326
+ {
1327
+ int i;
1328
+ GET_SR;
1329
+ HashSet *field_set = hs_str_create(NULL);
1330
+ FieldInfo *fi;
1331
+ for (i = 0; i < sr->fis->fcnt; i++) {
1332
+ fi = sr->fis->by_number[i];
1333
+ switch(field_type) {
1334
+ case IR_ALL:
1335
+ hs_add(field_set, fi->name);
1336
+ break;
1337
+ case IR_UNINDEXED:
1338
+ if (!fi->is_indexed) hs_add(field_set, fi->name);
1339
+ break;
1340
+ case IR_INDEXED:
1341
+ if (fi->is_indexed) hs_add(field_set, fi->name);
1342
+ break;
1343
+ case IR_INDEXED_NO_TERM_VECTOR:
1344
+ if (fi->is_indexed && !fi->store_tv) hs_add(field_set, fi->name);
1345
+ break;
1346
+ case IR_TERM_VECTOR:
1347
+ if (fi->store_tv && !fi->store_pos && !fi->store_offset)
1348
+ hs_add(field_set, fi->name);
1349
+ break;
1350
+ case IR_INDEXED_WITH_TERM_VECTOR:
1351
+ if (fi->is_indexed && fi->store_tv) hs_add(field_set, fi->name);
1352
+ break;
1353
+ case IR_TERM_VECTOR_WITH_POSITION:
1354
+ if (fi->store_pos && !fi->store_offset) hs_add(field_set, fi->name);
1355
+ break;
1356
+ case IR_TERM_VECTOR_WITH_OFFSET:
1357
+ if (!fi->store_pos && fi->store_offset) hs_add(field_set, fi->name);
1358
+ case IR_TERM_VECTOR_WITH_POSITION_OFFSET:
1359
+ if (fi->store_pos && fi->store_offset) hs_add(field_set, fi->name);
1360
+ break;
1361
+ default:
1362
+ eprintf(ARG_ERROR, "Invalid field_type <%ld>.", field_type);
1363
+ }
1364
+ }
1365
+ return field_set;
1366
+ }
1367
+
1368
/* Number of live (not deleted) documents in this segment. Thread-safe. */
int sr_num_docs(IndexReader *ir)
{
  GET_SR;

  mutex_lock(&ir->mutex);
  int num_docs = sr_max_doc(ir);
  if (sr->deleted_docs != NULL)
    num_docs -= sr->deleted_docs->count; // subtract set bits
  mutex_unlock(&ir->mutex);
  return num_docs;
}

/* New TermDocEnum over this segment (docs + freqs). Caller closes it. */
TermDocEnum *sr_term_docs(IndexReader *ir)
{
  return stde_create(ir);
}

/* New TermDocEnum that also exposes positions. Caller closes it. */
TermDocEnum *sr_term_positions(IndexReader *ir)
{
  return stpe_create(ir);
}
1389
+
1390
/* Open the norm streams for every indexed field that has norms. Separate
 * norm files ("<segment>.sN") are looked for in the reader's primary store
 * first; if absent, the regular norm file ("<segment>.fN") is opened from
 * `cfs_store` (the compound store, or the plain store for non-compound
 * segments — presumably always where the .fN lives; verify against callers). */
void sr_open_norms(IndexReader *ir, Store *cfs_store)
{
  GET_SR;
  int i;
  FieldInfo *fi;
  Store *tmp_store;
  char fname[SEGMENT_NAME_MAX_LENGTH];
  for (i = 0; i < sr->fis->fcnt; i++) {
    tmp_store = ir->store;
    fi = sr->fis->by_number[i];
    if (fi->is_indexed && !fi->omit_norms) {
      sprintf(fname, "%s.s%d", sr->segment, fi->number);
      if (! tmp_store->exists(tmp_store, fname)) {
        // no separate norms; fall back to the in-segment norm file
        sprintf(fname, "%s.f%d", sr->segment, fi->number);
        tmp_store = cfs_store;
      }
      h_set(sr->norms, fi->name,
            norm_create(tmp_store->open_input(tmp_store, fname), fi->number));
    }
  }
  sr->norms_dirty = false;
}
1412
+
1413
/* Term vector for one field of one document, or NULL when the field is
 * unknown, doesn't store term vectors, or no vectors reader is available. */
TermVector *sr_get_term_vector(IndexReader *ir, int doc_num, char *field)
{
  GET_SR;
  FieldInfo *fi = (FieldInfo *)ht_get(sr->fis->by_name, field);
  TermVectorsReader *tvr;

  if (fi == NULL || !fi->store_tv || !sr->orig_tvr || !(tvr = sr_tvr(sr)))
    return NULL;

  return tvr_get_field_tv(tvr, doc_num, field);
}

/* All term vectors stored for a document, or NULL if the segment stores
 * none (or the per-thread reader can't be obtained). */
Array *sr_get_term_vectors(IndexReader *ir, int doc_num)
{
  GET_SR;
  TermVectorsReader *tvr;
  if (sr->orig_tvr == NULL || (tvr = sr_tvr(sr)) == NULL)
    return NULL;

  return tvr_get_tv(tvr, doc_num);
}
1434
+
1435
/* Persist this segment's pending changes: rewrite the .del bitvector (via
 * temp file + rename), remove it entirely after an undelete-all, and
 * rewrite any dirty norm files. Clears all dirty flags afterwards. */
void sr_commit(IndexReader *ir)
{
  GET_SR;
  char tmp_fname[SEGMENT_NAME_MAX_LENGTH];
  char del_fname[SEGMENT_NAME_MAX_LENGTH];
  sprintf(del_fname, "%s.del", sr->segment);

  if (sr->deleted_docs_dirty) { // re-write deleted
    sprintf(tmp_fname, "%s.tmp", sr->segment);
    bv_write(sr->deleted_docs, ir->store, tmp_fname);
    ir->store->rename(ir->store, tmp_fname, del_fname);
  }
  if (sr->undelete_all && ir->store->exists(ir->store, del_fname))
    ir->store->remove(ir->store, del_fname);
  if (sr->norms_dirty) {// re-write norms
    int i;
    FieldInfo *fi;
    for (i = 0; i < sr->fis->fcnt; i++) {
      fi = sr->fis->by_number[i];
      if (fi->is_indexed) {
        // norm_rewrite is a no-op for norms that were never loaded
        norm_rewrite((Norm *)h_get(sr->norms, fi->name), ir->store,
                     sr->segment, sr_max_doc(ir), sr->cfs_store);
      }
    }
  }
  sr->deleted_docs_dirty = false;
  sr->norms_dirty = false;
  sr->undelete_all = false;
}
1464
+
1465
/* Wire a freshly created IndexReader up as a SegmentReader over segment
 * `si`: install the vtable, then open (in order) the compound store if the
 * segment has a .cfs file, field infos, stored fields, the term dictionary,
 * the deletion bitvector, the .frq/.prx posting streams, norms and — when
 * any field stores them — the term-vectors reader with its per-thread
 * clone machinery. Returns `ir`. */
IndexReader *sr_open_internal(IndexReader *ir, SegmentInfo *si)
{
  Store *store = si->store;
  SegmentReader *sr = ALLOC(SegmentReader);
  ir->get_term_vector = &sr_get_term_vector;
  ir->get_term_vectors = &sr_get_term_vectors;
  ir->num_docs = &sr_num_docs;
  ir->max_doc = &sr_max_doc;
  ir->get_doc = &sr_get_doc;
  ir->get_norms_into = &sr_get_norms_into;
  ir->get_norms = &sr_get_norms;
  ir->get_norms_always = &sr_get_norms_always;
  ir->do_set_norm = &sr_set_norm;
  ir->terms = &sr_terms;
  ir->terms_from = &sr_terms_from;
  ir->doc_freq = &sr_doc_freq;
  ir->term_docs = &sr_term_docs;
  ir->term_positions = &sr_term_positions;
  ir->do_delete_doc = &sr_delete_doc;
  ir->is_deleted = &sr_is_deleted;
  ir->has_norms = &sr_has_norms;
  ir->has_deletions = &sr_has_deletions;
  ir->do_undelete_all = &sr_undelete_all;
  ir->get_field_names = &sr_get_field_names;
  ir->do_commit = &sr_commit;
  ir->do_close = &sr_close;
  ir->data = sr;
  sr->segment = estrdup(si->name);
  char fname[SEGMENT_NAME_MAX_LENGTH];
  sr->cfs_store = NULL;
  sr->fake_norms = NULL;
  sprintf(fname, "%s.cfs", sr->segment);
  if (store->exists(store, fname)) {
    // compound segment: all further files come from inside the .cfs
    sr->cfs_store = open_cmpd_store(store, fname);
    store = sr->cfs_store;
  }

  sprintf(fname, "%s.fnm", sr->segment);
  sr->fis = fis_open(store, fname);
  sr->fr = fr_open(store, sr->segment, sr->fis);

  sr->tir = tir_open(store, sr->segment, sr->fis);
  sr->deleted_docs = NULL;
  sr->deleted_docs_dirty = false;
  sr->undelete_all = false;
  if (si_has_deletions(si)) {
    // .del lives in the segment's own store, never inside the compound file
    sprintf(fname, "%s.del", sr->segment);
    sr->deleted_docs = bv_read(si->store, fname);
  }

  sprintf(fname, "%s.frq", sr->segment);
  sr->freq_in = store->open_input(store, fname);
  sprintf(fname, "%s.prx", sr->segment);
  sr->prox_in = store->open_input(store, fname);
  sr->norms = h_new_str(NULL, &norm_destroy);
  sr_open_norms(ir, store);

  if (fis_has_vectors(sr->fis)) {
    sr->orig_tvr = tvr_open(store, sr->segment, sr->fis);
    thread_key_create(&sr->thread_tvr, NULL);
    sr->tvr_bucket = ary_create(1, (destroy_func_t)&tvr_close);
  } else {
    sr->orig_tvr = NULL;
  }
  return ir;
}
1531
+
1532
/* Open a standalone SegmentReader for one SegmentInfo (no SegmentInfos
 * ownership, store not closed on close). */
IndexReader *sr_open_si(SegmentInfo *si)
{
  IndexReader *ir = ir_create(si->store, NULL, false, false);
  return sr_open_internal(ir, si);
}

/* Open a SegmentReader for segment `si_num` of `sis`, optionally making it
 * the owner of the infos and/or responsible for closing the store. */
IndexReader *sr_open(SegmentInfos *sis, int si_num, int is_owner, int close_store)
{
  SegmentInfo *si = sis->segs[si_num];
  IndexReader *ir = ir_create(si->store, sis, is_owner, close_store);
  return sr_open_internal(ir, si);
}
1544
+ /****************************************************************************
1545
+ *
1546
+ * MultiReader
1547
+ *
1548
+ ****************************************************************************/
1549
+
1550
/* Fetch the MultiReader hidden behind the IndexReader vtable. */
#define GET_MR MultiReader *mr = (MultiReader *)ir->data
/* Resolve a global doc number to its sub-reader: defines `mr`, the
 * sub-reader index `i` and the sub-reader itself as `reader`. */
#define GET_READER(doc_num) MultiReader *mr = (MultiReader *)ir->data;\
  int i = mr_reader_index(mr, doc_num);\
  IndexReader *reader = mr->sub_readers[i];



/* Binary-search mr->starts for the sub-reader containing global doc_num.
 * On an exact boundary hit, scan forward past empty segments (which share
 * the same start) so the doc maps into a non-empty reader. */
int mr_reader_index(MultiReader *mr, int doc_num)
{
  int lo = 0; // search @starts array
  int hi = mr->rcnt - 1; // for first element less
  int mid;
  int mid_value;

  while (hi >= lo) {
    mid = (lo + hi) >> 1;
    mid_value = mr->starts[mid];
    if (doc_num < mid_value) {
      hi = mid - 1;
    } else if (doc_num > mid_value) {
      lo = mid + 1;
    } else { // found a match
      while ((mid+1 < mr->rcnt) && (mr->starts[mid+1] == mid_value))
        mid += 1; // scan to last match in case we have empty segments
      return mid;
    }
  }
  return hi; // last reader whose start is below doc_num
}
1579
+
1580
/* Delegate a term-vector lookup to the sub-reader owning doc_num,
 * translating to the sub-reader's local doc numbering. */
TermVector *mr_get_term_vector(IndexReader *ir, int doc_num, char *field)
{
  GET_READER(doc_num);
  return reader->get_term_vector(reader, doc_num - mr->starts[i], field);
}

/* Delegate: all term vectors for a document. */
Array *mr_get_term_vectors(IndexReader *ir, int doc_num)
{
  GET_READER(doc_num);
  return reader->get_term_vectors(reader, doc_num - mr->starts[i]);
}

/* Total live documents across all sub-readers; the sum is cached and
 * invalidated (set to -1) by deletes/undeletes. Thread-safe. */
int mr_num_docs(IndexReader *ir)
{
  int i, num_docs;
  GET_MR;
  mutex_lock(&ir->mutex);
  if (mr->num_docs_cache == -1) {
    IndexReader *reader;
    mr->num_docs_cache = 0;
    for (i = 0; i < mr->rcnt; i++) {
      reader = mr->sub_readers[i];
      mr->num_docs_cache += reader->num_docs(reader);
    }
  }
  num_docs = mr->num_docs_cache;
  mutex_unlock(&ir->mutex);

  return num_docs;
}

/* Total doc-number space (precomputed in mr_open). */
int mr_max_doc(IndexReader *ir)
{
  GET_MR;
  return mr->max_doc;
}

/* Delegate a stored-document fetch to the owning sub-reader. */
Document *mr_get_doc(IndexReader *ir, int doc_num)
{
  GET_READER(doc_num);
  return reader->get_doc(reader, doc_num - mr->starts[i]);
}
1622
+
1623
/* Copy norms for `field` into buf[offset..): from the multi-reader cache
 * when present, otherwise by asking every sub-reader to fill its slice. */
void mr_get_norms_into(IndexReader *ir, char *field, uchar *buf, int offset)
{
  int i;
  GET_MR;

  mutex_lock(&ir->mutex);
  uchar *bytes = h_get(mr->norms_cache, field);
  if (bytes != NULL) {
    memcpy(buf + offset, bytes, mr->max_doc);
  } else {
    IndexReader *reader;
    for (i = 0; i < mr->rcnt; i++) {
      reader = mr->sub_readers[i];
      reader->get_norms_into(reader, field, buf, offset + mr->starts[i]);
    }
  }
  mutex_unlock(&ir->mutex);
}

/* Return the concatenated norms for `field`, building and caching the
 * full-index array on first use (cache owns the buffer). */
uchar *mr_get_norms(IndexReader *ir, char *field)
{
  int i;
  GET_MR;
  uchar *bytes;
  IndexReader *reader;

  mutex_lock(&ir->mutex);
  bytes = h_get(mr->norms_cache, field);
  if (bytes == NULL) {
    bytes = ALLOC_N(uchar, mr->max_doc);

    for (i = 0; i < mr->rcnt; i++) {
      reader = mr->sub_readers[i];
      reader->get_norms_into(reader, field, bytes, mr->starts[i]);
    }
    h_set(mr->norms_cache, field, bytes); // update cache
  }
  mutex_unlock(&ir->mutex);

  return bytes;
}
1664
+
1665
/* Set a norm byte on the owning sub-reader and drop the stale cached
 * full-index norms for that field. */
void mr_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
{
  GET_READER(doc_num);
  h_del(mr->norms_cache, field); // clear cache
  ir_set_norm(reader, doc_num - mr->starts[i], field, val);
}

/* Multi-segment term enumerator over all sub-readers. */
TermEnum *mr_terms(IndexReader *ir)
{
  GET_MR;
  return mte_create(mr->sub_readers, mr->starts, mr->rcnt, NULL);
}

/* Multi-segment term enumerator positioned at the first term >= `term`. */
TermEnum *mr_terms_from(IndexReader *ir, Term *term)
{
  GET_MR;
  return mte_create(mr->sub_readers, mr->starts, mr->rcnt, term);
}
1683
+
1684
+ int mr_doc_freq(IndexReader *ir, Term *t)
1685
+ {
1686
+ int total = 0, i; // sum freqs in segments
1687
+ GET_MR;
1688
+
1689
+ IndexReader *reader;
1690
+ for (i = 0; i < mr->rcnt; i++) {
1691
+ reader = mr->sub_readers[i];
1692
+ total += reader->doc_freq(reader, t);
1693
+ }
1694
+ return total;
1695
+ }
1696
+
1697
/* Multi-segment TermDocEnum (docs + freqs) over all sub-readers. */
TermDocEnum *mr_term_docs(IndexReader *ir)
{
  GET_MR;
  return mtde_create(mr->sub_readers, mr->starts, mr->rcnt);
}

/* Multi-segment TermDocEnum that also exposes positions. */
TermDocEnum *mr_term_positions(IndexReader *ir)
{
  GET_MR;
  return mtpe_create(mr->sub_readers, mr->starts, mr->rcnt);
}
1708
+
1709
/* Delete a document via its owning sub-reader and invalidate the live-doc
 * count cache. */
void mr_delete_doc(IndexReader *ir, int doc_num)
{
  GET_READER(doc_num);
  mr->num_docs_cache = -1; // invalidate cache

  reader->do_delete_doc(reader, doc_num - mr->starts[i]); // dispatch to segment reader
  mr->has_deletions = true;
}

/* Delegate a deletion check to the owning sub-reader. */
bool mr_is_deleted(IndexReader *ir, int doc_num)
{
  GET_READER(doc_num);
  return reader->is_deleted(reader, doc_num - mr->starts[i]);
}

/* True if any sub-reader has norms for `field`. */
bool mr_has_norms(IndexReader *ir, char *field)
{
  bool has_norms = false;
  int i;
  GET_MR;

  IndexReader *reader;
  for (i = 0; i < mr->rcnt; i++) {
    reader = mr->sub_readers[i];
    if (reader->has_norms(reader, field)) {
      has_norms = true;
      break;
    }
  }

  return has_norms;
}
1741
+
1742
/* True if any sub-reader had deletions when this reader was opened, or a
 * deletion has been made through this reader since. */
bool mr_has_deletions(IndexReader *ir)
{
  GET_MR;
  return mr->has_deletions;
}

/* Undelete all documents in every sub-reader and reset the cached count. */
void mr_undelete_all(IndexReader *ir)
{
  int i;
  GET_MR;
  mr->num_docs_cache = -1; // invalidate cache
  IndexReader *reader;
  for (i = 0; i < mr->rcnt; i++) {
    reader = mr->sub_readers[i];
    reader->do_undelete_all(reader);
  }
  mr->has_deletions = false;
}
1760
+
1761
/* Union of every sub-reader's field names for the given filter. */
HashSet *mr_get_field_names(IndexReader *ir, int field_type)
{
  int i;
  GET_MR;
  HashSet *field_set = hs_str_create(NULL);
  IndexReader *reader;
  for (i = 0; i < mr->rcnt; i++) {
    reader = mr->sub_readers[i];
    // hs_merge consumes the sub-reader's set
    hs_merge(field_set, reader->get_field_names(reader, field_type));
  }
  return field_set;
}

/* Commit pending changes in every sub-reader. */
void mr_commit(IndexReader *ir)
{
  GET_MR;
  int i;
  IndexReader *reader;
  for (i = 0; i < mr->rcnt; i++) {
    reader = mr->sub_readers[i];
    reader->do_commit(reader);
  }
}
1784
+
1785
/* Close every sub-reader, then free the MultiReader's own arrays and the
 * norms cache (the cache frees its byte buffers). */
void mr_close(IndexReader *ir)
{
  GET_MR;
  int i;
  IndexReader *reader;
  for (i = 0; i < mr->rcnt; i++) {
    reader = mr->sub_readers[i];
    ir_close(reader);
  }
  free(mr->sub_readers);
  h_destroy(mr->norms_cache);
  free(mr->starts);
  free(mr);
}
1799
+
1800
/* Build a MultiReader over `rcnt` already-open sub-readers: compute the
 * starts[] doc-number offsets and total max_doc, note whether any
 * sub-reader has deletions, then create an IndexReader and install the
 * mr_* vtable. Takes ownership of the sub_readers array. */
IndexReader *mr_open(Store *store,
                     SegmentInfos *sis,
                     IndexReader **sub_readers,
                     int rcnt,
                     int close_store)
{
  int i;
  MultiReader *mr = ALLOC(MultiReader);
  IndexReader *sub_reader;
  mr->sub_readers = sub_readers;
  mr->rcnt = rcnt;

  mr->max_doc = 0;
  mr->num_docs_cache = -1; // computed lazily in mr_num_docs
  mr->has_deletions = false;

  mr->starts = ALLOC_N(int, (rcnt+1)); // extra slot holds total max_doc
  for (i = 0; i < rcnt; i++) {
    sub_reader = sub_readers[i];
    mr->starts[i] = mr->max_doc;
    mr->max_doc += sub_reader->max_doc(sub_reader); // compute max_docs

    if (sub_reader->has_deletions(sub_reader))
      mr->has_deletions = true;
  }
  mr->starts[rcnt] = mr->max_doc;
  mr->norms_cache = h_new_str(NULL, &efree);

  IndexReader *ir = ir_create(store, sis, true, close_store);
  ir->get_term_vector = &mr_get_term_vector;
  ir->get_term_vectors = &mr_get_term_vectors;
  ir->num_docs = &mr_num_docs;
  ir->max_doc = &mr_max_doc;
  ir->get_doc = &mr_get_doc;
  ir->get_norms_into = &mr_get_norms_into;
  ir->get_norms = &mr_get_norms;
  ir->get_norms_always = &mr_get_norms; // multi-reader norms are never NULL-faked
  ir->do_set_norm = &mr_set_norm;
  ir->terms = &mr_terms;
  ir->terms_from = &mr_terms_from;
  ir->doc_freq = &mr_doc_freq;
  ir->term_docs = &mr_term_docs;
  ir->term_positions = &mr_term_positions;
  ir->do_delete_doc = &mr_delete_doc;
  ir->is_deleted = &mr_is_deleted;
  ir->has_norms = &mr_has_norms;
  ir->has_deletions = &mr_has_deletions;
  ir->do_undelete_all = &mr_undelete_all;
  ir->get_field_names = &mr_get_field_names;
  ir->do_commit = &mr_commit;
  ir->do_close = &mr_close;
  ir->data = mr;

  return ir;
}
1855
+
1856
+ /****************************************************************************
1857
+ *
1858
+ * SegmentMergeInfo
1859
+ *
1860
+ ****************************************************************************/
1861
+
1862
+ bool smi_lt(void *p1, void *p2)
1863
+ {
1864
+ SegmentMergeInfo *smi1 = (SegmentMergeInfo *)p1;
1865
+ SegmentMergeInfo *smi2 = (SegmentMergeInfo *)p2;
1866
+
1867
+ int cmpres = tb_cmp(smi1->tb, smi2->tb);
1868
+ if (cmpres == 0) {
1869
+ return smi1->base < smi2->base;
1870
+ } else {
1871
+ return cmpres < 0;
1872
+ }
1873
+ }
1874
+
1875
/* Build (once, lazily) the old->new doc number map for a reader with
 * deletions: deleted docs map to -1, live docs are renumbered densely.
 * Returns NULL when the reader has no deletions. */
int *smi_load_doc_map(SegmentMergeInfo *smi)
{
  IndexReader *ir = smi->ir;
  if (ir->has_deletions(ir) && (smi->doc_map == NULL)) {
    int max_doc = ir->max_doc(ir);
    smi->doc_map = ALLOC_N(int, max_doc);
    int j = 0, i;
    for (i = 0; i < max_doc; i++) {
      if (ir->is_deleted(ir, i)) {
        smi->doc_map[i] = -1; // dropped by the merge
      } else {
        smi->doc_map[i] = j++;
      }
    }
  }
  return smi->doc_map;
}
1892
+
1893
/* Per-segment merge state: doc-number base in the merged index, the
 * segment's term enum (borrowing its current term buffer) and a positions
 * enum. The doc map is built lazily by smi_load_doc_map. */
SegmentMergeInfo *smi_create(int base, TermEnum *te, IndexReader *ir)
{
  SegmentMergeInfo *smi = ALLOC(SegmentMergeInfo);
  smi->base = base;
  smi->ir = ir;
  smi->te = te;
  smi->tb = te->tb_curr; // points into the enum; updated by smi_next
  smi->postings = ir->term_positions(ir);
  smi->doc_map = NULL;
  return smi;
}

/* Destructor (queue-compatible): closes the owned enums, frees the map.
 * The IndexReader itself is not closed here. */
void smi_destroy(void *p)
{
  SegmentMergeInfo *smi = (SegmentMergeInfo *)p;
  smi->postings->close(smi->postings);
  smi->te->close(smi->te);
  if (smi->doc_map != NULL)
    free(smi->doc_map);
  free(smi);
}

/* Advance to the next term; returns NULL at end of the enum. */
TermBuffer *smi_next(SegmentMergeInfo *smi)
{
  return (smi->tb = smi->te->next(smi->te));
}
1919
+
1920
+ /****************************************************************************
1921
+ *
1922
+ * SegmentMerger
1923
+ *
1924
+ ****************************************************************************/
1925
+
1926
+ SegmentMerger *sm_create(Store *store, char *name, int term_index_interval)
1927
+ {
1928
+ SegmentMerger *sm = ALLOC(SegmentMerger);
1929
+ sm->store = store;
1930
+ sm->name = estrdup(name);
1931
+ sm->readers = ary_create(config.merge_factor, &ir_destroy);
1932
+ sm->fis = NULL;
1933
+ sm->freq_out = NULL;
1934
+ sm->prox_out = NULL;
1935
+ sm->tiw = NULL;
1936
+ sm->queue = NULL;
1937
+ sm->ti = ti_create(0, 0, 0, 0);
1938
+ sm->term_index_interval = term_index_interval;
1939
+ sm->skip_buffer = ram_create_buffer();
1940
+ sm->skip_interval = -1;
1941
+ return sm;
1942
+ }
1943
+
1944
+ void sm_close(SegmentMerger *sm)
1945
+ {
1946
+ int i;
1947
+ if (sm->freq_out != NULL) os_close(sm->freq_out);
1948
+ if (sm->prox_out != NULL) os_close(sm->prox_out);
1949
+ if (sm->tiw != NULL) {
1950
+ for (i = 0; i < sm->terms_buf_size; i++)
1951
+ free(sm->terms_buf[i].text);
1952
+ free(sm->terms_buf);
1953
+ tiw_close(sm->tiw);
1954
+ }
1955
+ if (sm->queue != NULL) pq_destroy(sm->queue);
1956
+ sm->freq_out = NULL;
1957
+ sm->prox_out = NULL;
1958
+ sm->tiw = NULL;
1959
+ sm->queue = NULL;
1960
+ }
1961
+
1962
+ void sm_destroy(void *p)
1963
+ {
1964
+ SegmentMerger *sm = (SegmentMerger *)p;
1965
+ if (sm->fis != NULL) fis_destroy(sm->fis);
1966
+ ary_destroy(sm->readers);
1967
+ sm_close(sm);
1968
+ free(sm->name);
1969
+ ti_destroy(sm->ti);
1970
+ ram_destroy_buffer(sm->skip_buffer);
1971
+ free(sm);
1972
+ }
1973
+
1974
+ void sm_add(SegmentMerger *sm, IndexReader *ir)
1975
+ {
1976
+ ary_append(sm->readers, ir);
1977
+ }
1978
+
1979
+ static inline void sm_add_indexed(IndexReader *ir,
1980
+ FieldInfos *fis,
1981
+ HashSet *fields,
1982
+ bool store_tv,
1983
+ bool store_pos,
1984
+ bool store_offset)
1985
+ {
1986
+ int i;
1987
+ char *field;
1988
+ for (i = 0; i < fields->size; i++) {
1989
+ field = (char *)fields->elems[i];
1990
+ fis_add(fis, field, true, store_tv, store_pos, store_offset,
1991
+ !ir->has_norms(ir, field));
1992
+ }
1993
+ hs_destroy(fields);
1994
+ }
1995
+
1996
+ int sm_merge_fields(SegmentMerger *sm)
1997
+ {
1998
+ int i, j, maxdoc;
1999
+ FieldInfos *fis = sm->fis = fis_create();
2000
+ int doc_count = 0;
2001
+ Document *doc;
2002
+ for (i = 0; i < sm->readers->size; i++) {
2003
+ IndexReader *ir = sm->readers->elems[i];
2004
+
2005
+ sm_add_indexed(ir, fis,
2006
+ ir->get_field_names(ir, IR_TERM_VECTOR_WITH_POSITION_OFFSET),
2007
+ true, true, true);
2008
+ sm_add_indexed(ir, fis,
2009
+ ir->get_field_names(ir, IR_TERM_VECTOR_WITH_POSITION),
2010
+ true, true, false);
2011
+ sm_add_indexed(ir, fis,
2012
+ ir->get_field_names(ir, IR_TERM_VECTOR_WITH_OFFSET),
2013
+ true, false, true);
2014
+ sm_add_indexed(ir, fis, ir->get_field_names(ir, IR_TERM_VECTOR),
2015
+ true, false, false);
2016
+ sm_add_indexed(ir, fis, ir->get_field_names(ir, IR_INDEXED),
2017
+ false, false, false);
2018
+ fis_add_fields(fis, ir->get_field_names(ir, IR_UNINDEXED),
2019
+ false, false, false, false, false);
2020
+ }
2021
+ fis_write(fis, sm->store, sm->name, ".fnm");
2022
+
2023
+ // merge field values
2024
+ FieldsWriter *fw = fw_open(sm->store, sm->name, fis);
2025
+
2026
+ for (i = 0; i < sm->readers->size; i++) {
2027
+ IndexReader *ir = sm->readers->elems[i];
2028
+ maxdoc = ir->max_doc(ir);
2029
+ for (j = 0; j < maxdoc; j++) {
2030
+ if (!ir->is_deleted(ir, j)) { // skip deleted docs
2031
+ doc = ir->get_doc(ir, j);
2032
+ fw_add_doc(fw, doc);
2033
+ doc_destroy(doc);
2034
+ doc_count++;
2035
+ }
2036
+ }
2037
+ }
2038
+ fw_close(fw);
2039
+ return doc_count;
2040
+ }
2041
+
2042
+ void sm_reset_skip(SegmentMerger *sm)
2043
+ {
2044
+ ramo_reset(sm->skip_buffer);
2045
+ sm->last_skip_doc = 0;
2046
+ sm->last_skip_freq_pointer = os_pos(sm->freq_out);
2047
+ sm->last_skip_prox_pointer = os_pos(sm->prox_out);
2048
+ }
2049
+
2050
+ inline void sm_buffer_skip(SegmentMerger *sm, int doc)
2051
+ {
2052
+ int freq_pointer = os_pos(sm->freq_out);
2053
+ int prox_pointer = os_pos(sm->prox_out);
2054
+
2055
+ os_write_vint(sm->skip_buffer, doc - sm->last_skip_doc);
2056
+ os_write_vint(sm->skip_buffer, freq_pointer - sm->last_skip_freq_pointer);
2057
+ os_write_vint(sm->skip_buffer, prox_pointer - sm->last_skip_prox_pointer);
2058
+
2059
+ sm->last_skip_doc = doc;
2060
+ sm->last_skip_freq_pointer = freq_pointer;
2061
+ sm->last_skip_prox_pointer = prox_pointer;
2062
+ }
2063
+
2064
+ int sm_append_postings(SegmentMerger *sm, SegmentMergeInfo **smis, int cnt)
2065
+ {
2066
+ int i, j;
2067
+ int last_doc = 0, base, doc, doc_code, freq, last_position, position;
2068
+ int *doc_map = NULL;
2069
+ int df = 0; // number of docs w/ term
2070
+ TermDocEnum *postings;
2071
+ SegmentMergeInfo *smi;
2072
+ sm_reset_skip(sm);
2073
+ for (i = 0; i < cnt; i++) {
2074
+ smi = smis[i];
2075
+ postings = smi->postings;
2076
+ base = smi->base;
2077
+ doc_map = smi_load_doc_map(smi);
2078
+
2079
+ stde_seek_ti(postings, smi->te->ti_curr);
2080
+ while (postings->next(postings)) {
2081
+ doc = postings->doc_num(postings);
2082
+ if (doc_map != NULL)
2083
+ doc = doc_map[doc]; // work around deletions
2084
+ doc += base; // convert to merged space
2085
+
2086
+ if (doc < last_doc)
2087
+ eprintf(STATE_ERROR,
2088
+ "docs out of order curent doc = %ld and previous doc = %ld",
2089
+ doc, last_doc);
2090
+
2091
+ df++;
2092
+
2093
+ if ((df % sm->skip_interval) == 0)
2094
+ sm_buffer_skip(sm, last_doc);
2095
+
2096
+ doc_code = (doc - last_doc) << 1; // use low bit to flag freq=1
2097
+ last_doc = doc;
2098
+
2099
+ freq = postings->freq(postings);
2100
+ if (freq == 1) {
2101
+ os_write_vint(sm->freq_out, doc_code | 1); // write doc & freq=1
2102
+ } else {
2103
+ os_write_vint(sm->freq_out, doc_code); // write doc
2104
+ os_write_vint(sm->freq_out, freq); // write freqency in doc
2105
+ }
2106
+
2107
+
2108
+ last_position = 0; // write position deltas
2109
+ for (j = 0; j < freq; j++) {
2110
+ position = postings->next_position(postings);
2111
+ os_write_vint(sm->prox_out, position - last_position);
2112
+ last_position = position;
2113
+ }
2114
+ }
2115
+ }
2116
+ return df;
2117
+ }
2118
+
2119
+ int sm_write_skip(SegmentMerger *sm)
2120
+ {
2121
+ int skip_pointer = os_pos(sm->freq_out);
2122
+ ramo_write_to(sm->skip_buffer, sm->freq_out);
2123
+ return skip_pointer;
2124
+ }
2125
+
2126
+ Term *sm_tb_to_term(SegmentMerger *sm, TermBuffer *tb)
2127
+ {
2128
+ int index = sm->terms_buf_pointer % sm->terms_buf_size;
2129
+ sm->terms_buf_pointer++;
2130
+ sm->terms_buf[index].field = tb->field;
2131
+ strcpy(sm->terms_buf[index].text, tb->text);
2132
+ return &(sm->terms_buf[index]);
2133
+ }
2134
+
2135
+ void sm_merge_term_info(SegmentMerger *sm, SegmentMergeInfo **smis, int cnt)
2136
+ {
2137
+ int freq_pointer = os_pos(sm->freq_out);
2138
+ int prox_pointer = os_pos(sm->prox_out);
2139
+
2140
+ int df = sm_append_postings(sm, smis, cnt); // append posting data
2141
+
2142
+ int skip_pointer = sm_write_skip(sm);
2143
+
2144
+ if (df > 0) {
2145
+ // add an entry to the dictionary with pointers to prox and freq files
2146
+ ti_set(sm->ti, df, freq_pointer, prox_pointer, (skip_pointer - freq_pointer));
2147
+ tiw_add(sm->tiw, sm_tb_to_term(sm, smis[0]->tb), sm->ti);
2148
+ }
2149
+ }
2150
+
2151
+ void sm_merge_term_infos(SegmentMerger *sm)
2152
+ {
2153
+ int base = 0;
2154
+ int i, match_size;
2155
+ IndexReader *ir;
2156
+ TermEnum *te;
2157
+ SegmentMergeInfo *smi, *top;
2158
+ TermBuffer *tb;
2159
+
2160
+ for (i = 0; i < sm->readers->size; i++) {
2161
+ ir = sm->readers->elems[i];
2162
+ te = ir->terms(ir);
2163
+ smi = smi_create(base, te, ir);
2164
+ base += ir->num_docs(ir);
2165
+ if (smi_next(smi) != NULL)
2166
+ pq_push(sm->queue, smi); // initialize @queue
2167
+ else
2168
+ smi_destroy(smi);
2169
+ }
2170
+
2171
+ SegmentMergeInfo **match = ALLOC_N(SegmentMergeInfo *, sm->readers->size);
2172
+
2173
+ while (sm->queue->count > 0) {
2174
+ // for (i = 1; i <= sm->queue->count; i++) {
2175
+ // printf("<{%s:%s}>", ((SegmentMergeInfo *)sm->queue->heap[i])->tb->field,
2176
+ // ((SegmentMergeInfo *)sm->queue->heap[i])->tb->text);
2177
+ // }printf("\n\n");
2178
+ match_size = 0; // pop matching terms
2179
+ match[match_size] = pq_pop(sm->queue);
2180
+ match_size++;
2181
+ tb = match[0]->tb;
2182
+ top = pq_top(sm->queue);
2183
+ while ((top != NULL) && (tb_cmp(tb, top->tb) == 0)) {
2184
+ match[match_size] = pq_pop(sm->queue);
2185
+ match_size++;
2186
+ top = pq_top(sm->queue);
2187
+ }
2188
+
2189
+ //printf(">%s:%s<\n", match[0]->tb->field, match[0]->tb->text);
2190
+ sm_merge_term_info(sm, match, match_size); // add new TermInfo
2191
+
2192
+ while (match_size > 0) {
2193
+ match_size--;
2194
+ smi = match[match_size];
2195
+ if (smi_next(smi) != NULL)
2196
+ pq_push(sm->queue, smi); // restore queue
2197
+ else
2198
+ smi_destroy(smi); // done with a segment
2199
+ }
2200
+ }
2201
+ free(match);
2202
+ }
2203
+
2204
+ void sm_merge_terms(SegmentMerger *sm)
2205
+ {
2206
+ int i;
2207
+ char fname[SEGMENT_NAME_MAX_LENGTH];
2208
+ sprintf(fname, "%s.frq", sm->name);
2209
+ sm->freq_out = sm->store->create_output(sm->store, fname);
2210
+ sprintf(fname, "%s.prx", sm->name);
2211
+ sm->prox_out = sm->store->create_output(sm->store, fname);
2212
+ sm->tiw = tiw_open(sm->store, sm->name, sm->fis, sm->term_index_interval);
2213
+ // terms_buf_pointer holds a buffer of terms since the TermInfosWriter needs
2214
+ // to keep the last index_interval terms so that it can compare the last term
2215
+ // put in the index with the next one. So the size of the buffer must by
2216
+ // index_interval + 2.
2217
+ sm->terms_buf_pointer = 0;
2218
+ sm->terms_buf_size = sm->tiw->index_interval + 2;
2219
+ sm->terms_buf = ALLOC_N(Term, sm->terms_buf_size);
2220
+ for (i = 0; i < sm->terms_buf_size; i++) {
2221
+ sm->terms_buf[i].field = NULL;
2222
+ sm->terms_buf[i].text = ALLOC_N(char, MAX_WORD_SIZE);
2223
+ }
2224
+ sm->skip_interval = sm->tiw->skip_interval;
2225
+ sm->queue = pq_create(sm->readers->size, &smi_lt);
2226
+
2227
+ sm_merge_term_infos(sm);
2228
+
2229
+ sm_close(sm);
2230
+ }
2231
+
2232
+ void sm_merge_norms(SegmentMerger *sm)
2233
+ {
2234
+ int i, j, k, max_doc;
2235
+ uchar *norm_buf;
2236
+ FieldInfo *fi;
2237
+ OutStream *os;
2238
+ char fname[SEGMENT_NAME_MAX_LENGTH];
2239
+ IndexReader *ir;
2240
+ for (i = 0; i < sm->fis->fcnt; i++) {
2241
+ fi = sm->fis->by_number[i];
2242
+ if (fi->is_indexed && !fi->omit_norms) {
2243
+ sprintf(fname, "%s.f%d", sm->name, i);
2244
+ os = sm->store->create_output(sm->store, fname);
2245
+ for (j = 0; j < sm->readers->size; j++) {
2246
+ ir = sm->readers->elems[j];
2247
+ max_doc = ir->max_doc(ir);
2248
+ norm_buf = ALLOC_N(uchar, max_doc);
2249
+ memset(norm_buf, 0, sizeof(uchar) * max_doc);
2250
+ ir->get_norms_into(ir, fi->name, norm_buf, 0);
2251
+ for (k = 0; k < max_doc; k++) {
2252
+ if (!ir->is_deleted(ir, k)) {
2253
+ os_write_byte(os, norm_buf[k]);
2254
+ }
2255
+ }
2256
+ free(norm_buf);
2257
+ }
2258
+ os_close(os);
2259
+ }
2260
+ }
2261
+ }
2262
+
2263
+ void sm_merge_vectors(SegmentMerger *sm)
2264
+ {
2265
+ int i, j, max_doc;
2266
+ TermVectorsWriter *tvw = tvw_open(sm->store, sm->name, sm->fis);
2267
+ IndexReader *ir;
2268
+ Array *tvs;
2269
+ for (i = 0; i < sm->readers->size; i++) {
2270
+ ir = sm->readers->elems[i];
2271
+ max_doc = ir->max_doc(ir);
2272
+ for (j = 0; j < max_doc; j++) {
2273
+ // skip deleted docs
2274
+ if (! ir->is_deleted(ir, j)) {
2275
+ tvs = ir->get_term_vectors(ir, j);
2276
+ tvw_add_all_doc_vectors(tvw, tvs);
2277
+ ary_destroy(tvs);
2278
+ }
2279
+ }
2280
+ }
2281
+ tvw_close(tvw);
2282
+ }
2283
+
2284
+ int sm_merge(SegmentMerger *sm)
2285
+ {
2286
+ int doc_count = sm_merge_fields(sm);
2287
+ sm_merge_terms(sm);
2288
+ sm_merge_norms(sm);
2289
+ if (fis_has_vectors(sm->fis))
2290
+ sm_merge_vectors(sm);
2291
+ return doc_count;
2292
+ }
2293
+
2294
+ Array *sm_create_compound_file(SegmentMerger *sm, char *file_name)
2295
+ {
2296
+ Array *files = ary_create(0, &efree);
2297
+ CompoundWriter *cw = open_cw(sm->store, file_name);
2298
+ FieldInfo *fi;
2299
+ char fname[SEGMENT_NAME_MAX_LENGTH];
2300
+
2301
+ int i;
2302
+ for (i = 0; i < NELEMS(COMPOUND_EXTENSIONS); i++) {
2303
+ sprintf(fname, "%s.%s", sm->name, COMPOUND_EXTENSIONS[i]);
2304
+ ary_append(files, estrdup(fname));
2305
+ }
2306
+
2307
+ // Field norm files
2308
+ for (i = 0; i < sm->fis->fcnt; i++) {
2309
+ fi = sm->fis->by_number[i];
2310
+ if (fi->is_indexed && !fi->omit_norms) {
2311
+ sprintf(fname, "%s.f%d", sm->name, i);
2312
+ ary_append(files, estrdup(fname));
2313
+ }
2314
+ }
2315
+
2316
+ // Vector files
2317
+ if (fis_has_vectors(sm->fis)) {
2318
+ for (i = 0; i < NELEMS(VECTOR_EXTENSIONS); i++) {
2319
+ sprintf(fname, "%s.%s", sm->name, VECTOR_EXTENSIONS[i]);
2320
+ ary_append(files, estrdup(fname));
2321
+ }
2322
+ }
2323
+
2324
+ // Now merge all added files
2325
+ for (i = 0; i < files->size; i++) {
2326
+ cw_add_file(cw, (char *)files->elems[i]);
2327
+ }
2328
+
2329
+ // Perform the merge
2330
+ cw_close(cw);
2331
+
2332
+ return files;
2333
+ }
2334
+
2335
+ /****************************************************************************
2336
+ *
2337
+ * IndexReader
2338
+ *
2339
+ ****************************************************************************/
2340
+
2341
+ void ir_acquire_not_necessary(IndexReader *ir) {}
2342
+ void ir_acquire_write_lock(IndexReader *ir)
2343
+ {
2344
+ if (ir->is_stale)
2345
+ eprintf(STATE_ERROR, "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations");
2346
+
2347
+ if (ir->write_lock == NULL) {
2348
+ ir->write_lock = ir->store->open_lock(ir->store, WRITE_LOCK_NAME);
2349
+ if (!ir->write_lock->obtain(ir->write_lock)) // obtain write lock
2350
+ eprintf(STATE_ERROR, "Index locked for write: %s", WRITE_LOCK_NAME);
2351
+
2352
+ // we have to check whether index has changed since this reader was opened.
2353
+ // if so, this reader is no longer valid for deletion
2354
+ if (sis_read_current_version(ir->store) > ir->sis->version) {
2355
+ ir->is_stale = true;
2356
+ ir->write_lock->release(ir->write_lock);
2357
+ ir->store->close_lock(ir->write_lock);
2358
+ ir->write_lock = NULL;
2359
+ eprintf(STATE_ERROR, "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations");
2360
+ }
2361
+ }
2362
+ }
2363
+
2364
+ IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner, int close_store)
2365
+ {
2366
+ IndexReader *ir = ALLOC(IndexReader);
2367
+
2368
+ mutex_init(&ir->mutex, NULL);
2369
+ ir->is_owner = is_owner;
2370
+ if (is_owner) {
2371
+ ir->acquire_write_lock = &ir_acquire_write_lock;
2372
+ } else {
2373
+ ir->acquire_write_lock = &ir_acquire_not_necessary;
2374
+ }
2375
+
2376
+ ir->store = store;
2377
+ ir->close_store = close_store;
2378
+ ir->sis = sis;
2379
+ ir->has_changes = false;
2380
+ ir->is_stale = false;
2381
+ ir->write_lock = NULL;
2382
+ ir->cache = NULL;
2383
+ ir->sort_cache = NULL;
2384
+ return ir;
2385
+ }
2386
+
2387
+ IndexReader *ir_open(Store *store, int close_store)
2388
+ {
2389
+ int i;
2390
+ IndexReader *ir;
2391
+ SegmentInfos *sis;
2392
+
2393
+ mutex_lock(&store->mutex);
2394
+ sis = sis_create();
2395
+ sis_read(sis, store);
2396
+ if (sis->scnt == 1) {
2397
+ ir = sr_open(sis, 0, true, close_store);
2398
+ } else {
2399
+ IndexReader **readers = ALLOC_N(IndexReader *, sis->scnt);
2400
+ for (i = 0; i < sis->scnt; i++) {
2401
+ readers[i] = sr_open(sis, i, false, false);
2402
+ }
2403
+ ir = mr_open(store, sis, readers, sis->scnt, close_store);
2404
+ }
2405
+ mutex_unlock(&store->mutex);
2406
+ return ir;
2407
+ }
2408
+
2409
+ bool ir_index_exists(Store *store)
2410
+ {
2411
+ return store->exists(store, "segments");
2412
+ }
2413
+
2414
+ void ir_set_norm(IndexReader *ir, int doc_num, char *field, uchar val)
2415
+ {
2416
+ mutex_lock(&ir->mutex);
2417
+ ir->acquire_write_lock(ir);
2418
+ ir->do_set_norm(ir, doc_num, field, val);
2419
+ ir->has_changes = true;
2420
+ mutex_unlock(&ir->mutex);
2421
+ }
2422
+
2423
+ void ir_undelete_all(IndexReader *ir)
2424
+ {
2425
+ mutex_lock(&ir->mutex);
2426
+ ir->acquire_write_lock(ir);
2427
+ ir->do_undelete_all(ir);
2428
+ ir->has_changes = true;
2429
+ mutex_unlock(&ir->mutex);
2430
+ }
2431
+
2432
+ void ir_delete_doc(IndexReader *ir, int doc_num)
2433
+ {
2434
+ mutex_lock(&ir->mutex);
2435
+ ir->acquire_write_lock(ir);
2436
+ ir->do_delete_doc(ir, doc_num);
2437
+ ir->has_changes = true;
2438
+ mutex_unlock(&ir->mutex);
2439
+ }
2440
+
2441
+ Document *ir_get_doc_with_term(IndexReader *ir, Term *term)
2442
+ {
2443
+ TermDocEnum *tde = ir_term_docs_for(ir, term);
2444
+ if (!tde) return NULL;
2445
+
2446
+ Document *doc = NULL;
2447
+ if (tde->next(tde))
2448
+ doc = ir->get_doc(ir, tde->doc_num(tde));
2449
+ tde->close(tde);
2450
+ return doc;
2451
+ }
2452
+
2453
+ TermDocEnum *ir_term_docs_for(IndexReader *ir, Term *term)
2454
+ {
2455
+ TermDocEnum *tde = ir->term_docs(ir);
2456
+ tde->seek(tde, term);
2457
+ return tde;
2458
+ }
2459
+
2460
+ TermDocEnum *ir_term_positions_for(IndexReader *ir, Term *term)
2461
+ {
2462
+ TermDocEnum *tde = ir->term_positions(ir);
2463
+ tde->seek(tde, term);
2464
+ return tde;
2465
+ }
2466
+
2467
+ void ir_commit_internal(IndexReader *ir)
2468
+ {
2469
+ if (ir->has_changes) {
2470
+ if (ir->is_owner) {
2471
+
2472
+ mutex_lock(&ir->store->mutex);
2473
+ Lock *commit_lock = ir->store->open_lock(ir->store, COMMIT_LOCK_NAME);
2474
+ if (!commit_lock->obtain(commit_lock)) // obtain write lock
2475
+ eprintf(STATE_ERROR, "Index locked for commit: %s", COMMIT_LOCK_NAME);
2476
+
2477
+ ir->do_commit(ir);
2478
+ sis_write(ir->sis, ir->store);
2479
+
2480
+ commit_lock->release(commit_lock);
2481
+ ir->store->close_lock(commit_lock);
2482
+ mutex_unlock(&ir->store->mutex);
2483
+
2484
+ if (ir->write_lock != NULL) {
2485
+ ir->write_lock->release(ir->write_lock); // release write lock
2486
+ ir->store->close_lock(ir->write_lock);
2487
+ ir->write_lock = NULL;
2488
+ }
2489
+ } else {
2490
+ ir->do_commit(ir);
2491
+ }
2492
+ ir->has_changes = false;
2493
+ }
2494
+ }
2495
+
2496
+ void ir_commit(IndexReader *ir)
2497
+ {
2498
+ mutex_lock(&ir->mutex);
2499
+ ir_commit_internal(ir);
2500
+ mutex_unlock(&ir->mutex);
2501
+ }
2502
+
2503
+ void ir_close(IndexReader *ir)
2504
+ {
2505
+ mutex_lock(&ir->mutex);
2506
+ ir_commit_internal(ir);
2507
+ ir->do_close(ir);
2508
+ if (ir->close_store) {
2509
+ ir->store->close(ir->store);
2510
+ }
2511
+ if (ir->is_owner) {
2512
+ sis_destroy(ir->sis);
2513
+ }
2514
+ if (ir->cache) {
2515
+ h_destroy(ir->cache);
2516
+ }
2517
+ if (ir->sort_cache) {
2518
+ h_destroy(ir->sort_cache);
2519
+ }
2520
+
2521
+ mutex_destroy(&ir->mutex);
2522
+ free(ir);
2523
+ }
2524
+
2525
+ void ir_destroy(void *p)
2526
+ {
2527
+ IndexReader *ir = (IndexReader *)p;
2528
+ ir_close(ir);
2529
+ }
2530
+
2531
+ /**
2532
+ * Don't call this method if the cache already exists
2533
+ **/
2534
+ void ir_add_cache(IndexReader *ir)
2535
+ {
2536
+ ir->cache = co_hsh_create();
2537
+ }
2538
+
2539
+ bool ir_is_latest(IndexReader *ir)
2540
+ {
2541
+ return sis_read_current_version(ir->store) == ir->sis->version;
2542
+ }
2543
+