isomorfeus-ferret 0.12.7 → 0.13.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (166)
  1. checksums.yaml +4 -4
  2. data/LICENSE +101 -19
  3. data/README.md +85 -13
  4. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +22 -30
  5. data/ext/isomorfeus_ferret_ext/bm_hash.c +6 -12
  6. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +3 -6
  7. data/ext/isomorfeus_ferret_ext/bm_store.c +11 -22
  8. data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +1 -1
  9. data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +1 -1
  10. data/ext/isomorfeus_ferret_ext/bzlib.c +1572 -0
  11. data/ext/isomorfeus_ferret_ext/bzlib.h +282 -0
  12. data/ext/isomorfeus_ferret_ext/bzlib_blocksort.c +1094 -0
  13. data/ext/isomorfeus_ferret_ext/bzlib_compress.c +672 -0
  14. data/ext/isomorfeus_ferret_ext/bzlib_crctable.c +104 -0
  15. data/ext/isomorfeus_ferret_ext/bzlib_decompress.c +652 -0
  16. data/ext/isomorfeus_ferret_ext/bzlib_huffman.c +205 -0
  17. data/ext/isomorfeus_ferret_ext/bzlib_private.h +509 -0
  18. data/ext/isomorfeus_ferret_ext/bzlib_randtable.c +84 -0
  19. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -53
  20. data/ext/isomorfeus_ferret_ext/frb_analysis.c +785 -1192
  21. data/ext/isomorfeus_ferret_ext/frb_index.c +497 -495
  22. data/ext/isomorfeus_ferret_ext/frb_qparser.c +48 -60
  23. data/ext/isomorfeus_ferret_ext/frb_search.c +1520 -1002
  24. data/ext/isomorfeus_ferret_ext/frb_store.c +96 -96
  25. data/ext/isomorfeus_ferret_ext/frb_threading.h +0 -1
  26. data/ext/isomorfeus_ferret_ext/frb_utils.c +147 -196
  27. data/ext/isomorfeus_ferret_ext/frt_analysis.c +695 -1090
  28. data/ext/isomorfeus_ferret_ext/frt_analysis.h +174 -170
  29. data/ext/isomorfeus_ferret_ext/frt_array.c +2 -4
  30. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +9 -16
  31. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +32 -81
  32. data/ext/isomorfeus_ferret_ext/frt_document.c +15 -20
  33. data/ext/isomorfeus_ferret_ext/frt_document.h +10 -10
  34. data/ext/isomorfeus_ferret_ext/frt_except.c +5 -12
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +3 -3
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +6 -7
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +35 -46
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +1 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +91 -200
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +7 -18
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +1 -2
  42. data/ext/isomorfeus_ferret_ext/frt_ind.c +32 -35
  43. data/ext/isomorfeus_ferret_ext/frt_ind.h +9 -9
  44. data/ext/isomorfeus_ferret_ext/frt_index.c +603 -410
  45. data/ext/isomorfeus_ferret_ext/frt_index.h +272 -291
  46. data/ext/isomorfeus_ferret_ext/frt_lang.c +0 -2
  47. data/ext/isomorfeus_ferret_ext/frt_mempool.c +1 -2
  48. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +4 -7
  49. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +68 -91
  50. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +35 -38
  51. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +53 -72
  52. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +25 -32
  53. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +21 -23
  54. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +66 -103
  55. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +207 -195
  56. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +20 -16
  57. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +17 -14
  58. data/ext/isomorfeus_ferret_ext/frt_q_range.c +102 -131
  59. data/ext/isomorfeus_ferret_ext/frt_q_span.c +179 -178
  60. data/ext/isomorfeus_ferret_ext/frt_q_term.c +47 -60
  61. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +18 -16
  62. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +45 -84
  63. data/ext/isomorfeus_ferret_ext/frt_search.c +105 -146
  64. data/ext/isomorfeus_ferret_ext/frt_search.h +331 -320
  65. data/ext/isomorfeus_ferret_ext/frt_similarity.c +5 -13
  66. data/ext/isomorfeus_ferret_ext/frt_similarity.h +7 -12
  67. data/ext/isomorfeus_ferret_ext/frt_sort.c +105 -149
  68. data/ext/isomorfeus_ferret_ext/frt_store.c +13 -7
  69. data/ext/isomorfeus_ferret_ext/frt_store.h +10 -2
  70. data/ext/isomorfeus_ferret_ext/frt_threading.h +0 -1
  71. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +22 -112
  72. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +2 -32
  73. data/ext/isomorfeus_ferret_ext/lz4.c +2495 -0
  74. data/ext/isomorfeus_ferret_ext/lz4.h +774 -0
  75. data/ext/isomorfeus_ferret_ext/lz4frame.c +1899 -0
  76. data/ext/isomorfeus_ferret_ext/lz4frame.h +623 -0
  77. data/ext/isomorfeus_ferret_ext/lz4hc.c +1615 -0
  78. data/ext/isomorfeus_ferret_ext/lz4hc.h +413 -0
  79. data/ext/isomorfeus_ferret_ext/lz4xxhash.c +1030 -0
  80. data/ext/isomorfeus_ferret_ext/lz4xxhash.h +328 -0
  81. data/ext/isomorfeus_ferret_ext/stem_modules.h +0 -86
  82. data/ext/isomorfeus_ferret_ext/test.c +0 -17
  83. data/ext/isomorfeus_ferret_ext/test_1710.c +11 -12
  84. data/ext/isomorfeus_ferret_ext/test_analysis.c +590 -583
  85. data/ext/isomorfeus_ferret_ext/test_compound_io.c +1 -1
  86. data/ext/isomorfeus_ferret_ext/test_document.c +19 -15
  87. data/ext/isomorfeus_ferret_ext/test_except.c +1 -2
  88. data/ext/isomorfeus_ferret_ext/test_fields.c +59 -60
  89. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +10 -27
  90. data/ext/isomorfeus_ferret_ext/test_filter.c +11 -8
  91. data/ext/isomorfeus_ferret_ext/test_global.c +0 -46
  92. data/ext/isomorfeus_ferret_ext/test_hash.c +2 -2
  93. data/ext/isomorfeus_ferret_ext/test_hashset.c +1 -1
  94. data/ext/isomorfeus_ferret_ext/test_highlighter.c +15 -11
  95. data/ext/isomorfeus_ferret_ext/test_index.c +372 -365
  96. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +5 -3
  97. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +5 -3
  98. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +13 -10
  99. data/ext/isomorfeus_ferret_ext/test_q_parser.c +45 -7
  100. data/ext/isomorfeus_ferret_ext/test_q_span.c +15 -12
  101. data/ext/isomorfeus_ferret_ext/test_ram_store.c +3 -3
  102. data/ext/isomorfeus_ferret_ext/test_search.c +60 -64
  103. data/ext/isomorfeus_ferret_ext/test_segments.c +5 -4
  104. data/ext/isomorfeus_ferret_ext/test_sort.c +17 -14
  105. data/ext/isomorfeus_ferret_ext/test_store.c +2 -0
  106. data/ext/isomorfeus_ferret_ext/test_term.c +3 -1
  107. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +9 -10
  108. data/ext/isomorfeus_ferret_ext/test_test.c +1 -2
  109. data/ext/isomorfeus_ferret_ext/test_threading.c +9 -10
  110. data/ext/isomorfeus_ferret_ext/testhelper.c +1 -2
  111. data/lib/isomorfeus/ferret/version.rb +1 -1
  112. metadata +27 -57
  113. data/ext/isomorfeus_ferret_ext/email.rl +0 -21
  114. data/ext/isomorfeus_ferret_ext/frt_scanner.c +0 -900
  115. data/ext/isomorfeus_ferret_ext/frt_scanner.h +0 -28
  116. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +0 -6706
  117. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +0 -4420
  118. data/ext/isomorfeus_ferret_ext/scanner.h +0 -28
  119. data/ext/isomorfeus_ferret_ext/scanner.in +0 -43
  120. data/ext/isomorfeus_ferret_ext/scanner.rl +0 -84
  121. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +0 -200
  122. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +0 -85
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +0 -1167
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +0 -6
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +0 -1433
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +0 -6
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +0 -301
  128. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +0 -6
  129. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +0 -590
  130. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +0 -6
  131. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +0 -1049
  132. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +0 -6
  133. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +0 -705
  134. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +0 -6
  135. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +0 -1239
  136. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +0 -6
  137. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +0 -477
  138. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +0 -6
  139. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +0 -1217
  140. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +0 -7
  141. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +0 -394
  142. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +0 -6
  143. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +0 -457
  144. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +0 -6
  145. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +0 -1009
  146. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +0 -6
  147. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +0 -259
  148. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +0 -6
  149. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +0 -704
  150. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +0 -6
  151. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +0 -948
  152. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +0 -6
  153. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +0 -1028
  154. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +0 -6
  155. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +0 -275
  156. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +0 -6
  157. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +0 -849
  158. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +0 -6
  159. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +0 -952
  160. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +0 -6
  161. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +0 -669
  162. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +0 -6
  163. data/ext/isomorfeus_ferret_ext/stem_modules.txt +0 -63
  164. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +0 -1854
  165. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +0 -1999
  166. data/ext/isomorfeus_ferret_ext/url.rl +0 -27
@@ -8,7 +8,13 @@
8
8
  #include <ctype.h>
9
9
  #include "brotli_decode.h"
10
10
  #include "brotli_encode.h"
11
+ #include "bzlib.h"
12
+ #include "lz4frame.h"
11
13
 
14
+ #undef close
15
+ #undef read
16
+
17
+ extern rb_encoding *utf8_encoding;
12
18
  extern void frt_micro_sleep(const int micro_seconds);
13
19
 
14
20
  #define GET_LOCK(lock, name, store, err_msg) do {\
@@ -41,8 +47,9 @@ static char *ste_next(FrtTermEnum *te);
41
47
  #define FORMAT 0
42
48
  #define SEGMENTS_GEN_FILE_NAME "segments"
43
49
  #define MAX_EXT_LEN 10
44
- #define COMPRESSION_BUFFER_SIZE 16348
45
- #define COMPRESSION_LEVEL 9
50
+ #define FRT_COMPRESSION_BUFFER_SIZE 16348
51
+ #define FRT_BROTLI_COMPRESSION_LEVEL 4
52
+ #define FRT_BZIP_COMPRESSION_LEVEL 9
46
53
 
47
54
  /* *** Must be three characters *** */
48
55
  static const char *INDEX_EXTENSIONS[] = {
@@ -103,29 +110,22 @@ static frt_u64 str36_to_u64(char *p)
103
110
  * @param ext extension of the filename (including .)
104
111
  * @param gen generation
105
112
  */
106
- char *frt_fn_for_generation(char *buf,
107
- const char *base,
108
- const char *ext,
109
- frt_i64 gen)
110
- {
113
+ char *frt_fn_for_generation(char *buf, const char *base, const char *ext, frt_i64 gen) {
111
114
  if (-1 == gen) {
112
115
  return NULL;
113
- }
114
- else {
116
+ } else {
115
117
  char b[FRT_SEGMENT_NAME_MAX_LENGTH];
116
118
  char *u = u64_to_str36(b, FRT_SEGMENT_NAME_MAX_LENGTH, (frt_u64)gen);
117
119
  if (ext == NULL) {
118
120
  sprintf(buf, "%s_%s", base, u);
119
- }
120
- else {
121
+ } else {
121
122
  sprintf(buf, "%s_%s.%s", base, u, ext);
122
123
  }
123
124
  return buf;
124
125
  }
125
126
  }
126
127
 
127
- static char *segfn_for_generation(char *buf, frt_u64 generation)
128
- {
128
+ static char *segfn_for_generation(char *buf, frt_u64 generation) {
129
129
  char b[FRT_SEGMENT_NAME_MAX_LENGTH];
130
130
  char *u = u64_to_str36(b, FRT_SEGMENT_NAME_MAX_LENGTH, generation);
131
131
  sprintf(buf, FRT_SEGMENTS_FILE_NAME"_%s", u);
@@ -203,8 +203,7 @@ FrtCacheObject *frt_co_create(FrtHash *ref_tab1, FrtHash *ref_tab2,
203
203
  return self;
204
204
  }
205
205
 
206
- FrtHash *frt_co_hash_create()
207
- {
206
+ FrtHash *frt_co_hash_create(void) {
208
207
  return frt_h_new(&co_hash, &co_eq, (frt_free_ft)NULL, (frt_free_ft)&co_destroy);
209
208
  }
210
209
 
@@ -214,22 +213,33 @@ FrtHash *frt_co_hash_create()
214
213
  *
215
214
  ****************************************************************************/
216
215
 
217
- static void fi_set_store(FrtFieldInfo *fi, int store)
218
- {
216
+ static void fi_set_store(FrtFieldInfo *fi, FrtStoreValue store) {
219
217
  switch (store) {
220
218
  case FRT_STORE_NO:
221
219
  break;
222
220
  case FRT_STORE_YES:
223
221
  fi->bits |= FRT_FI_IS_STORED_BM;
224
222
  break;
225
- case FRT_STORE_COMPRESS:
226
- fi->bits |= FRT_FI_IS_COMPRESSED_BM | FRT_FI_IS_STORED_BM;
223
+ }
224
+ }
225
+
226
+ static void fi_set_compression(FrtFieldInfo *fi, FrtCompressionType compression) {
227
+ switch (compression) {
228
+ case FRT_COMPRESSION_NONE:
229
+ break;
230
+ case FRT_COMPRESSION_BROTLI:
231
+ fi->bits |= FRT_FI_IS_COMPRESSED_BM | FRT_FI_COMPRESSION_BROTLI_BM;
232
+ break;
233
+ case FRT_COMPRESSION_BZ2:
234
+ fi->bits |= FRT_FI_IS_COMPRESSED_BM | FRT_FI_COMPRESSION_BZ2_BM;
235
+ break;
236
+ case FRT_COMPRESSION_LZ4:
237
+ fi->bits |= FRT_FI_IS_COMPRESSED_BM | FRT_FI_COMPRESSION_LZ4_BM;
227
238
  break;
228
239
  }
229
240
  }
230
241
 
231
- static void fi_set_index(FrtFieldInfo *fi, int index)
232
- {
242
+ static void fi_set_index(FrtFieldInfo *fi, FrtIndexValue index) {
233
243
  switch (index) {
234
244
  case FRT_INDEX_NO:
235
245
  break;
@@ -249,8 +259,7 @@ static void fi_set_index(FrtFieldInfo *fi, int index)
249
259
  }
250
260
  }
251
261
 
252
- static void fi_set_term_vector(FrtFieldInfo *fi, int term_vector)
253
- {
262
+ static void fi_set_term_vector(FrtFieldInfo *fi, FrtTermVectorValue term_vector) {
254
263
  switch (term_vector) {
255
264
  case FRT_TERM_VECTOR_NO:
256
265
  break;
@@ -270,33 +279,40 @@ static void fi_set_term_vector(FrtFieldInfo *fi, int term_vector)
270
279
  }
271
280
  }
272
281
 
273
- static void fi_check_params(int store, int index, int term_vector)
274
- {
282
+ static void fi_check_params(FrtStoreValue store, FrtCompressionType compression, FrtIndexValue index, FrtTermVectorValue term_vector) {
275
283
  (void)store;
276
284
  if ((index == FRT_INDEX_NO) && (term_vector != FRT_TERM_VECTOR_NO)) {
277
- FRT_RAISE(FRT_ARG_ERROR,
278
- "You can't store the term vectors of an unindexed field");
285
+ FRT_RAISE(FRT_ARG_ERROR, "You can't store the term vectors of an unindexed field.");
286
+ }
287
+ if ((compression != FRT_COMPRESSION_NONE) && (store == FRT_STORE_NO)) {
288
+ FRT_RAISE(FRT_ARG_ERROR, "Field must be stored for compression to be useful.");
279
289
  }
280
290
  }
281
291
 
282
- FrtFieldInfo *frt_fi_new(FrtSymbol name,
283
- FrtStoreValue store,
284
- FrtIndexValue index,
285
- FrtTermVectorValue term_vector)
286
- {
287
- FrtFieldInfo *fi = FRT_ALLOC(FrtFieldInfo);
292
+ FrtFieldInfo *frt_fi_alloc(void) {
293
+ return FRT_ALLOC(FrtFieldInfo);
294
+ }
295
+
296
+ FrtFieldInfo *frt_fi_init(FrtFieldInfo *fi, ID name, FrtStoreValue store, FrtCompressionType compression, FrtIndexValue index, FrtTermVectorValue term_vector) {
288
297
  assert(NULL != name);
289
- fi_check_params(store, index, term_vector);
298
+ fi_check_params(store, compression, index, term_vector);
290
299
  fi->name = name;
291
300
  fi->boost = 1.0f;
292
301
  fi->bits = 0;
293
302
  fi_set_store(fi, store);
303
+ fi_set_compression(fi, compression);
294
304
  fi_set_index(fi, index);
295
305
  fi_set_term_vector(fi, term_vector);
296
306
  fi->ref_cnt = 1;
307
+ fi->rfi = Qnil;
297
308
  return fi;
298
309
  }
299
310
 
311
+ FrtFieldInfo *frt_fi_new(ID name, FrtStoreValue store, FrtCompressionType compression, FrtIndexValue index, FrtTermVectorValue term_vector) {
312
+ FrtFieldInfo *fi = frt_fi_alloc();
313
+ return frt_fi_init(fi, name, store, compression, index, term_vector);
314
+ }
315
+
300
316
  void frt_fi_deref(FrtFieldInfo *fi)
301
317
  {
302
318
  if (0 == --(fi->ref_cnt)) {
@@ -304,6 +320,22 @@ void frt_fi_deref(FrtFieldInfo *fi)
304
320
  }
305
321
  }
306
322
 
323
+ FrtCompressionType frt_fi_get_compression(FrtFieldInfo *fi) {
324
+ if (fi_is_compressed(fi)) {
325
+ if (fi_is_compressed_brotli(fi)) {
326
+ return FRT_COMPRESSION_BROTLI;
327
+ } else if (fi_is_compressed_bz2(fi)) {
328
+ return FRT_COMPRESSION_BZ2;
329
+ } else if (fi_is_compressed_lz4(fi)) {
330
+ return FRT_COMPRESSION_LZ4;
331
+ } else {
332
+ return FRT_COMPRESSION_BROTLI;
333
+ }
334
+ } else {
335
+ return FRT_COMPRESSION_NONE;
336
+ }
337
+ }
338
+
307
339
  char *frt_fi_to_s(FrtFieldInfo *fi)
308
340
  {
309
341
  const char *fi_name = rb_id2name(fi->name);
@@ -333,24 +365,31 @@ char *frt_fi_to_s(FrtFieldInfo *fi)
333
365
  *
334
366
  ****************************************************************************/
335
367
 
336
- FrtFieldInfos *frt_fis_new(FrtStoreValue store, FrtIndexValue index,
337
- FrtTermVectorValue term_vector)
338
- {
339
- FrtFieldInfos *fis = FRT_ALLOC(FrtFieldInfos);
340
- fi_check_params(store, index, term_vector);
368
+ FrtFieldInfos *frt_fis_alloc(void) {
369
+ return FRT_ALLOC(FrtFieldInfos);
370
+ }
371
+
372
+ FrtFieldInfos *frt_fis_init(FrtFieldInfos *fis, FrtStoreValue store, FrtCompressionType compression, FrtIndexValue index, FrtTermVectorValue term_vector) {
373
+ fi_check_params(store, compression, index, term_vector);
341
374
  fis->field_dict = frt_h_new_ptr((frt_free_ft)&frt_fi_deref);
342
375
  fis->size = 0;
343
376
  fis->capa = FIELD_INFOS_INIT_CAPA;
344
377
  fis->fields = FRT_ALLOC_N(FrtFieldInfo *, fis->capa);
345
378
  fis->store = store;
379
+ fis->compression = compression;
346
380
  fis->index = index;
347
381
  fis->term_vector = term_vector;
348
382
  fis->ref_cnt = 1;
383
+ fis->rfis = Qnil;
349
384
  return fis;
350
385
  }
351
386
 
352
- FrtFieldInfo *frt_fis_add_field(FrtFieldInfos *fis, FrtFieldInfo *fi)
353
- {
387
+ FrtFieldInfos *frt_fis_new(FrtStoreValue store, FrtCompressionType compression, FrtIndexValue index, FrtTermVectorValue term_vector) {
388
+ FrtFieldInfos *fis = frt_fis_alloc();
389
+ return frt_fis_init(fis, store, compression, index, term_vector);
390
+ }
391
+
392
+ FrtFieldInfo *frt_fis_add_field(FrtFieldInfos *fis, FrtFieldInfo *fi) {
354
393
  if (fis->size == fis->capa) {
355
394
  fis->capa <<= 1;
356
395
  FRT_REALLOC_N(fis->fields, FrtFieldInfo *, fis->capa);
@@ -364,23 +403,20 @@ FrtFieldInfo *frt_fis_add_field(FrtFieldInfos *fis, FrtFieldInfo *fi)
364
403
  return fi;
365
404
  }
366
405
 
367
- FrtFieldInfo *frt_fis_get_field(FrtFieldInfos *fis, FrtSymbol name)
368
- {
406
+ FrtFieldInfo *frt_fis_get_field(FrtFieldInfos *fis, ID name) {
369
407
  return (FrtFieldInfo *)frt_h_get(fis->field_dict, (void *)name);
370
408
  }
371
409
 
372
- int frt_fis_get_field_num(FrtFieldInfos *fis, FrtSymbol name)
373
- {
410
+ int frt_fis_get_field_num(FrtFieldInfos *fis, ID name) {
374
411
  FrtFieldInfo *fi = (FrtFieldInfo *)frt_h_get(fis->field_dict, (void *)name);
375
412
  if (fi) { return fi->number; }
376
413
  else { return -1; }
377
414
  }
378
415
 
379
- FrtFieldInfo *frt_fis_get_or_add_field(FrtFieldInfos *fis, FrtSymbol name)
380
- {
416
+ FrtFieldInfo *frt_fis_get_or_add_field(FrtFieldInfos *fis, ID name) {
381
417
  FrtFieldInfo *fi = (FrtFieldInfo *)frt_h_get(fis->field_dict, (void *)name);
382
418
  if (!fi) {
383
- fi = (FrtFieldInfo*)frt_fi_new(name, fis->store, fis->index, fis->term_vector);
419
+ fi = (FrtFieldInfo*)frt_fi_new(name, fis->store, fis->compression, fis->index, fis->term_vector);
384
420
  frt_fis_add_field(fis, fi);
385
421
  }
386
422
  return fi;
@@ -392,16 +428,14 @@ FrtFieldInfos *frt_fis_read(FrtInStream *is)
392
428
  char *field_name;
393
429
  FRT_TRY
394
430
  do {
395
- FrtStoreValue store_val;
396
- FrtIndexValue index_val;
397
431
  FrtTermVectorValue term_vector_val;
398
432
  volatile int i;
399
433
  union { frt_u32 i; float f; } tmp;
400
434
  FrtFieldInfo *volatile fi;
401
- store_val = (FrtStoreValue)frt_is_read_vint(is);
402
- index_val = (FrtIndexValue)frt_is_read_vint(is);
435
+ FrtStoreValue store_val = (FrtStoreValue)frt_is_read_vint(is);
436
+ FrtIndexValue index_val = (FrtIndexValue)frt_is_read_vint(is);
403
437
  term_vector_val = (FrtTermVectorValue)frt_is_read_vint(is);
404
- fis = frt_fis_new(store_val, index_val, term_vector_val);
438
+ fis = frt_fis_new(store_val, FRT_COMPRESSION_NONE, index_val, term_vector_val); // TODO compression, read from store?
405
439
  for (i = frt_is_read_vint(is); i > 0; i--) {
406
440
  fi = FRT_ALLOC_AND_ZERO(FrtFieldInfo);
407
441
  FRT_TRY
@@ -803,8 +837,7 @@ static char *sis_next_seg_file_name(char *buf, FrtStore *store)
803
837
 
804
838
  #define GEN_FILE_RETRY_COUNT 10
805
839
  #define GEN_LOOK_AHEAD_COUNT 10
806
- static void sis_find_segments_file(FrtStore *store, FindSegmentsFile *fsf,
807
- void (*run)(FrtStore *store, FindSegmentsFile *fsf))
840
+ static void sis_find_segments_file(FrtStore *store, FindSegmentsFile *fsf, void (*run)(FrtStore *store, FindSegmentsFile *fsf, FrtIndexReader *ir), FrtIndexReader *ir)
808
841
  {
809
842
  volatile int i;
810
843
  volatile int gen_look_ahead_count = 0;
@@ -911,7 +944,7 @@ static void sis_find_segments_file(FrtStore *store, FindSegmentsFile *fsf,
911
944
  last_gen = gen;
912
945
  FRT_TRY
913
946
  fsf->generation = gen;
914
- run(store, fsf);
947
+ run(store, fsf, ir);
915
948
  FRT_RETURN_EARLY();
916
949
  return;
917
950
  case FRT_IO_ERROR: case FRT_FILE_NOT_FOUND_ERROR: case FRT_EOF_ERROR:
@@ -957,7 +990,7 @@ static void sis_find_segments_file(FrtStore *store, FindSegmentsFile *fsf,
957
990
  * prevSegmentFileName + "'" */
958
991
  FRT_TRY
959
992
  fsf->generation = gen - 1;
960
- run(store, fsf);
993
+ run(store, fsf, ir);
961
994
  /* TODO:LOG "success on fallback " +
962
995
  * prev_seg_file_name */
963
996
 
@@ -1040,7 +1073,7 @@ void frt_sis_del_from_to(FrtSegmentInfos *sis, int from, int to)
1040
1073
  }
1041
1074
  }
1042
1075
 
1043
- static void frt_sis_read_i(FrtStore *store, FindSegmentsFile *fsf)
1076
+ static void frt_sis_read_i(FrtStore *store, FindSegmentsFile *fsf, FrtIndexReader *ir_)
1044
1077
  {
1045
1078
  int seg_cnt;
1046
1079
  int i;
@@ -1079,7 +1112,7 @@ static void frt_sis_read_i(FrtStore *store, FindSegmentsFile *fsf)
1079
1112
  FrtSegmentInfos *frt_sis_read(FrtStore *store)
1080
1113
  {
1081
1114
  FindSegmentsFile fsf;
1082
- sis_find_segments_file(store, &fsf, &frt_sis_read_i);
1115
+ sis_find_segments_file(store, &fsf, &frt_sis_read_i, NULL);
1083
1116
  return fsf.ret.sis;
1084
1117
  }
1085
1118
 
@@ -1119,7 +1152,7 @@ void frt_sis_write(FrtSegmentInfos *sis, FrtStore *store, FrtDeleter *deleter)
1119
1152
  }
1120
1153
  }
1121
1154
 
1122
- static void frt_sis_read_ver_i(FrtStore *store, FindSegmentsFile *fsf)
1155
+ static void frt_sis_read_ver_i(FrtStore *store, FindSegmentsFile *fsf, FrtIndexReader *ir_)
1123
1156
  {
1124
1157
  FrtInStream *is;
1125
1158
  frt_u64 version;
@@ -1142,7 +1175,7 @@ static void frt_sis_read_ver_i(FrtStore *store, FindSegmentsFile *fsf)
1142
1175
  frt_u64 frt_sis_read_current_version(FrtStore *store)
1143
1176
  {
1144
1177
  FindSegmentsFile fsf;
1145
- sis_find_segments_file(store, &fsf, &frt_sis_read_ver_i);
1178
+ sis_find_segments_file(store, &fsf, &frt_sis_read_ver_i, NULL);
1146
1179
  return fsf.ret.uint64;
1147
1180
  }
1148
1181
 
@@ -1152,18 +1185,17 @@ frt_u64 frt_sis_read_current_version(FrtStore *store)
1152
1185
  *
1153
1186
  ****************************************************************************/
1154
1187
 
1155
- static FrtLazyDocField *lazy_df_new(FrtSymbol name, const int size, bool is_compressed)
1156
- {
1188
+ static FrtLazyDocField *lazy_df_new(ID name, const int size, FrtCompressionType compression) {
1157
1189
  FrtLazyDocField *self = FRT_ALLOC(FrtLazyDocField);
1158
1190
  self->name = name;
1159
1191
  self->size = size;
1160
1192
  self->data = FRT_ALLOC_AND_ZERO_N(FrtLazyDocFieldData, size);
1161
- self->is_compressed = is_compressed;
1193
+ self->compression = compression;
1194
+ self->decompressed = false;
1162
1195
  return self;
1163
1196
  }
1164
1197
 
1165
- static void lazy_df_destroy(FrtLazyDocField *self)
1166
- {
1198
+ static void lazy_df_destroy(FrtLazyDocField *self) {
1167
1199
  int i;
1168
1200
  for (i = self->size - 1; i >= 0; i--) {
1169
1201
  if (self->data[i].text) {
@@ -1174,16 +1206,14 @@ static void lazy_df_destroy(FrtLazyDocField *self)
1174
1206
  free(self);
1175
1207
  }
1176
1208
 
1177
- static void comp_raise()
1178
- {
1209
+ static void comp_raise(void) {
1179
1210
  FRT_RAISE(EXCEPTION, "Compression error");
1180
1211
  }
1181
1212
 
1182
- static char *is_read_compressed_bytes(FrtInStream *is, int compressed_len, int *len)
1183
- {
1213
+ static char *is_read_brotli_compressed_bytes(FrtInStream *is, int compressed_len, int *len) {
1184
1214
  int buf_out_idx = 0;
1185
1215
  int read_len;
1186
- frt_uchar buf_in[COMPRESSION_BUFFER_SIZE];
1216
+ frt_uchar buf_in[FRT_COMPRESSION_BUFFER_SIZE];
1187
1217
  const frt_uchar *next_in;
1188
1218
  size_t available_in;
1189
1219
  frt_uchar *buf_out = NULL;
@@ -1195,20 +1225,20 @@ static char *is_read_compressed_bytes(FrtInStream *is, int compressed_len, int *
1195
1225
  if (!b_state) { comp_raise(); return NULL; }
1196
1226
 
1197
1227
  do {
1198
- read_len = compressed_len > COMPRESSION_BUFFER_SIZE ? COMPRESSION_BUFFER_SIZE : compressed_len;
1228
+ read_len = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
1199
1229
  frt_is_read_bytes(is, buf_in, read_len);
1200
1230
  compressed_len -= read_len;
1201
1231
  available_in = read_len;
1202
1232
  next_in = buf_in;
1203
- available_out = COMPRESSION_BUFFER_SIZE;
1233
+ available_out = FRT_COMPRESSION_BUFFER_SIZE;
1204
1234
  do {
1205
- FRT_REALLOC_N(buf_out, frt_uchar, buf_out_idx + COMPRESSION_BUFFER_SIZE);
1235
+ FRT_REALLOC_N(buf_out, frt_uchar, buf_out_idx + FRT_COMPRESSION_BUFFER_SIZE);
1206
1236
  next_out = buf_out + buf_out_idx;
1207
1237
  b_result = BrotliDecoderDecompressStream(b_state,
1208
1238
  &available_in, &next_in,
1209
1239
  &available_out, &next_out, NULL);
1210
1240
  if (b_result == BROTLI_DECODER_RESULT_ERROR) { comp_raise(); return NULL; }
1211
- buf_out_idx += COMPRESSION_BUFFER_SIZE - available_out;
1241
+ buf_out_idx += FRT_COMPRESSION_BUFFER_SIZE - available_out;
1212
1242
  } while (b_result == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT);
1213
1243
  } while (b_result != BROTLI_DECODER_RESULT_SUCCESS && compressed_len > 0);
1214
1244
 
@@ -1220,16 +1250,180 @@ static char *is_read_compressed_bytes(FrtInStream *is, int compressed_len, int *
1220
1250
  return (char *)buf_out;
1221
1251
  }
1222
1252
 
1223
- char *frt_lazy_df_get_data(FrtLazyDocField *self, int i)
1224
- {
1253
+ static void zraise(int ret) {
1254
+ switch (ret) {
1255
+ case BZ_IO_ERROR:
1256
+ if (ferror(stdin))
1257
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: error reading stdin");
1258
+ if (ferror(stdout))
1259
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: error writing stdout");
1260
+ break;
1261
+ case BZ_CONFIG_ERROR:
1262
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: system configuration error");
1263
+ break;
1264
+ case BZ_SEQUENCE_ERROR: /* shouldn't occur if code is correct */
1265
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: !!BUG!! sequence error");
1266
+ break;
1267
+ case BZ_PARAM_ERROR: /* shouldn't occur if code is correct */
1268
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: !!BUG!! parameter error");
1269
+ break;
1270
+ case BZ_MEM_ERROR:
1271
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: memory error");
1272
+ break;
1273
+ case BZ_DATA_ERROR:
1274
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: data integrity check error");
1275
+ break;
1276
+ case BZ_DATA_ERROR_MAGIC:
1277
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: data integrity check - non-matching magic");
1278
+ break;
1279
+ case BZ_UNEXPECTED_EOF:
1280
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: unexpected end-of-file");
1281
+ break;
1282
+ case BZ_OUTBUFF_FULL:
1283
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: output buffer full");
1284
+ break;
1285
+ default:
1286
+ FRT_RAISE(FRT_EXCEPTION, "bzlib: unknown error");
1287
+ }
1288
+ }
1289
+
1290
+ static char *is_read_bz2_compressed_bytes(FrtInStream *is, int compressed_len, int *len) {
1291
+ int buf_out_idx = 0, ret, read_len;
1292
+ char *buf_out = NULL;
1293
+ char buf_in[FRT_COMPRESSION_BUFFER_SIZE];
1294
+ bz_stream zstrm;
1295
+ zstrm.bzalloc = NULL;
1296
+ zstrm.bzfree = NULL;
1297
+ zstrm.opaque = NULL;
1298
+ zstrm.next_in = NULL;
1299
+ zstrm.avail_in = 0;
1300
+ if ((ret = BZ2_bzDecompressInit(&zstrm, 0, 0)) != BZ_OK) zraise(ret);
1301
+
1302
+ do {
1303
+ read_len = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
1304
+ frt_is_read_bytes(is, (frt_uchar *)buf_in, read_len);
1305
+ compressed_len -= read_len;
1306
+ zstrm.avail_in = read_len;
1307
+ zstrm.next_in = buf_in;
1308
+ zstrm.avail_out = FRT_COMPRESSION_BUFFER_SIZE;
1309
+
1310
+ do {
1311
+ REALLOC_N(buf_out, char, buf_out_idx + FRT_COMPRESSION_BUFFER_SIZE);
1312
+ zstrm.next_out = buf_out + buf_out_idx;
1313
+ ret = BZ2_bzDecompress(&zstrm);
1314
+ assert(ret != BZ_SEQUENCE_ERROR); /* state not clobbered */
1315
+ if (ret != BZ_OK && ret != BZ_STREAM_END) {
1316
+ (void)BZ2_bzDecompressEnd(&zstrm);
1317
+ zraise(ret);
1318
+ }
1319
+ buf_out_idx += FRT_COMPRESSION_BUFFER_SIZE - zstrm.avail_out;
1320
+ } while (zstrm.avail_out == 0);
1321
+ } while (ret != BZ_STREAM_END && compressed_len != 0);
1322
+
1323
+ (void)BZ2_bzDecompressEnd(&zstrm);
1324
+
1325
+ FRT_REALLOC_N(buf_out, char, buf_out_idx + 1);
1326
+ buf_out[buf_out_idx] = '\0';
1327
+
1328
+ *len = buf_out_idx;
1329
+ return (char *)buf_out;
1330
+ }
1331
+
1332
+ static char *is_read_lz4_compressed_bytes(FrtInStream *is, int compressed_len, int *length) {
1333
+ frt_uchar buf_in[FRT_COMPRESSION_BUFFER_SIZE];
1334
+ char *buf_out = NULL;
1335
+ int dc_length = 0;
1336
+ LZ4F_dctx *dctx;
1337
+ LZ4F_frameInfo_t frame_info;
1338
+ LZ4F_errorCode_t dctx_status = LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION);
1339
+ if (LZ4F_isError(dctx_status)) { *length = -1; return NULL; }
1340
+
1341
+ /* header and buffer */
1342
+ int read_length = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
1343
+ frt_is_read_bytes(is, buf_in, read_length);
1344
+ compressed_len -= read_length;
1345
+
1346
+ size_t consumed_size = read_length;
1347
+ size_t res = LZ4F_getFrameInfo(dctx, &frame_info, buf_in, &consumed_size);
1348
+ if (LZ4F_isError(res)) { *length = -1; return NULL; }
1349
+ size_t buf_out_length;
1350
+ switch(frame_info.blockSizeID) {
1351
+ case LZ4F_default:
1352
+ case LZ4F_max64KB:
1353
+ buf_out_length = 1 << 16;
1354
+ break;
1355
+ case LZ4F_max256KB:
1356
+ buf_out_length = 1 << 18;
1357
+ break;
1358
+ case LZ4F_max1MB:
1359
+ buf_out_length = 1 << 20;
1360
+ break;
1361
+ case LZ4F_max4MB:
1362
+ buf_out_length = 1 << 22;
1363
+ break;
1364
+ default:
1365
+ buf_out_length = 0;
1366
+ }
1367
+
1368
+ res = 1;
1369
+ int first_chunk = 1;
1370
+
1371
+ /* decompress data */
1372
+ while (res != 0) {
1373
+ if (!first_chunk) {
1374
+ read_length = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
1375
+ frt_is_read_bytes(is, buf_in, read_length);
1376
+ compressed_len -= read_length;
1377
+ consumed_size = 0;
1378
+ }
1379
+ first_chunk = 0;
1380
+
1381
+ char *src = (char *)(buf_in + consumed_size);
1382
+ char *src_end = (char *)buf_in + read_length;
1383
+
1384
+ while (src < src_end && res != 0){
1385
+ size_t dest_length = buf_out_length;
1386
+ size_t consumed_size = read_length;
1387
+ FRT_REALLOC_N(buf_out, char, dc_length + buf_out_length);
1388
+ res = LZ4F_decompress(dctx, buf_out + dc_length, &dest_length, src, &consumed_size, NULL);
1389
+ if (LZ4F_isError(res)) { *length = -1; return NULL; }
1390
+ dc_length += dest_length;
1391
+ src = src + consumed_size;
1392
+ }
1393
+ }
1394
+
1395
+ /* finish up */
1396
+ LZ4F_freeDecompressionContext(dctx);
1397
+
1398
+ FRT_REALLOC_N(buf_out, char, dc_length + 1);
1399
+ buf_out[dc_length] = '\0';
1400
+
1401
+ *length = dc_length;
1402
+ return buf_out;
1403
+ }
1404
+
1405
+ static char *is_read_compressed_bytes(FrtInStream *is, int compressed_len, int *len, FrtCompressionType compression) {
1406
+ switch (compression) {
1407
+ case FRT_COMPRESSION_BROTLI:
1408
+ return is_read_brotli_compressed_bytes(is, compressed_len, len);
1409
+ case FRT_COMPRESSION_BZ2:
1410
+ return is_read_bz2_compressed_bytes(is, compressed_len, len);
1411
+ case FRT_COMPRESSION_LZ4:
1412
+ return is_read_lz4_compressed_bytes(is, compressed_len, len);
1413
+ default:
1414
+ return NULL;
1415
+ }
1416
+ }
1417
+
1418
+ char *frt_lazy_df_get_data(FrtLazyDocField *self, int i) {
1225
1419
  char *text = NULL;
1226
1420
  if (i < self->size && i >= 0) {
1227
1421
  text = self->data[i].text;
1228
1422
  if (NULL == text) {
1229
1423
  const int read_len = self->data[i].length + 1;
1230
1424
  frt_is_seek(self->doc->fields_in, self->data[i].start);
1231
- if (self->is_compressed) {
1232
- self->data[i].text = text = is_read_compressed_bytes(self->doc->fields_in, read_len, &(self->data[i].length));
1425
+ if (self->data[i].compression != FRT_COMPRESSION_NONE) {
1426
+ self->data[i].text = text = is_read_compressed_bytes(self->doc->fields_in, read_len, &(self->data[i].length), self->data[i].compression);
1233
1427
  } else {
1234
1428
  self->data[i].text = text = FRT_ALLOC_N(char, read_len);
1235
1429
  frt_is_read_bytes(self->doc->fields_in, (frt_uchar *)text, read_len);
@@ -1241,9 +1435,8 @@ char *frt_lazy_df_get_data(FrtLazyDocField *self, int i)
1241
1435
  return text;
1242
1436
  }
1243
1437
 
1244
- void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len)
1245
- {
1246
- if (self->is_compressed == 1) {
1438
+ void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len) {
1439
+ if (self->compression != FRT_COMPRESSION_NONE && !self->decompressed) {
1247
1440
  int i;
1248
1441
  self->len = 0;
1249
1442
  for (i = self->size-1; i >= 0; i--) {
@@ -1251,7 +1444,7 @@ void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len)
1251
1444
  self->len += self->data[i].length + 1;
1252
1445
  }
1253
1446
  self->len--; /* each field separated by ' ' but no need to add to end */
1254
- self->is_compressed = 2;
1447
+ self->decompressed = true;
1255
1448
  }
1256
1449
  if (start < 0 || start >= self->len) {
1257
1450
  FRT_RAISE(FRT_IO_ERROR, "start out of range in LazyDocField#get_bytes. %d "
@@ -1264,7 +1457,7 @@ void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len)
1264
1457
  FRT_RAISE(FRT_IO_ERROR, "Tried to read past end of field. Field is only %d "
1265
1458
  "bytes long but tried to read to %d", self->len, start + len);
1266
1459
  }
1267
- if (self->is_compressed) {
1460
+ if (self->compression != FRT_COMPRESSION_NONE) {
1268
1461
  int cur_start = 0, buf_start = 0, cur_end, i, copy_start, copy_len;
1269
1462
  for (i = 0; i < self->size; i++) {
1270
1463
  cur_end = cur_start + self->data[i].length;
@@ -1328,21 +1521,17 @@ static void lazy_doc_add_field(FrtLazyDoc *self, FrtLazyDocField *lazy_df, int i
1328
1521
  lazy_df->doc = self;
1329
1522
  }
1330
1523
 
1331
- FrtLazyDocField *frt_lazy_doc_get(FrtLazyDoc *self, FrtSymbol field)
1332
- {
1524
+ FrtLazyDocField *frt_lazy_doc_get(FrtLazyDoc *self, ID field) {
1333
1525
  return (FrtLazyDocField *)frt_h_get(self->field_dictionary, (void *)field);
1334
1526
  }
1335
1527
 
1336
1528
  /****************************************************************************
1337
- *
1338
1529
  * FrtFieldsReader
1339
- *
1340
1530
  ****************************************************************************/
1341
1531
 
1342
1532
  #define FIELDS_IDX_PTR_SIZE 12
1343
1533
 
1344
- FrtFieldsReader *frt_fr_open(FrtStore *store, const char *segment, FrtFieldInfos *fis)
1345
- {
1534
+ FrtFieldsReader *frt_fr_open(FrtStore *store, const char *segment, FrtFieldInfos *fis) {
1346
1535
  FrtFieldsReader *fr = FRT_ALLOC(FrtFieldsReader);
1347
1536
  FrtInStream *fdx_in;
1348
1537
  char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
@@ -1362,8 +1551,7 @@ FrtFieldsReader *frt_fr_open(FrtStore *store, const char *segment, FrtFieldInfos
1362
1551
  return fr;
1363
1552
  }
1364
1553
 
1365
- FrtFieldsReader *frt_fr_clone(FrtFieldsReader *orig)
1366
- {
1554
+ FrtFieldsReader *frt_fr_clone(FrtFieldsReader *orig) {
1367
1555
  FrtFieldsReader *fr = FRT_ALLOC(FrtFieldsReader);
1368
1556
 
1369
1557
  memcpy(fr, orig, sizeof(FrtFieldsReader));
@@ -1373,35 +1561,33 @@ FrtFieldsReader *frt_fr_clone(FrtFieldsReader *orig)
1373
1561
  return fr;
1374
1562
  }
1375
1563
 
1376
- void frt_fr_close(FrtFieldsReader *fr)
1377
- {
1564
+ void frt_fr_close(FrtFieldsReader *fr) {
1378
1565
  frt_is_close(fr->fdt_in);
1379
1566
  frt_is_close(fr->fdx_in);
1380
1567
  free(fr);
1381
1568
  }
1382
1569
 
1383
- static FrtDocField *frt_fr_df_new(FrtSymbol name, int size, bool is_compressed)
1384
- {
1570
+ static FrtDocField *frt_fr_df_new(ID name, int size, FrtCompressionType compression) {
1385
1571
  FrtDocField *df = FRT_ALLOC(FrtDocField);
1386
1572
  df->name = name;
1387
1573
  df->capa = df->size = size;
1388
1574
  df->data = FRT_ALLOC_N(char *, df->capa);
1389
1575
  df->lengths = FRT_ALLOC_N(int, df->capa);
1576
+ df->encodings = FRT_ALLOC_N(rb_encoding *, df->capa);
1390
1577
  df->destroy_data = true;
1391
1578
  df->boost = 1.0f;
1392
- df->is_compressed = is_compressed;
1579
+ df->compression = compression;
1393
1580
  return df;
1394
1581
  }
1395
1582
 
1396
- static void frt_fr_read_compressed_fields(FrtFieldsReader *fr, FrtDocField *df)
1397
- {
1583
+ static void frt_fr_read_compressed_fields(FrtFieldsReader *fr, FrtDocField *df, FrtCompressionType compression) {
1398
1584
  int i;
1399
1585
  const int df_size = df->size;
1400
1586
  FrtInStream *fdt_in = fr->fdt_in;
1401
1587
 
1402
1588
  for (i = 0; i < df_size; i++) {
1403
1589
  const int compressed_len = df->lengths[i] + 1;
1404
- df->data[i] = is_read_compressed_bytes(fdt_in, compressed_len, &(df->lengths[i]));
1590
+ df->data[i] = is_read_compressed_bytes(fdt_in, compressed_len, &(df->lengths[i]), compression);
1405
1591
  }
1406
1592
  }
1407
1593
 
@@ -1423,18 +1609,20 @@ FrtDocument *frt_fr_get_doc(FrtFieldsReader *fr, int doc_num)
1423
1609
  const int field_num = frt_is_read_vint(fdt_in);
1424
1610
  FrtFieldInfo *fi = fr->fis->fields[field_num];
1425
1611
  const int df_size = frt_is_read_vint(fdt_in);
1426
- FrtDocField *df = frt_fr_df_new(fi->name, df_size, fi_is_compressed(fi));
1612
+ FrtDocField *df = frt_fr_df_new(fi->name, df_size, frt_fi_get_compression(fi));
1427
1613
 
1428
1614
  for (j = 0; j < df_size; j++) {
1429
1615
  df->lengths[j] = frt_is_read_vint(fdt_in);
1616
+ df->encodings[j] = rb_enc_from_index(frt_is_read_vint(fdt_in));
1617
+ df->compression = frt_is_read_vint(fdt_in);
1430
1618
  }
1431
1619
 
1432
1620
  frt_doc_add_field(doc, df);
1433
1621
  }
1434
1622
  for (i = 0; i < stored_cnt; i++) {
1435
1623
  FrtDocField *df = doc->fields[i];
1436
- if (df->is_compressed) {
1437
- frt_fr_read_compressed_fields(fr, df);
1624
+ if (df->compression != FRT_COMPRESSION_NONE) {
1625
+ frt_fr_read_compressed_fields(fr, df, df->compression);
1438
1626
  } else {
1439
1627
  const int df_size = df->size;
1440
1628
  for (j = 0; j < df_size; j++) {
@@ -1458,31 +1646,37 @@ FrtLazyDoc *frt_fr_get_lazy_doc(FrtFieldsReader *fr, int doc_num)
1458
1646
  FrtLazyDoc *lazy_doc;
1459
1647
  FrtInStream *fdx_in = fr->fdx_in;
1460
1648
  FrtInStream *fdt_in = fr->fdt_in;
1649
+
1461
1650
  frt_is_seek(fdx_in, doc_num * FIELDS_IDX_PTR_SIZE);
1462
1651
  pos = (off_t)frt_is_read_u64(fdx_in);
1463
1652
  frt_is_seek(fdt_in, pos);
1464
1653
  stored_cnt = frt_is_read_vint(fdt_in);
1654
+
1465
1655
  lazy_doc = lazy_doc_new(stored_cnt, fdt_in);
1466
1656
  for (i = 0; i < stored_cnt; i++) {
1467
1657
  FrtFieldInfo *fi = fr->fis->fields[frt_is_read_vint(fdt_in)];
1468
- const int data_cnt = frt_is_read_vint(fdt_in);
1469
- FrtLazyDocField *lazy_df = lazy_df_new(fi->name, data_cnt, fi_is_compressed(fi));
1658
+ const int df_size = frt_is_read_vint(fdt_in);
1659
+ FrtLazyDocField *lazy_df = lazy_df_new(fi->name, df_size, frt_fi_get_compression(fi));
1470
1660
  const int field_start = start;
1471
1661
  /* get the starts relative positions this time around */
1472
- for (j = 0; j < data_cnt; j++) {
1662
+
1663
+ for (j = 0; j < df_size; j++) {
1473
1664
  lazy_df->data[j].start = start;
1474
1665
  start += 1 + (lazy_df->data[j].length = frt_is_read_vint(fdt_in));
1666
+ lazy_df->data[j].encoding = rb_enc_from_index(frt_is_read_vint(fdt_in));
1667
+ lazy_df->data[j].compression = frt_is_read_vint(fdt_in);
1475
1668
  }
1669
+
1476
1670
  lazy_df->len = start - field_start - 1;
1477
1671
  lazy_doc_add_field(lazy_doc, lazy_df, i);
1478
1672
  }
1479
1673
  /* correct the starts to their correct absolute positions */
1674
+ const off_t abs_start = frt_is_pos(fdt_in);
1480
1675
  for (i = 0; i < stored_cnt; i++) {
1481
1676
  FrtLazyDocField *lazy_df = lazy_doc->fields[i];
1482
- const int data_cnt = lazy_df->size;
1483
- const off_t start = frt_is_pos(fdt_in);
1484
- for (j = 0; j < data_cnt; j++) {
1485
- lazy_df->data[j].start += start;
1677
+ const int df_size = lazy_df->size;
1678
+ for (j = 0; j < df_size; j++) {
1679
+ lazy_df->data[j].start += abs_start;
1486
1680
  }
1487
1681
  }
1488
1682
 
@@ -1517,8 +1711,7 @@ static FrtTermVector *frt_fr_read_term_vector(FrtFieldsReader *fr, int field_num
1517
1711
  total_len = delta_start + delta_len;
1518
1712
  frt_is_read_bytes(fdt_in, buffer + delta_start, delta_len);
1519
1713
  buffer[total_len++] = '\0';
1520
- term->text = (char *)memcpy(FRT_ALLOC_N(char, total_len),
1521
- buffer, total_len);
1714
+ term->text = (char *)memcpy(FRT_ALLOC_N(char, total_len), buffer, total_len);
1522
1715
 
1523
1716
  /* read freq */
1524
1717
  freq = term->freq = frt_is_read_vint(fdt_in);
@@ -1629,8 +1822,7 @@ FrtTermVector *frt_fr_get_field_tv(FrtFieldsReader *fr, int doc_num, int field_n
1629
1822
  *
1630
1823
  ****************************************************************************/
1631
1824
 
1632
- FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos *fis)
1633
- {
1825
+ FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos *fis) {
1634
1826
  FrtFieldsWriter *fw = FRT_ALLOC(FrtFieldsWriter);
1635
1827
  char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
1636
1828
  size_t segment_len = strlen(segment);
@@ -1651,8 +1843,7 @@ FrtFieldsWriter *frt_fw_open(FrtStore *store, const char *segment, FrtFieldInfos
1651
1843
  return fw;
1652
1844
  }
1653
1845
 
1654
- void frt_fw_close(FrtFieldsWriter *fw)
1655
- {
1846
+ void frt_fw_close(FrtFieldsWriter *fw) {
1656
1847
  frt_os_close(fw->fdt_out);
1657
1848
  frt_os_close(fw->fdx_out);
1658
1849
  frt_ram_destroy_buffer(fw->buffer);
@@ -1660,42 +1851,150 @@ void frt_fw_close(FrtFieldsWriter *fw)
1660
1851
  free(fw);
1661
1852
  }
1662
1853
 
1663
- static int frt_os_write_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length)
1664
- {
1665
- size_t compressed_len = 0;
1854
+ static int frt_os_write_brotli_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length) {
1855
+ size_t compressed_length = 0;
1666
1856
  const frt_uchar *next_in = data;
1667
1857
  size_t available_in = length;
1668
1858
  size_t available_out;
1669
- frt_uchar compression_buffer[COMPRESSION_BUFFER_SIZE];
1859
+ frt_uchar compression_buffer[FRT_COMPRESSION_BUFFER_SIZE];
1670
1860
  frt_uchar *next_out;
1671
1861
  BrotliEncoderState *b_state = BrotliEncoderCreateInstance(NULL, NULL, NULL);
1672
1862
  if (!b_state) { comp_raise(); return -1; }
1673
1863
 
1674
- BrotliEncoderSetParameter(b_state, BROTLI_PARAM_QUALITY, COMPRESSION_LEVEL);
1864
+ BrotliEncoderSetParameter(b_state, BROTLI_PARAM_QUALITY, FRT_BROTLI_COMPRESSION_LEVEL);
1675
1865
 
1676
1866
  do {
1677
- available_out = COMPRESSION_BUFFER_SIZE;
1867
+ available_out = FRT_COMPRESSION_BUFFER_SIZE;
1678
1868
  next_out = compression_buffer;
1679
1869
  if (!BrotliEncoderCompressStream(b_state, BROTLI_OPERATION_FINISH,
1680
1870
  &available_in, &next_in,
1681
- &available_out, &next_out, &compressed_len)) {
1871
+ &available_out, &next_out, &compressed_length)) {
1682
1872
  BrotliEncoderDestroyInstance(b_state);
1683
1873
  comp_raise();
1684
1874
  return -1;
1685
1875
  }
1686
- frt_os_write_bytes(out_stream, compression_buffer, COMPRESSION_BUFFER_SIZE - available_out);
1876
+ frt_os_write_bytes(out_stream, compression_buffer, FRT_COMPRESSION_BUFFER_SIZE - available_out);
1687
1877
  } while (!BrotliEncoderIsFinished(b_state));
1688
1878
 
1689
1879
  BrotliEncoderDestroyInstance(b_state);
1690
- // fprintf(stderr, "Compressed: %i -> %i\n", length, (int)compressed_len);
1691
- return (int)compressed_len;
1880
+
1881
+ return (int)compressed_length;
1692
1882
  }
1693
1883
 
1694
- void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc)
1695
- {
1884
+ static int frt_os_write_bz2_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length) {
1885
+ int ret, buf_size, compressed_len = 0;
1886
+ char out_buffer[FRT_COMPRESSION_BUFFER_SIZE];
1887
+ bz_stream zstrm;
1888
+ zstrm.bzalloc = NULL;
1889
+ zstrm.bzfree = NULL;
1890
+ zstrm.opaque = NULL;
1891
+ if ((ret = BZ2_bzCompressInit(&zstrm, FRT_BZIP_COMPRESSION_LEVEL, 0, 0)) != BZ_OK) zraise(ret);
1892
+
1893
+ zstrm.avail_in = length;
1894
+ zstrm.next_in = (char *)data;
1895
+ zstrm.avail_out = FRT_COMPRESSION_BUFFER_SIZE;
1896
+ zstrm.next_out = out_buffer;
1897
+
1898
+ do {
1899
+ ret = BZ2_bzCompress(&zstrm, BZ_FINISH); /* no bad return value */
1900
+ assert(ret != BZ_SEQUENCE_ERROR); /* state not clobbered */
1901
+ compressed_len += buf_size = FRT_COMPRESSION_BUFFER_SIZE - zstrm.avail_out;
1902
+ frt_os_write_bytes(out_stream, (frt_uchar *)out_buffer, buf_size);
1903
+ } while (zstrm.avail_out == 0);
1904
+ assert(zstrm.avail_in == 0); /* all input will be used */
1905
+
1906
+ (void)BZ2_bzCompressEnd(&zstrm);
1907
+ return compressed_len;
1908
+ }
1909
+
1910
+ static const LZ4F_preferences_t lz4_prefs = {
1911
+ {
1912
+ LZ4F_default,
1913
+ LZ4F_blockLinked,
1914
+ LZ4F_noContentChecksum,
1915
+ LZ4F_frame,
1916
+ 0, /* unknown content size */
1917
+ 0, /* no dictID */
1918
+ LZ4F_noBlockChecksum
1919
+ },
1920
+ 0,
1921
+ 1,
1922
+ 1,
1923
+ {0,0,0}
1924
+ };
1925
+
1926
+ static int frt_os_write_lz4_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length) {
1927
+ int compressed_length = 0;
1928
+ int remaining_length = length;
1929
+ size_t ccmp_length = 0;
1930
+ LZ4F_compressionContext_t ctx;
1931
+ size_t out_buf_length = LZ4F_compressBound(FRT_COMPRESSION_BUFFER_SIZE, &lz4_prefs);
1932
+ frt_uchar *out_buf = frt_ecalloc(out_buf_length);
1933
+
1934
+ size_t ctx_creation = LZ4F_createCompressionContext(&ctx, LZ4F_VERSION);
1935
+ if (LZ4F_isError(ctx_creation)) {
1936
+ compressed_length = -1;
1937
+ goto finish;
1938
+ }
1939
+
1940
+ /* create header */
1941
+ ccmp_length = LZ4F_compressBegin(ctx, out_buf, out_buf_length, &lz4_prefs);
1942
+ if (LZ4F_isError(ccmp_length)) {
1943
+ compressed_length = -1;
1944
+ goto finish;
1945
+ }
1946
+ compressed_length = ccmp_length;
1947
+ frt_os_write_bytes(out_stream, out_buf, ccmp_length);
1948
+
1949
+ /* compress data */
1950
+ do {
1951
+ int read_length = (FRT_COMPRESSION_BUFFER_SIZE > remaining_length) ? remaining_length : FRT_COMPRESSION_BUFFER_SIZE;
1952
+ ccmp_length = LZ4F_compressUpdate(ctx, out_buf, out_buf_length, data + (length - remaining_length), read_length, NULL);
1953
+ if (LZ4F_isError(ccmp_length)) {
1954
+ compressed_length = -1;
1955
+ goto finish;
1956
+ }
1957
+ frt_os_write_bytes(out_stream, out_buf, ccmp_length);
1958
+ compressed_length += ccmp_length;
1959
+ remaining_length -= read_length;
1960
+ } while (remaining_length > 0);
1961
+
1962
+ /* finish up */
1963
+ ccmp_length = LZ4F_compressEnd(ctx, out_buf, out_buf_length, NULL);
1964
+ if (LZ4F_isError(ccmp_length)) {
1965
+ compressed_length = -1;
1966
+ goto finish;
1967
+ }
1968
+
1969
+ frt_os_write_bytes(out_stream, out_buf, ccmp_length);
1970
+ compressed_length += ccmp_length;
1971
+
1972
+ finish:
1973
+ LZ4F_freeCompressionContext(ctx);
1974
+ free(out_buf);
1975
+
1976
+ return compressed_length;
1977
+ }
1978
+
1979
+ static int frt_os_write_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length, FrtCompressionType compression) {
1980
+ switch (compression) {
1981
+ case FRT_COMPRESSION_BROTLI:
1982
+ return frt_os_write_brotli_compressed_bytes(out_stream, data, length);
1983
+ case FRT_COMPRESSION_BZ2:
1984
+ return frt_os_write_bz2_compressed_bytes(out_stream, data, length);
1985
+ case FRT_COMPRESSION_LZ4:
1986
+ return frt_os_write_lz4_compressed_bytes(out_stream, data, length);
1987
+ default:
1988
+ return -1;
1989
+ }
1990
+
1991
+ }
1992
+
1993
+ void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc) {
1696
1994
  int i, j, stored_cnt = 0;
1697
1995
  FrtDocField *df;
1698
1996
  FrtFieldInfo *fi;
1997
+ FrtCompressionType compression;
1699
1998
  FrtOutStream *fdt_out = fw->fdt_out, *fdx_out = fw->fdx_out;
1700
1999
  const int doc_size = doc->size;
1701
2000
 
@@ -1719,16 +2018,22 @@ void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc)
1719
2018
  const int df_size = df->size;
1720
2019
  frt_os_write_vint(fdt_out, fi->number);
1721
2020
  frt_os_write_vint(fdt_out, df_size);
2021
+
1722
2022
  if (fi_is_compressed(fi)) {
2023
+ compression = frt_fi_get_compression(fi);
1723
2024
  for (j = 0; j < df_size; j++) {
1724
2025
  const int length = df->lengths[j];
1725
- int compressed_len = frt_os_write_compressed_bytes(fw->buffer, (frt_uchar*)df->data[j], length);
2026
+ int compressed_len = frt_os_write_compressed_bytes(fw->buffer, (frt_uchar*)df->data[j], length, compression);
1726
2027
  frt_os_write_vint(fdt_out, compressed_len - 1);
2028
+ frt_os_write_vint(fdt_out, rb_enc_to_index(df->encodings[j]));
2029
+ frt_os_write_vint(fdt_out, compression);
1727
2030
  }
1728
2031
  } else {
1729
2032
  for (j = 0; j < df_size; j++) {
1730
2033
  const int length = df->lengths[j];
1731
2034
  frt_os_write_vint(fdt_out, length);
2035
+ frt_os_write_vint(fdt_out, rb_enc_to_index(df->encodings[j]));
2036
+ frt_os_write_vint(fdt_out, FRT_COMPRESSION_NONE);
1732
2037
  frt_os_write_bytes(fw->buffer, (frt_uchar*)df->data[j], length);
1733
2038
  /* leave a space between fields as that is how they are analyzed */
1734
2039
  frt_os_write_byte(fw->buffer, ' ');
@@ -1739,8 +2044,7 @@ void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc)
1739
2044
  frt_ramo_write_to(fw->buffer, fdt_out);
1740
2045
  }
1741
2046
 
1742
- void frt_fw_write_tv_index(FrtFieldsWriter *fw)
1743
- {
2047
+ void frt_fw_write_tv_index(FrtFieldsWriter *fw) {
1744
2048
  int i;
1745
2049
  const int tv_cnt = frt_ary_size(fw->tv_fields);
1746
2050
  FrtOutStream *fdt_out = fw->fdt_out;
@@ -2087,8 +2391,7 @@ static char *ste_scan_to(FrtTermEnum *te, const char *term)
2087
2391
  }
2088
2392
  }
2089
2393
 
2090
- static FrtSegmentTermEnum *ste_allocate()
2091
- {
2394
+ static FrtSegmentTermEnum *ste_allocate(void) {
2092
2395
  FrtSegmentTermEnum *ste = FRT_ALLOC_AND_ZERO(FrtSegmentTermEnum);
2093
2396
 
2094
2397
  TE(ste)->next = &ste_next;
@@ -2113,7 +2416,6 @@ void frt_ste_close(FrtTermEnum *te)
2113
2416
  free(te);
2114
2417
  }
2115
2418
 
2116
-
2117
2419
  static char *frt_ste_get_term(FrtTermEnum *te, int pos)
2118
2420
  {
2119
2421
  FrtSegmentTermEnum *ste = STE(te);
@@ -2228,9 +2530,7 @@ static void tew_destroy(TermEnumWrapper *tew)
2228
2530
  tew->te->close(tew->te);
2229
2531
  }
2230
2532
 
2231
- static TermEnumWrapper *tew_setup(TermEnumWrapper *tew, int index, FrtTermEnum *te,
2232
- FrtIndexReader *ir)
2233
- {
2533
+ static TermEnumWrapper *tew_setup(TermEnumWrapper *tew, int index, FrtTermEnum *te, FrtIndexReader *ir) {
2234
2534
  tew->index = index;
2235
2535
  tew->ir = ir;
2236
2536
  tew->te = te;
@@ -2239,9 +2539,7 @@ static TermEnumWrapper *tew_setup(TermEnumWrapper *tew, int index, FrtTermEnum *
2239
2539
  return tew;
2240
2540
  }
2241
2541
 
2242
-
2243
- static char *mte_next(FrtTermEnum *te)
2244
- {
2542
+ static char *mte_next(FrtTermEnum *te) {
2245
2543
  TermEnumWrapper *top =
2246
2544
  (TermEnumWrapper *)frt_pq_top(MTE(te)->tew_queue);
2247
2545
 
@@ -2271,8 +2569,7 @@ static char *mte_next(FrtTermEnum *te)
2271
2569
  return te->curr_term;
2272
2570
  }
2273
2571
 
2274
- static FrtTermEnum *mte_set_field(FrtTermEnum *te, int field_num)
2275
- {
2572
+ static FrtTermEnum *mte_set_field(FrtTermEnum *te, int field_num) {
2276
2573
  MultiTermEnum *mte = MTE(te);
2277
2574
  int i;
2278
2575
  const int size = mte->size;
@@ -2300,8 +2597,7 @@ static FrtTermEnum *mte_set_field(FrtTermEnum *te, int field_num)
2300
2597
  return te;
2301
2598
  }
2302
2599
 
2303
- static char *mte_skip_to(FrtTermEnum *te, const char *term)
2304
- {
2600
+ static char *mte_skip_to(FrtTermEnum *te, const char *term) {
2305
2601
  MultiTermEnum *mte = MTE(te);
2306
2602
  int i;
2307
2603
  const int size = mte->size;
@@ -2317,8 +2613,7 @@ static char *mte_skip_to(FrtTermEnum *te, const char *term)
2317
2613
  return mte_next(te);
2318
2614
  }
2319
2615
 
2320
- static void mte_close(FrtTermEnum *te)
2321
- {
2616
+ static void mte_close(FrtTermEnum *te) {
2322
2617
  int i;
2323
2618
  const int size = MTE(te)->size;
2324
2619
  for (i = 0; i < size; i++) {
@@ -2331,10 +2626,9 @@ static void mte_close(FrtTermEnum *te)
2331
2626
  free(te);
2332
2627
  }
2333
2628
 
2334
- FrtTermEnum *frt_mte_new(FrtMultiReader *mr, int field_num, const char *term)
2335
- {
2336
- FrtIndexReader **readers = mr->sub_readers;
2337
- int r_cnt = mr->r_cnt;
2629
+ FrtTermEnum *frt_mte_new(FrtMultiReader *mr, int field_num, const char *term) {
2630
+ FrtIndexReader **readers = mr->sub_readers;
2631
+ int r_cnt = mr->r_cnt;
2338
2632
  int i;
2339
2633
  FrtIndexReader *reader;
2340
2634
  MultiTermEnum *mte = FRT_ALLOC_AND_ZERO(MultiTermEnum);
@@ -2362,8 +2656,7 @@ FrtTermEnum *frt_mte_new(FrtMultiReader *mr, int field_num, const char *term)
2362
2656
 
2363
2657
  if (NULL != term) {
2364
2658
  sub_te = reader->terms_from(reader, fnum, term);
2365
- }
2366
- else {
2659
+ } else {
2367
2660
  sub_te = reader->terms(reader, fnum);
2368
2661
  }
2369
2662
 
@@ -2372,8 +2665,7 @@ FrtTermEnum *frt_mte_new(FrtMultiReader *mr, int field_num, const char *term)
2372
2665
  || (tew->term && (tew->term[0] != '\0'))) {
2373
2666
  frt_pq_push(mte->tew_queue, tew); /* initialize queue */
2374
2667
  }
2375
- }
2376
- else {
2668
+ } else {
2377
2669
  /* add the term_enum_wrapper just in case */
2378
2670
  sub_te = reader->terms(reader, 0);
2379
2671
  sub_te->field_num = -1;
@@ -2395,9 +2687,7 @@ FrtTermEnum *frt_mte_new(FrtMultiReader *mr, int field_num, const char *term)
2395
2687
  *
2396
2688
  ****************************************************************************/
2397
2689
 
2398
- FrtTermInfosReader *frt_tir_open(FrtStore *store,
2399
- FrtSegmentFieldIndex *sfi, const char *segment)
2400
- {
2690
+ FrtTermInfosReader *frt_tir_open(FrtStore *store, FrtSegmentFieldIndex *sfi, const char *segment) {
2401
2691
  FrtTermInfosReader *tir = FRT_ALLOC(FrtTermInfosReader);
2402
2692
  char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
2403
2693
 
@@ -2410,8 +2700,7 @@ FrtTermInfosReader *frt_tir_open(FrtStore *store,
2410
2700
  return tir;
2411
2701
  }
2412
2702
 
2413
- static FrtTermEnum *tir_enum(FrtTermInfosReader *tir)
2414
- {
2703
+ static FrtTermEnum *tir_enum(FrtTermInfosReader *tir) {
2415
2704
  FrtTermEnum *te;
2416
2705
  if (NULL == (te = (FrtTermEnum *)frt_thread_getspecific(tir->thread_te))) {
2417
2706
  te = frt_ste_clone(tir->orig_te);
@@ -2422,8 +2711,7 @@ static FrtTermEnum *tir_enum(FrtTermInfosReader *tir)
2422
2711
  return te;
2423
2712
  }
2424
2713
 
2425
- FrtTermInfosReader *frt_tir_set_field(FrtTermInfosReader *tir, int field_num)
2426
- {
2714
+ FrtTermInfosReader *frt_tir_set_field(FrtTermInfosReader *tir, int field_num) {
2427
2715
  if (field_num != tir->field_num) {
2428
2716
  ste_set_field(tir_enum(tir), field_num);
2429
2717
  tir->field_num = field_num;
@@ -2431,8 +2719,7 @@ FrtTermInfosReader *frt_tir_set_field(FrtTermInfosReader *tir, int field_num)
2431
2719
  return tir;
2432
2720
  }
2433
2721
 
2434
- FrtTermInfo *frt_tir_get_ti(FrtTermInfosReader *tir, const char *term)
2435
- {
2722
+ FrtTermInfo *frt_tir_get_ti(FrtTermInfosReader *tir, const char *term) {
2436
2723
  FrtTermEnum *te = tir_enum(tir);
2437
2724
  char *match;
2438
2725
 
@@ -2443,9 +2730,7 @@ FrtTermInfo *frt_tir_get_ti(FrtTermInfosReader *tir, const char *term)
2443
2730
  return NULL;
2444
2731
  }
2445
2732
 
2446
- static FrtTermInfo *tir_get_ti_field(FrtTermInfosReader *tir, int field_num,
2447
- const char *term)
2448
- {
2733
+ static FrtTermInfo *tir_get_ti_field(FrtTermInfosReader *tir, int field_num, const char *term) {
2449
2734
  FrtTermEnum *te = tir_enum(tir);
2450
2735
  char *match;
2451
2736
 
@@ -2461,19 +2746,16 @@ static FrtTermInfo *tir_get_ti_field(FrtTermInfosReader *tir, int field_num,
2461
2746
  return NULL;
2462
2747
  }
2463
2748
 
2464
- char *frt_tir_get_term(FrtTermInfosReader *tir, int pos)
2465
- {
2749
+ char *frt_tir_get_term(FrtTermInfosReader *tir, int pos) {
2466
2750
  if (pos < 0) {
2467
2751
  return NULL;
2468
- }
2469
- else {
2752
+ } else {
2470
2753
  return frt_ste_get_term(tir_enum(tir), pos);
2471
2754
  }
2472
2755
  }
2473
2756
 
2474
2757
 
2475
- void frt_tir_close(FrtTermInfosReader *tir)
2476
- {
2758
+ void frt_tir_close(FrtTermInfosReader *tir) {
2477
2759
  frt_ary_destroy(tir->te_bucket, (frt_free_ft)&frt_ste_close);
2478
2760
  frt_ste_close(tir->orig_te);
2479
2761
 
@@ -2490,25 +2772,19 @@ void frt_tir_close(FrtTermInfosReader *tir)
2490
2772
  *
2491
2773
  ****************************************************************************/
2492
2774
 
2493
- static FrtTermWriter *tw_new(FrtStore *store, char *file_name)
2494
- {
2775
+ static FrtTermWriter *tw_new(FrtStore *store, char *file_name) {
2495
2776
  FrtTermWriter *tw = FRT_ALLOC_AND_ZERO(FrtTermWriter);
2496
2777
  tw->os = store->new_output(store, file_name);
2497
2778
  tw->last_term = FRT_EMPTY_STRING;
2498
2779
  return tw;
2499
2780
  }
2500
2781
 
2501
- static void tw_close(FrtTermWriter *tw)
2502
- {
2782
+ static void tw_close(FrtTermWriter *tw) {
2503
2783
  frt_os_close(tw->os);
2504
2784
  free(tw);
2505
2785
  }
2506
2786
 
2507
- FrtTermInfosWriter *frt_tiw_open(FrtStore *store,
2508
- const char *segment,
2509
- int index_interval,
2510
- int skip_interval)
2511
- {
2787
+ FrtTermInfosWriter *frt_tiw_open(FrtStore *store, const char *segment, int index_interval, int skip_interval) {
2512
2788
  char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
2513
2789
  FrtTermInfosWriter *tiw = FRT_ALLOC(FrtTermInfosWriter);
2514
2790
  size_t segment_len = strlen(segment);
@@ -2537,11 +2813,7 @@ FrtTermInfosWriter *frt_tiw_open(FrtStore *store,
2537
2813
  return tiw;
2538
2814
  }
2539
2815
 
2540
- static void tw_write_term(FrtTermWriter *tw,
2541
- FrtOutStream *os,
2542
- const char *term,
2543
- int term_len)
2544
- {
2816
+ static void tw_write_term(FrtTermWriter *tw, FrtOutStream *os, const char *term, int term_len) {
2545
2817
  int start = frt_hlp_string_diff(tw->last_term, term);
2546
2818
  int length = term_len - start;
2547
2819
 
@@ -2552,12 +2824,7 @@ static void tw_write_term(FrtTermWriter *tw,
2552
2824
  tw->last_term = term;
2553
2825
  }
2554
2826
 
2555
- static void tw_add(FrtTermWriter *tw,
2556
- const char *term,
2557
- int term_len,
2558
- FrtTermInfo *ti,
2559
- int skip_interval)
2560
- {
2827
+ static void tw_add(FrtTermWriter *tw, const char *term, int term_len, FrtTermInfo *ti, int skip_interval) {
2561
2828
  FrtOutStream *os = tw->os;
2562
2829
 
2563
2830
  #ifdef DEBUG
@@ -2587,11 +2854,7 @@ static void tw_add(FrtTermWriter *tw,
2587
2854
  tw->counter++;
2588
2855
  }
2589
2856
 
2590
- void frt_tiw_add(FrtTermInfosWriter *tiw,
2591
- const char *term,
2592
- int term_len,
2593
- FrtTermInfo *ti)
2594
- {
2857
+ void frt_tiw_add(FrtTermInfosWriter *tiw, const char *term, int term_len, FrtTermInfo *ti) {
2595
2858
  off_t tis_pos;
2596
2859
 
2597
2860
  if (0 == (tiw->tis_writer->counter % tiw->index_interval)) {
@@ -2609,15 +2872,13 @@ void frt_tiw_add(FrtTermInfosWriter *tiw,
2609
2872
  tw_add(tiw->tis_writer, term, term_len, ti, tiw->skip_interval);
2610
2873
  }
2611
2874
 
2612
- static void tw_reset(FrtTermWriter *tw)
2613
- {
2875
+ static void tw_reset(FrtTermWriter *tw) {
2614
2876
  tw->counter = 0;
2615
2877
  tw->last_term = FRT_EMPTY_STRING;
2616
2878
  FRT_ZEROSET(&(tw->last_term_info), FrtTermInfo);
2617
2879
  }
2618
2880
 
2619
- void frt_tiw_start_field(FrtTermInfosWriter *tiw, int field_num)
2620
- {
2881
+ void frt_tiw_start_field(FrtTermInfosWriter *tiw, int field_num) {
2621
2882
  FrtOutStream *tfx_out = tiw->tfx_out;
2622
2883
  frt_os_write_vint(tfx_out, tiw->tix_writer->counter); /* write tix size */
2623
2884
  frt_os_write_vint(tfx_out, tiw->tis_writer->counter); /* write tis size */
@@ -2630,8 +2891,7 @@ void frt_tiw_start_field(FrtTermInfosWriter *tiw, int field_num)
2630
2891
  tiw->field_count++;
2631
2892
  }
2632
2893
 
2633
- void frt_tiw_close(FrtTermInfosWriter *tiw)
2634
- {
2894
+ void frt_tiw_close(FrtTermInfosWriter *tiw) {
2635
2895
  FrtOutStream *tfx_out = tiw->tfx_out;
2636
2896
  frt_os_write_vint(tfx_out, tiw->tix_writer->counter);
2637
2897
  frt_os_write_vint(tfx_out, tiw->tis_writer->counter);
@@ -2665,8 +2925,7 @@ void frt_tiw_close(FrtTermInfosWriter *tiw)
2665
2925
  }\
2666
2926
  } while (0)
2667
2927
 
2668
- static void stde_seek_ti(FrtSegmentTermDocEnum *stde, FrtTermInfo *ti)
2669
- {
2928
+ static void stde_seek_ti(FrtSegmentTermDocEnum *stde, FrtTermInfo *ti) {
2670
2929
  if (NULL == ti) {
2671
2930
  stde->doc_freq = 0;
2672
2931
  } else {
@@ -2684,14 +2943,12 @@ static void stde_seek_ti(FrtSegmentTermDocEnum *stde, FrtTermInfo *ti)
2684
2943
  }
2685
2944
  }
2686
2945
 
2687
- static void stde_seek(FrtTermDocEnum *tde, int field_num, const char *term)
2688
- {
2946
+ static void stde_seek(FrtTermDocEnum *tde, int field_num, const char *term) {
2689
2947
  FrtTermInfo *ti = tir_get_ti_field(STDE(tde)->tir, field_num, term);
2690
2948
  stde_seek_ti(STDE(tde), ti);
2691
2949
  }
2692
2950
 
2693
- static void stde_seek_te(FrtTermDocEnum *tde, FrtTermEnum *te)
2694
- {
2951
+ static void stde_seek_te(FrtTermDocEnum *tde, FrtTermEnum *te) {
2695
2952
  #ifdef DEBUG
2696
2953
  if (te->set_field != &ste_set_field) {
2697
2954
  FRT_RAISE(FRT_ARG_ERROR, "Passed an incorrect TermEnum type");
@@ -2700,20 +2957,17 @@ static void stde_seek_te(FrtTermDocEnum *tde, FrtTermEnum *te)
2700
2957
  stde_seek_ti(STDE(tde), &(te->curr_ti));
2701
2958
  }
2702
2959
 
2703
- static int stde_doc_num(FrtTermDocEnum *tde)
2704
- {
2960
+ static int stde_doc_num(FrtTermDocEnum *tde) {
2705
2961
  CHECK_STATE("doc_num");
2706
2962
  return STDE(tde)->doc_num;
2707
2963
  }
2708
2964
 
2709
- static int stde_freq(FrtTermDocEnum *tde)
2710
- {
2965
+ static int stde_freq(FrtTermDocEnum *tde) {
2711
2966
  CHECK_STATE("freq");
2712
2967
  return STDE(tde)->freq;
2713
2968
  }
2714
2969
 
2715
- static bool stde_next(FrtTermDocEnum *tde)
2716
- {
2970
+ static bool stde_next(FrtTermDocEnum *tde) {
2717
2971
  int doc_code;
2718
2972
  FrtSegmentTermDocEnum *stde = STDE(tde);
2719
2973
 
@@ -2741,8 +2995,7 @@ static bool stde_next(FrtTermDocEnum *tde)
2741
2995
  return true;
2742
2996
  }
2743
2997
 
2744
- static int stde_read(FrtTermDocEnum *tde, int *docs, int *freqs, int req_num)
2745
- {
2998
+ static int stde_read(FrtTermDocEnum *tde, int *docs, int *freqs, int req_num) {
2746
2999
  FrtSegmentTermDocEnum *stde = STDE(tde);
2747
3000
  int i = 0;
2748
3001
  int doc_code;
@@ -2769,8 +3022,7 @@ static int stde_read(FrtTermDocEnum *tde, int *docs, int *freqs, int req_num)
2769
3022
  return i;
2770
3023
  }
2771
3024
 
2772
- static bool stde_skip_to(FrtTermDocEnum *tde, int target_doc_num)
2773
- {
3025
+ static bool stde_skip_to(FrtTermDocEnum *tde, int target_doc_num) {
2774
3026
  FrtSegmentTermDocEnum *stde = STDE(tde);
2775
3027
 
2776
3028
  if (stde->doc_freq >= stde->skip_interval
@@ -2834,8 +3086,7 @@ static bool stde_skip_to(FrtTermDocEnum *tde, int target_doc_num)
2834
3086
  return true;
2835
3087
  }
2836
3088
 
2837
- static void stde_close(FrtTermDocEnum *tde)
2838
- {
3089
+ static void stde_close(FrtTermDocEnum *tde) {
2839
3090
  frt_is_close(STDE(tde)->frq_in);
2840
3091
 
2841
3092
  if (NULL != STDE(tde)->skip_in) {
@@ -2845,23 +3096,17 @@ static void stde_close(FrtTermDocEnum *tde)
2845
3096
  free(tde);
2846
3097
  }
2847
3098
 
2848
- static void stde_skip_prox(FrtSegmentTermDocEnum *stde)
2849
- {
3099
+ static void stde_skip_prox(FrtSegmentTermDocEnum *stde) {
2850
3100
  (void)stde;
2851
3101
  }
2852
3102
 
2853
- static void stde_seek_prox(FrtSegmentTermDocEnum *stde, off_t prx_ptr)
2854
- {
3103
+ static void stde_seek_prox(FrtSegmentTermDocEnum *stde, off_t prx_ptr) {
2855
3104
  (void)stde;
2856
3105
  (void)prx_ptr;
2857
3106
  }
2858
3107
 
2859
3108
 
2860
- FrtTermDocEnum *frt_stde_new(FrtTermInfosReader *tir,
2861
- FrtInStream *frq_in,
2862
- FrtBitVector *deleted_docs,
2863
- int skip_interval)
2864
- {
3109
+ FrtTermDocEnum *frt_stde_new(FrtTermInfosReader *tir, FrtInStream *frq_in, FrtBitVector *deleted_docs, int skip_interval) {
2865
3110
  FrtSegmentTermDocEnum *stde = FRT_ALLOC_AND_ZERO(FrtSegmentTermDocEnum);
2866
3111
  FrtTermDocEnum *tde = (FrtTermDocEnum *)stde;
2867
3112
 
@@ -2893,27 +3138,23 @@ FrtTermDocEnum *frt_stde_new(FrtTermInfosReader *tir,
2893
3138
  * SegmentTermPosEnum
2894
3139
  ****************************************************************************/
2895
3140
 
2896
- static void stpe_seek_ti(FrtSegmentTermDocEnum *stde, FrtTermInfo *ti)
2897
- {
3141
+ static void stpe_seek_ti(FrtSegmentTermDocEnum *stde, FrtTermInfo *ti) {
2898
3142
  if (NULL == ti) {
2899
3143
  stde->doc_freq = 0;
2900
- }
2901
- else {
3144
+ } else {
2902
3145
  stde_seek_ti(stde, ti);
2903
3146
  frt_is_seek(stde->prx_in, ti->prx_ptr);
2904
3147
  }
2905
3148
  }
2906
3149
 
2907
- static void stpe_seek(FrtTermDocEnum *tde, int field_num, const char *term)
2908
- {
3150
+ static void stpe_seek(FrtTermDocEnum *tde, int field_num, const char *term) {
2909
3151
  FrtSegmentTermDocEnum *stde = STDE(tde);
2910
3152
  FrtTermInfo *ti = tir_get_ti_field(stde->tir, field_num, term);
2911
3153
  stpe_seek_ti(stde, ti);
2912
3154
  stde->prx_cnt = 0;
2913
3155
  }
2914
3156
 
2915
- static bool stpe_next(FrtTermDocEnum *tde)
2916
- {
3157
+ static bool stpe_next(FrtTermDocEnum *tde) {
2917
3158
  FrtSegmentTermDocEnum *stde = STDE(tde);
2918
3159
  frt_is_skip_vints(stde->prx_in, stde->prx_cnt);
2919
3160
 
@@ -3387,8 +3628,8 @@ FrtTermDocEnum *frt_mtdpe_new(FrtIndexReader *ir, int field_num, char **terms, i
3387
3628
  ****************************************************************************/
3388
3629
 
3389
3630
  static FrtHash *fn_extensions = NULL;
3390
- static void file_name_filter_init()
3391
- {
3631
+
3632
+ static void file_name_filter_init(void) {
3392
3633
  int i;
3393
3634
  fn_extensions = frt_h_new_str((frt_free_ft)NULL, (frt_free_ft)NULL);
3394
3635
  for (i = 0; i < FRT_NELEMS(INDEX_EXTENSIONS); i++) {
@@ -3687,9 +3928,8 @@ static void ir_acquire_write_lock(FrtIndexReader *ir)
3687
3928
  }
3688
3929
  }
3689
3930
 
3690
- static FrtIndexReader *ir_setup(FrtIndexReader *ir, FrtStore *store, FrtSegmentInfos *sis,
3691
- FrtFieldInfos *fis, int is_owner)
3692
- {
3931
+ static FrtIndexReader *ir_setup(FrtIndexReader *ir, FrtStore *store, FrtSegmentInfos *sis, FrtFieldInfos *fis, int is_owner) {
3932
+ ir->type = FRT_INDEX_READER;
3693
3933
  frt_mutex_init(&ir->mutex, NULL);
3694
3934
  frt_mutex_init(&ir->field_index_mutex, NULL);
3695
3935
 
@@ -3712,8 +3952,7 @@ static FrtIndexReader *ir_setup(FrtIndexReader *ir, FrtStore *store, FrtSegmentI
3712
3952
  return ir;
3713
3953
  }
3714
3954
 
3715
- int frt_ir_doc_freq(FrtIndexReader *ir, FrtSymbol field, const char *term)
3716
- {
3955
+ int frt_ir_doc_freq(FrtIndexReader *ir, ID field, const char *term) {
3717
3956
  int field_num = frt_fis_get_field_num(ir->fis, field);
3718
3957
  if (field_num >= 0) {
3719
3958
  return ir->doc_freq(ir, field_num, term);
@@ -3723,8 +3962,7 @@ int frt_ir_doc_freq(FrtIndexReader *ir, FrtSymbol field, const char *term)
3723
3962
  }
3724
3963
  }
3725
3964
 
3726
- static void ir_set_norm_i(FrtIndexReader *ir, int doc_num, int field_num, frt_uchar val)
3727
- {
3965
+ static void ir_set_norm_i(FrtIndexReader *ir, int doc_num, int field_num, frt_uchar val) {
3728
3966
  frt_mutex_lock(&ir->mutex);
3729
3967
  ir->acquire_write_lock(ir);
3730
3968
  ir->set_norm_i(ir, doc_num, field_num, val);
@@ -3732,8 +3970,7 @@ static void ir_set_norm_i(FrtIndexReader *ir, int doc_num, int field_num, frt_uc
3732
3970
  frt_mutex_unlock(&ir->mutex);
3733
3971
  }
3734
3972
 
3735
- void frt_ir_set_norm(FrtIndexReader *ir, int doc_num, FrtSymbol field, frt_uchar val)
3736
- {
3973
+ void frt_ir_set_norm(FrtIndexReader *ir, int doc_num, ID field, frt_uchar val) {
3737
3974
  int field_num = frt_fis_get_field_num(ir->fis, field);
3738
3975
  if (field_num >= 0) {
3739
3976
  ir_set_norm_i(ir, doc_num, field_num, val);
@@ -3755,14 +3992,12 @@ frt_uchar *frt_ir_get_norms_i(FrtIndexReader *ir, int field_num)
3755
3992
  return norms;
3756
3993
  }
3757
3994
 
3758
- frt_uchar *frt_ir_get_norms(FrtIndexReader *ir, FrtSymbol field)
3759
- {
3995
+ frt_uchar *frt_ir_get_norms(FrtIndexReader *ir, ID field) {
3760
3996
  int field_num = frt_fis_get_field_num(ir->fis, field);
3761
3997
  return frt_ir_get_norms_i(ir, field_num);
3762
3998
  }
3763
3999
 
3764
- frt_uchar *frt_ir_get_norms_into(FrtIndexReader *ir, FrtSymbol field, frt_uchar *buf)
3765
- {
4000
+ frt_uchar *frt_ir_get_norms_into(FrtIndexReader *ir, ID field, frt_uchar *buf) {
3766
4001
  int field_num = frt_fis_get_field_num(ir->fis, field);
3767
4002
  if (field_num >= 0) {
3768
4003
  ir->get_norms_into(ir, field_num, buf);
@@ -3793,7 +4028,7 @@ void frt_ir_delete_doc(FrtIndexReader *ir, int doc_num)
3793
4028
  }
3794
4029
  }
3795
4030
 
3796
- FrtDocument *frt_ir_get_doc_with_term(FrtIndexReader *ir, FrtSymbol field, const char *term) {
4031
+ FrtDocument *frt_ir_get_doc_with_term(FrtIndexReader *ir, ID field, const char *term) {
3797
4032
  FrtTermDocEnum *tde = ir_term_docs_for(ir, field, term);
3798
4033
  FrtDocument *doc = NULL;
3799
4034
 
@@ -3806,8 +4041,7 @@ FrtDocument *frt_ir_get_doc_with_term(FrtIndexReader *ir, FrtSymbol field, const
3806
4041
  return doc;
3807
4042
  }
3808
4043
 
3809
- FrtTermEnum *frt_ir_terms(FrtIndexReader *ir, FrtSymbol field)
3810
- {
4044
+ FrtTermEnum *frt_ir_terms(FrtIndexReader *ir, ID field) {
3811
4045
  FrtTermEnum *te = NULL;
3812
4046
  int field_num = frt_fis_get_field_num(ir->fis, field);
3813
4047
  if (field_num >= 0) {
@@ -3816,9 +4050,7 @@ FrtTermEnum *frt_ir_terms(FrtIndexReader *ir, FrtSymbol field)
3816
4050
  return te;
3817
4051
  }
3818
4052
 
3819
- FrtTermEnum *frt_ir_terms_from(FrtIndexReader *ir, FrtSymbol field,
3820
- const char *term)
3821
- {
4053
+ FrtTermEnum *frt_ir_terms_from(FrtIndexReader *ir, ID field, const char *term) {
3822
4054
  FrtTermEnum *te = NULL;
3823
4055
  int field_num = frt_fis_get_field_num(ir->fis, field);
3824
4056
  if (field_num >= 0) {
@@ -3827,9 +4059,7 @@ FrtTermEnum *frt_ir_terms_from(FrtIndexReader *ir, FrtSymbol field,
3827
4059
  return te;
3828
4060
  }
3829
4061
 
3830
- FrtTermDocEnum *ir_term_docs_for(FrtIndexReader *ir, FrtSymbol field,
3831
- const char *term)
3832
- {
4062
+ FrtTermDocEnum *ir_term_docs_for(FrtIndexReader *ir, ID field, const char *term) {
3833
4063
  int field_num = frt_fis_get_field_num(ir->fis, field);
3834
4064
  FrtTermDocEnum *tde = ir->term_docs(ir);
3835
4065
  if (field_num >= 0) {
@@ -3838,9 +4068,7 @@ FrtTermDocEnum *ir_term_docs_for(FrtIndexReader *ir, FrtSymbol field,
3838
4068
  return tde;
3839
4069
  }
3840
4070
 
3841
- FrtTermDocEnum *frt_ir_term_positions_for(FrtIndexReader *ir, FrtSymbol field,
3842
- const char *term)
3843
- {
4071
+ FrtTermDocEnum *frt_ir_term_positions_for(FrtIndexReader *ir, ID field, const char *term) {
3844
4072
  int field_num = frt_fis_get_field_num(ir->fis, field);
3845
4073
  FrtTermDocEnum *tde = ir->term_positions(ir);
3846
4074
  if (field_num >= 0) {
@@ -3854,7 +4082,7 @@ static void ir_commit_i(FrtIndexReader *ir)
3854
4082
  if (ir->has_changes) {
3855
4083
  if (NULL == ir->deleter && NULL != ir->store) {
3856
4084
  /* In the MultiReader case, we share this deleter across all
3857
- * SegmentReaders: */
4085
+ * FrtSegmentReaders: */
3858
4086
  ir->set_deleter_i(ir, frt_deleter_new(ir->sis, ir->store));
3859
4087
  }
3860
4088
  if (ir->is_owner) {
@@ -3990,34 +4218,14 @@ static void norm_rewrite(Norm *norm, FrtStore *store, FrtDeleter *dlr,
3990
4218
  }
3991
4219
 
3992
4220
  /****************************************************************************
3993
- * SegmentReader
4221
+ * FrtSegmentReader
3994
4222
  ****************************************************************************/
3995
4223
 
3996
- typedef struct SegmentReader {
3997
- FrtIndexReader ir;
3998
- FrtSegmentInfo *si;
3999
- char *segment;
4000
- FrtFieldsReader *fr;
4001
- FrtBitVector *deleted_docs;
4002
- FrtInStream *frq_in;
4003
- FrtInStream *prx_in;
4004
- FrtSegmentFieldIndex *sfi;
4005
- FrtTermInfosReader *tir;
4006
- frt_thread_key_t thread_fr;
4007
- void **fr_bucket;
4008
- FrtHash *norms;
4009
- FrtStore *cfs_store;
4010
- bool deleted_docs_dirty : 1;
4011
- bool undelete_all : 1;
4012
- bool norms_dirty : 1;
4013
- } SegmentReader;
4014
-
4015
4224
  #define IR(ir) ((FrtIndexReader *)(ir))
4016
-
4017
- #define SR(ir) ((SegmentReader *)(ir))
4225
+ #define SR(ir) ((FrtSegmentReader *)(ir))
4018
4226
  #define SR_SIZE(ir) (SR(ir)->fr->size)
4019
4227
 
4020
- static FrtFieldsReader *sr_fr(SegmentReader *sr)
4228
+ static FrtFieldsReader *sr_fr(FrtSegmentReader *sr)
4021
4229
  {
4022
4230
  FrtFieldsReader *fr;
4023
4231
 
@@ -4029,12 +4237,12 @@ static FrtFieldsReader *sr_fr(SegmentReader *sr)
4029
4237
  return fr;
4030
4238
  }
4031
4239
 
4032
- static bool sr_is_deleted_i(SegmentReader *sr, int doc_num)
4240
+ static bool sr_is_deleted_i(FrtSegmentReader *sr, int doc_num)
4033
4241
  {
4034
4242
  return (NULL != sr->deleted_docs && frt_bv_get(sr->deleted_docs, doc_num));
4035
4243
  }
4036
4244
 
4037
- static void sr_get_norms_into_i(SegmentReader *sr, int field_num,
4245
+ static void sr_get_norms_into_i(FrtSegmentReader *sr, int field_num,
4038
4246
  frt_uchar *buf)
4039
4247
  {
4040
4248
  Norm *norm = (Norm *)frt_h_get_int(sr->norms, field_num);
@@ -4053,7 +4261,7 @@ static void sr_get_norms_into_i(SegmentReader *sr, int field_num,
4053
4261
  }
4054
4262
  }
4055
4263
 
4056
- static frt_uchar *sr_get_norms_i(SegmentReader *sr, int field_num)
4264
+ static frt_uchar *sr_get_norms_i(FrtSegmentReader *sr, int field_num)
4057
4265
  {
4058
4266
  Norm *norm = (Norm *)frt_h_get_int(sr->norms, field_num);
4059
4267
  if (NULL == norm) { /* not an indexed field */
@@ -4189,7 +4397,7 @@ static void sr_commit_i(FrtIndexReader *ir)
4189
4397
 
4190
4398
  static void sr_close_i(FrtIndexReader *ir)
4191
4399
  {
4192
- SegmentReader *sr = SR(ir);
4400
+ FrtSegmentReader *sr = SR(ir);
4193
4401
 
4194
4402
  if (sr->fr) frt_fr_close(sr->fr);
4195
4403
  if (sr->tir) frt_tir_close(sr->tir);
@@ -4298,14 +4506,12 @@ static FrtTermDocEnum *sr_term_docs(FrtIndexReader *ir)
4298
4506
 
4299
4507
  static FrtTermDocEnum *sr_term_positions(FrtIndexReader *ir)
4300
4508
  {
4301
- SegmentReader *sr = SR(ir);
4509
+ FrtSegmentReader *sr = SR(ir);
4302
4510
  return frt_stpe_new(sr->tir, sr->frq_in, sr->prx_in, sr->deleted_docs,
4303
4511
  STE(sr->tir->orig_te)->skip_interval);
4304
4512
  }
4305
4513
 
4306
- static FrtTermVector *sr_term_vector(FrtIndexReader *ir, int doc_num,
4307
- FrtSymbol field)
4308
- {
4514
+ static FrtTermVector *sr_term_vector(FrtIndexReader *ir, int doc_num, ID field) {
4309
4515
  FrtFieldInfo *fi = (FrtFieldInfo *)frt_h_get(ir->fis->field_dict, (void *)field);
4310
4516
  FrtFieldsReader *fr;
4311
4517
 
@@ -4360,7 +4566,7 @@ static void sr_open_norms(FrtIndexReader *ir, FrtStore *cfs_store)
4360
4566
  SR(ir)->norms_dirty = false;
4361
4567
  }
4362
4568
 
4363
- static FrtIndexReader *sr_setup_i(SegmentReader *sr)
4569
+ static FrtIndexReader *sr_setup_i(FrtSegmentReader *sr)
4364
4570
  {
4365
4571
  FrtStore *volatile store = sr->si->store;
4366
4572
  FrtIndexReader *ir = IR(sr);
@@ -4391,6 +4597,8 @@ static FrtIndexReader *sr_setup_i(SegmentReader *sr)
4391
4597
  ir->commit_i = &sr_commit_i;
4392
4598
  ir->close_i = &sr_close_i;
4393
4599
 
4600
+ ir->type = FRT_SEGMENT_READER;
4601
+
4394
4602
  sr->cfs_store = NULL;
4395
4603
 
4396
4604
  FRT_TRY
@@ -4430,10 +4638,13 @@ static FrtIndexReader *sr_setup_i(SegmentReader *sr)
4430
4638
  return ir;
4431
4639
  }
4432
4640
 
4433
- static FrtIndexReader *sr_open(FrtSegmentInfos *sis, FrtFieldInfos *fis, int si_num,
4434
- bool is_owner)
4435
- {
4436
- SegmentReader *sr = FRT_ALLOC_AND_ZERO(SegmentReader);
4641
+ FrtSegmentReader *frt_sr_alloc(void) {
4642
+ return FRT_ALLOC_AND_ZERO(FrtSegmentReader);
4643
+ }
4644
+
4645
+ static FrtIndexReader *sr_open(FrtSegmentInfos *sis, FrtFieldInfos *fis, int si_num, bool is_owner, FrtSegmentReader *sr) {
4646
+ if (sr == NULL)
4647
+ sr = frt_sr_alloc();
4437
4648
  sr->si = sis->segs[si_num];
4438
4649
  ir_setup(IR(sr), sr->si->store, sis, fis, is_owner);
4439
4650
  return sr_setup_i(sr);
@@ -4604,9 +4815,7 @@ static FrtTermDocEnum *mr_term_positions(FrtIndexReader *ir)
4604
4815
  return mtpe_new(MR(ir));
4605
4816
  }
4606
4817
 
4607
- static FrtTermVector *mr_term_vector(FrtIndexReader *ir, int doc_num,
4608
- FrtSymbol field)
4609
- {
4818
+ static FrtTermVector *mr_term_vector(FrtIndexReader *ir, int doc_num, ID field) {
4610
4819
  GET_READER();
4611
4820
  return reader->term_vector(reader, doc_num - MR(ir)->starts[i], field);
4612
4821
  }
@@ -4710,10 +4919,12 @@ static void mr_close_i(FrtIndexReader *ir)
4710
4919
  free(MR(ir)->starts);
4711
4920
  }
4712
4921
 
4713
- static FrtIndexReader *mr_new(FrtIndexReader **sub_readers, const int r_cnt)
4714
- {
4922
+ FrtMultiReader *frt_mr_alloc(void) {
4923
+ return FRT_ALLOC_AND_ZERO(FrtMultiReader);
4924
+ }
4925
+
4926
+ FrtMultiReader *frt_mr_init(FrtMultiReader *mr, FrtIndexReader **sub_readers, const int r_cnt) {
4715
4927
  int i;
4716
- FrtMultiReader *mr = FRT_ALLOC_AND_ZERO(FrtMultiReader);
4717
4928
  FrtIndexReader *ir = IR(mr);
4718
4929
 
4719
4930
  mr->sub_readers = sub_readers;
@@ -4760,21 +4971,19 @@ static FrtIndexReader *mr_new(FrtIndexReader **sub_readers, const int r_cnt)
4760
4971
  ir->commit_i = &mr_commit_i;
4761
4972
  ir->close_i = &mr_close_i;
4762
4973
 
4763
- return ir;
4974
+ ir->type = FRT_MULTI_READER;
4975
+
4976
+ return mr;
4764
4977
  }
4765
4978
 
4766
- static FrtIndexReader *frt_mr_open_i(FrtStore *store,
4767
- FrtSegmentInfos *sis,
4768
- FrtFieldInfos *fis,
4769
- FrtIndexReader **sub_readers,
4770
- const int r_cnt)
4771
- {
4772
- FrtIndexReader *ir = mr_new(sub_readers, r_cnt);
4979
+ static FrtIndexReader *frt_mr_open_i(FrtStore *store, FrtSegmentInfos *sis, FrtFieldInfos *fis, FrtIndexReader **sub_readers, const int r_cnt, FrtIndexReader *ir) {
4980
+ if (ir == NULL)
4981
+ ir = (FrtIndexReader *)frt_mr_alloc();
4982
+ ir = (FrtIndexReader *)frt_mr_init((FrtMultiReader *)ir, sub_readers, r_cnt);
4773
4983
  return ir_setup(ir, store, sis, fis, true);
4774
4984
  }
4775
4985
 
4776
- static void mr_close_ext_i(FrtIndexReader *ir)
4777
- {
4986
+ static void mr_close_ext_i(FrtIndexReader *ir) {
4778
4987
  int **field_num_map = MR(ir)->field_num_map;
4779
4988
  if (field_num_map) {
4780
4989
  int i;
@@ -4787,12 +4996,13 @@ static void mr_close_ext_i(FrtIndexReader *ir)
4787
4996
  mr_close_i(ir);
4788
4997
  }
4789
4998
 
4790
- FrtIndexReader *frt_mr_open(FrtIndexReader **sub_readers, const int r_cnt)
4791
- {
4792
- FrtIndexReader *ir = mr_new(sub_readers, r_cnt);
4999
+ FrtIndexReader *frt_mr_open(FrtIndexReader *ir, FrtIndexReader **sub_readers, const int r_cnt) {
5000
+ if (ir == NULL)
5001
+ ir = (FrtIndexReader *)frt_mr_alloc();
5002
+ ir = (FrtIndexReader *)frt_mr_init((FrtMultiReader *)ir, sub_readers, r_cnt);
4793
5003
  FrtMultiReader *mr = MR(ir);
4794
5004
  /* defaults don't matter, this is just for reading fields, not adding */
4795
- FrtFieldInfos *fis = frt_fis_new(FRT_STORE_NO, FRT_INDEX_NO, FRT_TERM_VECTOR_NO);
5005
+ FrtFieldInfos *fis = frt_fis_new(FRT_STORE_NO, FRT_COMPRESSION_NONE, FRT_INDEX_NO, FRT_TERM_VECTOR_NO);
4796
5006
  int i, j;
4797
5007
  bool need_field_map = false;
4798
5008
 
@@ -4827,12 +5037,10 @@ FrtIndexReader *frt_mr_open(FrtIndexReader **sub_readers, const int r_cnt)
4827
5037
  mr->field_num_map[i][j] = fi_sub ? fi_sub->number : -1;
4828
5038
  }
4829
5039
  }
4830
- }
4831
- else {
5040
+ } else {
4832
5041
  mr->field_num_map = NULL;
4833
5042
  }
4834
5043
 
4835
-
4836
5044
  ir->close_i = &mr_close_ext_i;
4837
5045
 
4838
5046
  return ir_setup(ir, NULL, NULL, fis, false);
@@ -4842,21 +5050,19 @@ FrtIndexReader *frt_mr_open(FrtIndexReader **sub_readers, const int r_cnt)
4842
5050
  * IndexReader
4843
5051
  ****************************************************************************/
4844
5052
 
4845
-
4846
- static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf)
4847
- {
5053
+ static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf, FrtIndexReader *ir) {
4848
5054
  volatile bool success = false;
4849
- FrtIndexReader *volatile ir = NULL;
5055
+ // FrtIndexReader *volatile ir = NULL;
4850
5056
  FrtSegmentInfos *volatile sis = NULL;
4851
5057
  FRT_TRY
4852
5058
  do {
4853
5059
  FrtFieldInfos *fis;
4854
5060
  frt_mutex_lock(&store->mutex);
4855
- frt_sis_read_i(store, fsf);
5061
+ frt_sis_read_i(store, fsf, NULL);
4856
5062
  sis = fsf->ret.sis;
4857
5063
  fis = sis->fis;
4858
5064
  if (sis->size == 1) {
4859
- ir = sr_open(sis, fis, 0, true);
5065
+ ir = sr_open(sis, fis, 0, true, (FrtSegmentReader *)ir);
4860
5066
  }
4861
5067
  else {
4862
5068
  volatile int i;
@@ -4864,7 +5070,7 @@ static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf)
4864
5070
  int num_segments = sis->size;
4865
5071
  for (i = num_segments - 1; i >= 0; i--) {
4866
5072
  FRT_TRY
4867
- readers[i] = sr_open(sis, fis, i, false);
5073
+ readers[i] = sr_open(sis, fis, i, false, NULL);
4868
5074
  FRT_XCATCHALL
4869
5075
  for (i++; i < num_segments; i++) {
4870
5076
  frt_ir_close(readers[i]);
@@ -4872,7 +5078,7 @@ static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf)
4872
5078
  free(readers);
4873
5079
  FRT_XENDTRY
4874
5080
  }
4875
- ir = frt_mr_open_i(store, sis, fis, readers, sis->size);
5081
+ ir = frt_mr_open_i(store, sis, fis, readers, sis->size, ir);
4876
5082
  }
4877
5083
  fsf->ret.ir = ir;
4878
5084
  success = true;
@@ -4881,8 +5087,7 @@ static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf)
4881
5087
  if (!success) {
4882
5088
  if (ir) {
4883
5089
  frt_ir_close(ir);
4884
- }
4885
- else if (sis) {
5090
+ } else if (sis) {
4886
5091
  frt_sis_destroy(sis);
4887
5092
  }
4888
5093
  }
@@ -4894,15 +5099,12 @@ static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf)
4894
5099
  * Will keep a reference to the store. To let this method delete the store
4895
5100
  * make sure you deref the store that you pass to it
4896
5101
  */
4897
- FrtIndexReader *frt_ir_open(FrtStore *store)
4898
- {
5102
+ FrtIndexReader *frt_ir_open(FrtIndexReader *ir, FrtStore *store) {
4899
5103
  FindSegmentsFile fsf;
4900
- sis_find_segments_file(store, &fsf, &ir_open_i);
5104
+ sis_find_segments_file(store, &fsf, &ir_open_i, ir);
4901
5105
  return fsf.ret.ir;
4902
5106
  }
4903
5107
 
4904
-
4905
-
4906
5108
  /****************************************************************************
4907
5109
  *
4908
5110
  * Occurence
@@ -5292,10 +5494,7 @@ static void dw_add_offsets(FrtDocWriter *dw, int pos, off_t start, off_t end)
5292
5494
  dw->offsets_size = pos + 1;
5293
5495
  }
5294
5496
 
5295
- FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
5296
- FrtFieldInverter *fld_inv,
5297
- FrtDocField *df)
5298
- {
5497
+ FrtHash *frt_dw_invert_field(FrtDocWriter *dw, FrtFieldInverter *fld_inv, FrtDocField *df) {
5299
5498
  FrtMemoryPool *mp = dw->mp;
5300
5499
  FrtAnalyzer *a = dw->analyzer;
5301
5500
  FrtHash *curr_plists = dw->curr_plists;
@@ -5311,7 +5510,7 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
5311
5510
  int pos = -1, num_terms = 0;
5312
5511
 
5313
5512
  for (i = 0; i < df_size; i++) {
5314
- FrtTokenStream *ts = frt_a_get_ts(a, df->name, df->data[i]);
5513
+ FrtTokenStream *ts = frt_a_get_ts(a, df->name, df->data[i], df->encodings[i]);
5315
5514
  /* ts->reset(ts, df->data[i]); no longer being called */
5316
5515
  if (store_offsets) {
5317
5516
  while (NULL != (tk = ts->next(ts))) {
@@ -5321,21 +5520,16 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
5321
5520
  if (pos < 0) {
5322
5521
  pos = 0;
5323
5522
  }
5324
- dw_add_posting(mp, curr_plists, fld_plists, doc_num,
5325
- tk->text, tk->len, pos);
5326
- dw_add_offsets(dw, pos,
5327
- start_offset + tk->start,
5328
- start_offset + tk->end);
5523
+ dw_add_posting(mp, curr_plists, fld_plists, doc_num, tk->text, tk->len, pos);
5524
+ dw_add_offsets(dw, pos, start_offset + tk->start, start_offset + tk->end);
5329
5525
  if (num_terms++ >= dw->max_field_length) {
5330
5526
  break;
5331
5527
  }
5332
5528
  }
5333
- }
5334
- else {
5529
+ } else {
5335
5530
  while (NULL != (tk = ts->next(ts))) {
5336
5531
  pos += tk->pos_inc;
5337
- dw_add_posting(mp, curr_plists, fld_plists, doc_num,
5338
- tk->text, tk->len, pos);
5532
+ dw_add_posting(mp, curr_plists, fld_plists, doc_num, tk->text, tk->len, pos);
5339
5533
  if (num_terms++ >= dw->max_field_length) {
5340
5534
  break;
5341
5535
  }
@@ -5345,22 +5539,34 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
5345
5539
  start_offset += df->lengths[i] + 1;
5346
5540
  }
5347
5541
  fld_inv->length = num_terms;
5348
- }
5349
- else {
5542
+ } else {
5350
5543
  char buf[FRT_MAX_WORD_SIZE];
5351
5544
  buf[FRT_MAX_WORD_SIZE - 1] = '\0';
5352
5545
  for (i = 0; i < df_size; i++) {
5353
5546
  int len = df->lengths[i];
5354
5547
  char *data_ptr = df->data[i];
5355
- if (len > FRT_MAX_WORD_SIZE) {
5356
- len = FRT_MAX_WORD_SIZE - 1;
5357
- data_ptr = (char *)memcpy(buf, df->data[i], len);
5548
+ if (df->encodings[i] == utf8_encoding) {
5549
+ if (len >= FRT_MAX_WORD_SIZE) {
5550
+ len = FRT_MAX_WORD_SIZE - 1; // TODO: this may invalidate mbc's
5551
+ data_ptr = (char *)memcpy(buf, df->data[i], len);
5552
+ buf[len] = '\0';
5553
+ }
5554
+ } else if (df->encodings[i] != utf8_encoding) {
5555
+ if (len >= FRT_MAX_WORD_SIZE)
5556
+ len = FRT_MAX_WORD_SIZE - 1;
5557
+ const unsigned char *sp = (unsigned char *)df->data[i];
5558
+ unsigned char *dp = (unsigned char *)&buf;
5559
+ rb_econv_t *ec = rb_econv_open(rb_enc_name(df->encodings[i]), "UTF-8", RUBY_ECONV_INVALID_REPLACE);
5560
+ assert(ec != NULL);
5561
+ rb_econv_convert(ec, &sp, (unsigned char *)df->data[i] + len, &dp, (unsigned char *)&buf + FRT_MAX_WORD_SIZE - 1, 0);
5562
+ rb_econv_close(ec);
5563
+ len = dp - (unsigned char *)&buf;
5564
+ buf[len] = '\0';
5565
+ data_ptr = buf;
5358
5566
  }
5359
- dw_add_posting(mp, curr_plists, fld_plists, doc_num, data_ptr,
5360
- len, i);
5567
+ dw_add_posting(mp, curr_plists, fld_plists, doc_num, data_ptr, len, i);
5361
5568
  if (store_offsets) {
5362
- dw_add_offsets(dw, i, start_offset,
5363
- start_offset + df->lengths[i]);
5569
+ dw_add_offsets(dw, i, start_offset, start_offset + df->lengths[i]);
5364
5570
  }
5365
5571
  start_offset += df->lengths[i] + 1;
5366
5572
  }
@@ -5369,14 +5575,12 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
5369
5575
  return curr_plists;
5370
5576
  }
5371
5577
 
5372
- void frt_dw_reset_postings(FrtHash *postings)
5373
- {
5578
+ void frt_dw_reset_postings(FrtHash *postings) {
5374
5579
  FRT_ZEROSET_N(postings->table, FrtHashEntry, postings->mask + 1);
5375
5580
  postings->fill = postings->size = 0;
5376
5581
  }
5377
5582
 
5378
- void frt_dw_add_doc(FrtDocWriter *dw, FrtDocument *doc)
5379
- {
5583
+ void frt_dw_add_doc(FrtDocWriter *dw, FrtDocument *doc) {
5380
5584
  int i;
5381
5585
  float boost;
5382
5586
  FrtDocField *df;
@@ -5398,16 +5602,12 @@ void frt_dw_add_doc(FrtDocWriter *dw, FrtDocument *doc)
5398
5602
 
5399
5603
  postings = frt_dw_invert_field(dw, fld_inv, df);
5400
5604
  if (fld_inv->store_term_vector) {
5401
- frt_fw_add_postings(dw->fw, fld_inv->fi->number,
5402
- dw_sort_postings(postings), postings->size,
5403
- dw->offsets, dw->offsets_size);
5605
+ frt_fw_add_postings(dw->fw, fld_inv->fi->number, dw_sort_postings(postings), postings->size, dw->offsets, dw->offsets_size);
5404
5606
  }
5405
5607
 
5406
5608
  if (fld_inv->has_norms) {
5407
- boost = fld_inv->fi->boost * doc->boost * df->boost *
5408
- frt_sim_length_norm(dw->similarity, fi->name, fld_inv->length);
5409
- fld_inv->norms[dw->doc_num] =
5410
- frt_sim_encode_norm(dw->similarity, boost);
5609
+ boost = fld_inv->fi->boost * doc->boost * df->boost * frt_sim_length_norm(dw->similarity, fi->name, fld_inv->length);
5610
+ fld_inv->norms[dw->doc_num] = frt_sim_encode_norm(dw->similarity, boost);
5411
5611
  }
5412
5612
  frt_dw_reset_postings(postings);
5413
5613
  if (dw->offsets_size > 0) {
@@ -5960,15 +6160,12 @@ static void iw_commit_compound_file(FrtIndexWriter *iw, FrtSegmentInfo *si)
5960
6160
  iw_create_compound_file(iw->store, iw->fis, si, cfs_name, iw->deleter);
5961
6161
  }
5962
6162
 
5963
- static void iw_merge_segments(FrtIndexWriter *iw, const int min_seg,
5964
- const int max_seg)
5965
- {
6163
+ static void iw_merge_segments(FrtIndexWriter *iw, const int min_seg, const int max_seg) {
5966
6164
  int i;
5967
6165
  FrtSegmentInfos *sis = iw->sis;
5968
6166
  FrtSegmentInfo *si = frt_sis_new_segment(sis, 0, iw->store);
5969
6167
 
5970
- SegmentMerger *merger = sm_create(iw, si, &sis->segs[min_seg],
5971
- max_seg - min_seg);
6168
+ SegmentMerger *merger = sm_create(iw, si, &sis->segs[min_seg], max_seg - min_seg);
5972
6169
 
5973
6170
  /* This is where all the action happens. */
5974
6171
  si->doc_cnt = sm_merge(merger);
@@ -6080,8 +6277,7 @@ void frt_iw_commit(FrtIndexWriter *iw)
6080
6277
  frt_mutex_unlock(&iw->mutex);
6081
6278
  }
6082
6279
 
6083
- void frt_iw_delete_term(FrtIndexWriter *iw, FrtSymbol field, const char *term)
6084
- {
6280
+ void frt_iw_delete_term(FrtIndexWriter *iw, ID field, const char *term) {
6085
6281
  int field_num = frt_fis_get_field_num(iw->fis, field);
6086
6282
  if (field_num >= 0) {
6087
6283
  int i;
@@ -6092,7 +6288,7 @@ void frt_iw_delete_term(FrtIndexWriter *iw, FrtSymbol field, const char *term)
6092
6288
  const int seg_cnt = sis->size;
6093
6289
  bool did_delete = false;
6094
6290
  for (i = 0; i < seg_cnt; i++) {
6095
- FrtIndexReader *ir = sr_open(sis, iw->fis, i, false);
6291
+ FrtIndexReader *ir = sr_open(sis, iw->fis, i, false, NULL);
6096
6292
  FrtTermDocEnum *tde = ir->term_docs(ir);
6097
6293
  ir->deleter = iw->deleter;
6098
6294
  stde_seek(tde, field_num, term);
@@ -6114,9 +6310,7 @@ void frt_iw_delete_term(FrtIndexWriter *iw, FrtSymbol field, const char *term)
6114
6310
  }
6115
6311
  }
6116
6312
 
6117
- void frt_iw_delete_terms(FrtIndexWriter *iw, FrtSymbol field,
6118
- char **terms, const int term_cnt)
6119
- {
6313
+ void frt_iw_delete_terms(FrtIndexWriter *iw, ID field, char **terms, const int term_cnt) {
6120
6314
  int field_num = frt_fis_get_field_num(iw->fis, field);
6121
6315
  if (field_num >= 0) {
6122
6316
  int i;
@@ -6127,7 +6321,7 @@ void frt_iw_delete_terms(FrtIndexWriter *iw, FrtSymbol field,
6127
6321
  const int seg_cnt = sis->size;
6128
6322
  bool did_delete = false;
6129
6323
  for (i = 0; i < seg_cnt; i++) {
6130
- FrtIndexReader *ir = sr_open(sis, iw->fis, i, false);
6324
+ FrtIndexReader *ir = sr_open(sis, iw->fis, i, false, NULL);
6131
6325
  FrtTermDocEnum *tde = ir->term_docs(ir);
6132
6326
  int j;
6133
6327
  for (j = 0 ; j < term_cnt; j++) {
@@ -6196,10 +6390,13 @@ void frt_iw_close(FrtIndexWriter *iw)
6196
6390
  free(iw);
6197
6391
  }
6198
6392
 
6199
- FrtIndexWriter *frt_iw_open(FrtStore *store, FrtAnalyzer *volatile analyzer,
6200
- const FrtConfig *config)
6201
- {
6202
- FrtIndexWriter *iw = FRT_ALLOC_AND_ZERO(FrtIndexWriter);
6393
+ FrtIndexWriter *frt_iw_alloc(void) {
6394
+ return FRT_ALLOC_AND_ZERO(FrtIndexWriter);
6395
+ }
6396
+
6397
+ FrtIndexWriter *frt_iw_open(FrtIndexWriter *iw, FrtStore *store, FrtAnalyzer *volatile analyzer, const FrtConfig *config) {
6398
+ if (iw == NULL)
6399
+ iw = frt_iw_alloc();
6203
6400
  frt_mutex_init(&iw->mutex, NULL);
6204
6401
  iw->store = store;
6205
6402
  if (!config) {
@@ -6230,7 +6427,7 @@ FrtIndexWriter *frt_iw_open(FrtStore *store, FrtAnalyzer *volatile analyzer,
6230
6427
 
6231
6428
  iw->similarity = frt_sim_create_default();
6232
6429
  iw->analyzer = analyzer ? (FrtAnalyzer *)analyzer
6233
- : frt_mb_standard_analyzer_new(true);
6430
+ : frt_standard_analyzer_new(true);
6234
6431
 
6235
6432
  iw->deleter = frt_deleter_new(iw->sis, store);
6236
6433
  deleter_delete_deletable_files(iw->deleter);
@@ -6242,9 +6439,7 @@ FrtIndexWriter *frt_iw_open(FrtStore *store, FrtAnalyzer *volatile analyzer,
6242
6439
  /*******************/
6243
6440
  /*** Add Indexes ***/
6244
6441
  /*******************/
6245
- static void iw_cp_fields(FrtIndexWriter *iw, SegmentReader *sr,
6246
- const char *segment, int *map)
6247
- {
6442
+ static void iw_cp_fields(FrtIndexWriter *iw, FrtSegmentReader *sr, const char *segment, int *map) {
6248
6443
  char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
6249
6444
  FrtOutStream *fdt_out, *fdx_out;
6250
6445
  FrtInStream *fdt_in, *fdx_in;
@@ -6271,7 +6466,6 @@ static void iw_cp_fields(FrtIndexWriter *iw, SegmentReader *sr,
6271
6466
  frt_is2os_copy_bytes(del_in, del_out, frt_is_length(del_in));
6272
6467
  }
6273
6468
 
6274
-
6275
6469
  if (map) {
6276
6470
  int i;
6277
6471
  const int max_doc = sr_max_doc(IR(sr));
@@ -6292,10 +6486,14 @@ static void iw_cp_fields(FrtIndexWriter *iw, SegmentReader *sr,
6292
6486
  frt_os_write_vint(fdt_out, df_size);
6293
6487
  /* sum total lengths of FrtDocField */
6294
6488
  for (k = 0; k < df_size; k++) {
6295
- /* Each field has one ' ' byte so add 1 */
6296
- const int flen = frt_is_read_vint(fdt_in);
6489
+ const int flen = frt_is_read_vint(fdt_in); /* length */
6490
+ const int fenc = frt_is_read_vint(fdt_in); /* encoding */
6491
+ const int fcmp = frt_is_read_vint(fdt_in); /* compression */
6297
6492
  frt_os_write_vint(fdt_out, flen);
6298
- data_len += flen + 1;
6493
+ frt_os_write_vint(fdt_out, fenc);
6494
+ frt_os_write_vint(fdt_out, fcmp);
6495
+ /* Each field has one ' ' byte so add 1 */
6496
+ data_len += flen + 1;
6299
6497
  }
6300
6498
  }
6301
6499
  frt_is2os_copy_bytes(fdt_in, fdt_out, data_len);
@@ -6318,8 +6516,7 @@ static void iw_cp_fields(FrtIndexWriter *iw, SegmentReader *sr,
6318
6516
  frt_os_write_vint(fdt_out, tv_size);
6319
6517
  }
6320
6518
  }
6321
- }
6322
- else {
6519
+ } else {
6323
6520
  frt_is2os_copy_bytes(fdt_in, fdt_out, frt_is_length(fdt_in));
6324
6521
  frt_is2os_copy_bytes(fdx_in, fdx_out, frt_is_length(fdx_in));
6325
6522
  }
@@ -6329,7 +6526,7 @@ static void iw_cp_fields(FrtIndexWriter *iw, SegmentReader *sr,
6329
6526
  frt_os_close(fdx_out);
6330
6527
  }
6331
6528
 
6332
- static void iw_cp_terms(FrtIndexWriter *iw, SegmentReader *sr,
6529
+ static void iw_cp_terms(FrtIndexWriter *iw, FrtSegmentReader *sr,
6333
6530
  const char *segment, int *map)
6334
6531
  {
6335
6532
  char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
@@ -6398,7 +6595,7 @@ static void iw_cp_terms(FrtIndexWriter *iw, SegmentReader *sr,
6398
6595
  frt_os_close(prx_out);
6399
6596
  }
6400
6597
 
6401
- static void iw_cp_norms(FrtIndexWriter *iw, SegmentReader *sr,
6598
+ static void iw_cp_norms(FrtIndexWriter *iw, FrtSegmentReader *sr,
6402
6599
  FrtSegmentInfo *si, int *map)
6403
6600
  {
6404
6601
  int i;
@@ -6429,9 +6626,7 @@ static void iw_cp_norms(FrtIndexWriter *iw, SegmentReader *sr,
6429
6626
  }
6430
6627
  }
6431
6628
 
6432
- static void iw_cp_map_files(FrtIndexWriter *iw, SegmentReader *sr,
6433
- FrtSegmentInfo *si)
6434
- {
6629
+ static void iw_cp_map_files(FrtIndexWriter *iw, FrtSegmentReader *sr, FrtSegmentInfo *si) {
6435
6630
  int i;
6436
6631
  FrtFieldInfos *from_fis = IR(sr)->fis;
6437
6632
  FrtFieldInfos *to_fis = iw->fis;
@@ -6449,15 +6644,13 @@ static void iw_cp_map_files(FrtIndexWriter *iw, SegmentReader *sr,
6449
6644
  free(field_map);
6450
6645
  }
6451
6646
 
6452
- static void iw_cp_files(FrtIndexWriter *iw, SegmentReader *sr,
6453
- FrtSegmentInfo *si)
6454
- {
6647
+ static void iw_cp_files(FrtIndexWriter *iw, FrtSegmentReader *sr, FrtSegmentInfo *si) {
6455
6648
  iw_cp_fields(iw, sr, si->name, NULL);
6456
6649
  iw_cp_terms( iw, sr, si->name, NULL);
6457
6650
  iw_cp_norms( iw, sr, si, NULL);
6458
6651
  }
6459
6652
 
6460
- static void iw_add_segment(FrtIndexWriter *iw, SegmentReader *sr)
6653
+ static void iw_add_segment(FrtIndexWriter *iw, FrtSegmentReader *sr)
6461
6654
  {
6462
6655
  FrtSegmentInfo *si = frt_sis_new_segment(iw->sis, 0, iw->store);
6463
6656
  FrtFieldInfos *fis = iw->fis;
@@ -6472,7 +6665,7 @@ static void iw_add_segment(FrtIndexWriter *iw, SegmentReader *sr)
6472
6665
  FrtFieldInfo *fi = sub_fis->fields[j];
6473
6666
  FrtFieldInfo *new_fi = frt_fis_get_field(fis, fi->name);
6474
6667
  if (NULL == new_fi) {
6475
- new_fi = frt_fi_new(fi->name, FRT_STORE_NO, FRT_INDEX_NO, FRT_TERM_VECTOR_NO);
6668
+ new_fi = frt_fi_new(fi->name, FRT_STORE_NO, FRT_COMPRESSION_NONE, FRT_INDEX_NO, FRT_TERM_VECTOR_NO);
6476
6669
  new_fi->bits = fi->bits;
6477
6670
  frt_fis_add_field(fis, new_fi);
6478
6671
  }