isomorfeus-ferret 0.12.6 → 0.13.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (249)
  1. checksums.yaml +4 -4
  2. data/LICENSE +101 -19
  3. data/README.md +85 -16
  4. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +22 -30
  5. data/ext/isomorfeus_ferret_ext/bm_hash.c +6 -12
  6. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +3 -6
  7. data/ext/isomorfeus_ferret_ext/bm_store.c +11 -22
  8. data/ext/isomorfeus_ferret_ext/brotli_common_constants.c +15 -0
  9. data/ext/isomorfeus_ferret_ext/brotli_common_constants.h +200 -0
  10. data/ext/isomorfeus_ferret_ext/brotli_common_context.c +156 -0
  11. data/ext/isomorfeus_ferret_ext/brotli_common_context.h +113 -0
  12. data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +5914 -0
  13. data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.h +64 -0
  14. data/ext/isomorfeus_ferret_ext/brotli_common_platform.c +22 -0
  15. data/ext/isomorfeus_ferret_ext/brotli_common_platform.h +594 -0
  16. data/ext/isomorfeus_ferret_ext/brotli_common_transform.c +291 -0
  17. data/ext/isomorfeus_ferret_ext/brotli_common_transform.h +85 -0
  18. data/ext/isomorfeus_ferret_ext/brotli_common_version.h +26 -0
  19. data/ext/isomorfeus_ferret_ext/brotli_dec_bit_reader.c +76 -0
  20. data/ext/isomorfeus_ferret_ext/brotli_dec_bit_reader.h +351 -0
  21. data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +2608 -0
  22. data/ext/isomorfeus_ferret_ext/brotli_dec_huffman.c +339 -0
  23. data/ext/isomorfeus_ferret_ext/brotli_dec_huffman.h +121 -0
  24. data/ext/isomorfeus_ferret_ext/brotli_dec_prefix.h +732 -0
  25. data/ext/isomorfeus_ferret_ext/brotli_dec_state.c +159 -0
  26. data/ext/isomorfeus_ferret_ext/brotli_dec_state.h +365 -0
  27. data/ext/isomorfeus_ferret_ext/brotli_decode.h +344 -0
  28. data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references.c +145 -0
  29. data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references.h +39 -0
  30. data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_hq.c +843 -0
  31. data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_hq.h +95 -0
  32. data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_inc.h +163 -0
  33. data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost.c +35 -0
  34. data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost.h +63 -0
  35. data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost_inc.h +127 -0
  36. data/ext/isomorfeus_ferret_ext/brotli_enc_block_encoder_inc.h +34 -0
  37. data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter.c +194 -0
  38. data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter.h +51 -0
  39. data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter_inc.h +440 -0
  40. data/ext/isomorfeus_ferret_ext/brotli_enc_brotli_bit_stream.c +1314 -0
  41. data/ext/isomorfeus_ferret_ext/brotli_enc_brotli_bit_stream.h +84 -0
  42. data/ext/isomorfeus_ferret_ext/brotli_enc_cluster.c +56 -0
  43. data/ext/isomorfeus_ferret_ext/brotli_enc_cluster.h +48 -0
  44. data/ext/isomorfeus_ferret_ext/brotli_enc_cluster_inc.h +320 -0
  45. data/ext/isomorfeus_ferret_ext/brotli_enc_command.c +28 -0
  46. data/ext/isomorfeus_ferret_ext/brotli_enc_command.h +190 -0
  47. data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment.c +790 -0
  48. data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment.h +61 -0
  49. data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment_two_pass.c +645 -0
  50. data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment_two_pass.h +54 -0
  51. data/ext/isomorfeus_ferret_ext/brotli_enc_dictionary_hash.c +1846 -0
  52. data/ext/isomorfeus_ferret_ext/brotli_enc_dictionary_hash.h +25 -0
  53. data/ext/isomorfeus_ferret_ext/brotli_enc_encode.c +1927 -0
  54. data/ext/isomorfeus_ferret_ext/brotli_enc_encoder_dict.c +33 -0
  55. data/ext/isomorfeus_ferret_ext/brotli_enc_encoder_dict.h +43 -0
  56. data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode.c +503 -0
  57. data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode.h +122 -0
  58. data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode_static.h +539 -0
  59. data/ext/isomorfeus_ferret_ext/brotli_enc_fast_log.c +105 -0
  60. data/ext/isomorfeus_ferret_ext/brotli_enc_fast_log.h +66 -0
  61. data/ext/isomorfeus_ferret_ext/brotli_enc_find_match_length.h +79 -0
  62. data/ext/isomorfeus_ferret_ext/brotli_enc_hash.h +488 -0
  63. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_composite_inc.h +125 -0
  64. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_forgetful_chain_inc.h +293 -0
  65. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match64_inc.h +267 -0
  66. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match_inc.h +262 -0
  67. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match_quickly_inc.h +266 -0
  68. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_rolling_inc.h +212 -0
  69. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_to_binary_tree_inc.h +329 -0
  70. data/ext/isomorfeus_ferret_ext/brotli_enc_histogram.c +100 -0
  71. data/ext/isomorfeus_ferret_ext/brotli_enc_histogram.h +63 -0
  72. data/ext/isomorfeus_ferret_ext/brotli_enc_histogram_inc.h +51 -0
  73. data/ext/isomorfeus_ferret_ext/brotli_enc_literal_cost.c +175 -0
  74. data/ext/isomorfeus_ferret_ext/brotli_enc_literal_cost.h +30 -0
  75. data/ext/isomorfeus_ferret_ext/brotli_enc_memory.c +170 -0
  76. data/ext/isomorfeus_ferret_ext/brotli_enc_memory.h +114 -0
  77. data/ext/isomorfeus_ferret_ext/brotli_enc_metablock.c +663 -0
  78. data/ext/isomorfeus_ferret_ext/brotli_enc_metablock.h +105 -0
  79. data/ext/isomorfeus_ferret_ext/brotli_enc_metablock_inc.h +183 -0
  80. data/ext/isomorfeus_ferret_ext/brotli_enc_params.h +46 -0
  81. data/ext/isomorfeus_ferret_ext/brotli_enc_prefix.h +53 -0
  82. data/ext/isomorfeus_ferret_ext/brotli_enc_quality.h +165 -0
  83. data/ext/isomorfeus_ferret_ext/brotli_enc_ringbuffer.h +167 -0
  84. data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict.c +486 -0
  85. data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict.h +40 -0
  86. data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict_lut.h +5864 -0
  87. data/ext/isomorfeus_ferret_ext/brotli_enc_utf8_util.c +85 -0
  88. data/ext/isomorfeus_ferret_ext/brotli_enc_utf8_util.h +32 -0
  89. data/ext/isomorfeus_ferret_ext/brotli_enc_write_bits.h +87 -0
  90. data/ext/isomorfeus_ferret_ext/brotli_encode.h +448 -0
  91. data/ext/isomorfeus_ferret_ext/brotli_port.h +288 -0
  92. data/ext/isomorfeus_ferret_ext/brotli_types.h +83 -0
  93. data/ext/isomorfeus_ferret_ext/bzlib.c +1572 -0
  94. data/ext/isomorfeus_ferret_ext/bzlib.h +282 -0
  95. data/ext/isomorfeus_ferret_ext/bzlib_blocksort.c +1094 -0
  96. data/ext/isomorfeus_ferret_ext/bzlib_compress.c +672 -0
  97. data/ext/isomorfeus_ferret_ext/bzlib_crctable.c +104 -0
  98. data/ext/isomorfeus_ferret_ext/bzlib_decompress.c +652 -0
  99. data/ext/isomorfeus_ferret_ext/bzlib_huffman.c +205 -0
  100. data/ext/isomorfeus_ferret_ext/bzlib_private.h +509 -0
  101. data/ext/isomorfeus_ferret_ext/bzlib_randtable.c +84 -0
  102. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -53
  103. data/ext/isomorfeus_ferret_ext/frb_analysis.c +785 -1192
  104. data/ext/isomorfeus_ferret_ext/frb_index.c +513 -464
  105. data/ext/isomorfeus_ferret_ext/frb_qparser.c +48 -60
  106. data/ext/isomorfeus_ferret_ext/frb_search.c +1520 -1002
  107. data/ext/isomorfeus_ferret_ext/frb_store.c +96 -96
  108. data/ext/isomorfeus_ferret_ext/frb_threading.h +0 -1
  109. data/ext/isomorfeus_ferret_ext/frb_utils.c +147 -196
  110. data/ext/isomorfeus_ferret_ext/frt_analysis.c +695 -1090
  111. data/ext/isomorfeus_ferret_ext/frt_analysis.h +174 -170
  112. data/ext/isomorfeus_ferret_ext/frt_array.c +2 -4
  113. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +9 -16
  114. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +32 -81
  115. data/ext/isomorfeus_ferret_ext/frt_document.c +15 -20
  116. data/ext/isomorfeus_ferret_ext/frt_document.h +10 -9
  117. data/ext/isomorfeus_ferret_ext/frt_except.c +5 -12
  118. data/ext/isomorfeus_ferret_ext/frt_field_index.c +3 -3
  119. data/ext/isomorfeus_ferret_ext/frt_field_index.h +6 -7
  120. data/ext/isomorfeus_ferret_ext/frt_filter.c +35 -46
  121. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +2 -0
  122. data/ext/isomorfeus_ferret_ext/frt_global.c +91 -200
  123. data/ext/isomorfeus_ferret_ext/frt_global.h +7 -18
  124. data/ext/isomorfeus_ferret_ext/frt_hash.c +1 -2
  125. data/ext/isomorfeus_ferret_ext/frt_ind.c +32 -35
  126. data/ext/isomorfeus_ferret_ext/frt_ind.h +9 -9
  127. data/ext/isomorfeus_ferret_ext/frt_index.c +714 -384
  128. data/ext/isomorfeus_ferret_ext/frt_index.h +274 -290
  129. data/ext/isomorfeus_ferret_ext/frt_lang.c +0 -2
  130. data/ext/isomorfeus_ferret_ext/frt_mempool.c +1 -2
  131. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +4 -7
  132. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +68 -91
  133. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +35 -38
  134. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +53 -72
  135. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +25 -32
  136. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +21 -23
  137. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +66 -103
  138. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +207 -195
  139. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +20 -16
  140. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +17 -14
  141. data/ext/isomorfeus_ferret_ext/frt_q_range.c +102 -131
  142. data/ext/isomorfeus_ferret_ext/frt_q_span.c +179 -178
  143. data/ext/isomorfeus_ferret_ext/frt_q_term.c +47 -60
  144. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +18 -16
  145. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +46 -84
  146. data/ext/isomorfeus_ferret_ext/frt_search.c +105 -146
  147. data/ext/isomorfeus_ferret_ext/frt_search.h +331 -320
  148. data/ext/isomorfeus_ferret_ext/frt_similarity.c +5 -13
  149. data/ext/isomorfeus_ferret_ext/frt_similarity.h +7 -12
  150. data/ext/isomorfeus_ferret_ext/frt_sort.c +105 -149
  151. data/ext/isomorfeus_ferret_ext/frt_store.c +13 -7
  152. data/ext/isomorfeus_ferret_ext/frt_store.h +10 -2
  153. data/ext/isomorfeus_ferret_ext/frt_threading.h +0 -1
  154. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +22 -112
  155. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +2 -32
  156. data/ext/isomorfeus_ferret_ext/lz4.c +2495 -0
  157. data/ext/isomorfeus_ferret_ext/lz4.h +774 -0
  158. data/ext/isomorfeus_ferret_ext/lz4frame.c +1899 -0
  159. data/ext/isomorfeus_ferret_ext/lz4frame.h +623 -0
  160. data/ext/isomorfeus_ferret_ext/lz4hc.c +1615 -0
  161. data/ext/isomorfeus_ferret_ext/lz4hc.h +413 -0
  162. data/ext/isomorfeus_ferret_ext/lz4xxhash.c +1030 -0
  163. data/ext/isomorfeus_ferret_ext/lz4xxhash.h +328 -0
  164. data/ext/isomorfeus_ferret_ext/stem_modules.h +0 -86
  165. data/ext/isomorfeus_ferret_ext/test.c +0 -17
  166. data/ext/isomorfeus_ferret_ext/test_1710.c +11 -12
  167. data/ext/isomorfeus_ferret_ext/test_analysis.c +590 -583
  168. data/ext/isomorfeus_ferret_ext/test_compound_io.c +1 -1
  169. data/ext/isomorfeus_ferret_ext/test_document.c +19 -15
  170. data/ext/isomorfeus_ferret_ext/test_except.c +1 -2
  171. data/ext/isomorfeus_ferret_ext/test_fields.c +111 -100
  172. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +10 -27
  173. data/ext/isomorfeus_ferret_ext/test_filter.c +11 -8
  174. data/ext/isomorfeus_ferret_ext/test_global.c +0 -46
  175. data/ext/isomorfeus_ferret_ext/test_hash.c +2 -2
  176. data/ext/isomorfeus_ferret_ext/test_hashset.c +1 -1
  177. data/ext/isomorfeus_ferret_ext/test_highlighter.c +15 -11
  178. data/ext/isomorfeus_ferret_ext/test_index.c +373 -363
  179. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +5 -3
  180. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +5 -3
  181. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +13 -10
  182. data/ext/isomorfeus_ferret_ext/test_q_parser.c +45 -7
  183. data/ext/isomorfeus_ferret_ext/test_q_span.c +15 -12
  184. data/ext/isomorfeus_ferret_ext/test_ram_store.c +3 -3
  185. data/ext/isomorfeus_ferret_ext/test_search.c +60 -64
  186. data/ext/isomorfeus_ferret_ext/test_segments.c +5 -4
  187. data/ext/isomorfeus_ferret_ext/test_sort.c +17 -14
  188. data/ext/isomorfeus_ferret_ext/test_store.c +2 -0
  189. data/ext/isomorfeus_ferret_ext/test_term.c +3 -1
  190. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +9 -10
  191. data/ext/isomorfeus_ferret_ext/test_test.c +1 -2
  192. data/ext/isomorfeus_ferret_ext/test_threading.c +9 -10
  193. data/ext/isomorfeus_ferret_ext/testhelper.c +1 -2
  194. data/lib/isomorfeus/ferret/version.rb +1 -1
  195. metadata +113 -58
  196. data/ext/isomorfeus_ferret_ext/email.rl +0 -21
  197. data/ext/isomorfeus_ferret_ext/frt_scanner.c +0 -900
  198. data/ext/isomorfeus_ferret_ext/frt_scanner.h +0 -28
  199. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +0 -6706
  200. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +0 -4420
  201. data/ext/isomorfeus_ferret_ext/scanner.h +0 -28
  202. data/ext/isomorfeus_ferret_ext/scanner.in +0 -43
  203. data/ext/isomorfeus_ferret_ext/scanner.rl +0 -84
  204. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +0 -200
  205. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +0 -85
  206. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +0 -1167
  207. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +0 -6
  208. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +0 -1433
  209. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +0 -6
  210. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +0 -301
  211. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +0 -6
  212. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +0 -590
  213. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +0 -6
  214. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +0 -1049
  215. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +0 -6
  216. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +0 -705
  217. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +0 -6
  218. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +0 -1239
  219. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +0 -6
  220. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +0 -477
  221. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +0 -6
  222. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +0 -1217
  223. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +0 -7
  224. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +0 -394
  225. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +0 -6
  226. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +0 -457
  227. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +0 -6
  228. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +0 -1009
  229. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +0 -6
  230. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +0 -259
  231. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +0 -6
  232. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +0 -704
  233. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +0 -6
  234. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +0 -948
  235. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +0 -6
  236. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +0 -1028
  237. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +0 -6
  238. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +0 -275
  239. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +0 -6
  240. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +0 -849
  241. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +0 -6
  242. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +0 -952
  243. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +0 -6
  244. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +0 -669
  245. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +0 -6
  246. data/ext/isomorfeus_ferret_ext/stem_modules.txt +0 -63
  247. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +0 -1854
  248. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +0 -1999
  249. data/ext/isomorfeus_ferret_ext/url.rl +0 -27
@@ -6,6 +6,13 @@
6
6
  #include <string.h>
7
7
  #include <limits.h>
8
8
  #include <ctype.h>
9
+ #include "brotli_decode.h"
10
+ #include "brotli_encode.h"
11
+ #include "bzlib.h"
12
+ #include "lz4frame.h"
13
+
14
+ #undef close
15
+ #undef read
9
16
 
10
17
  extern void frt_micro_sleep(const int micro_seconds);
11
18
 
@@ -39,8 +46,9 @@ static char *ste_next(FrtTermEnum *te);
39
46
  #define FORMAT 0
40
47
  #define SEGMENTS_GEN_FILE_NAME "segments"
41
48
  #define MAX_EXT_LEN 10
42
- #define ZIP_BUFFER_SIZE 16348
43
- #define ZIP_LEVEL 9
49
+ #define FRT_COMPRESSION_BUFFER_SIZE 16348
50
+ #define FRT_BROTLI_COMPRESSION_LEVEL 4
51
+ #define FRT_BZIP_COMPRESSION_LEVEL 9
44
52
 
45
53
  /* *** Must be three characters *** */
46
54
  static const char *INDEX_EXTENSIONS[] = {
@@ -101,29 +109,22 @@ static frt_u64 str36_to_u64(char *p)
101
109
  * @param ext extension of the filename (including .)
102
110
  * @param gen generation
103
111
  */
104
- char *frt_fn_for_generation(char *buf,
105
- const char *base,
106
- const char *ext,
107
- frt_i64 gen)
108
- {
112
+ char *frt_fn_for_generation(char *buf, const char *base, const char *ext, frt_i64 gen) {
109
113
  if (-1 == gen) {
110
114
  return NULL;
111
- }
112
- else {
115
+ } else {
113
116
  char b[FRT_SEGMENT_NAME_MAX_LENGTH];
114
117
  char *u = u64_to_str36(b, FRT_SEGMENT_NAME_MAX_LENGTH, (frt_u64)gen);
115
118
  if (ext == NULL) {
116
119
  sprintf(buf, "%s_%s", base, u);
117
- }
118
- else {
120
+ } else {
119
121
  sprintf(buf, "%s_%s.%s", base, u, ext);
120
122
  }
121
123
  return buf;
122
124
  }
123
125
  }
124
126
 
125
- static char *segfn_for_generation(char *buf, frt_u64 generation)
126
- {
127
+ static char *segfn_for_generation(char *buf, frt_u64 generation) {
127
128
  char b[FRT_SEGMENT_NAME_MAX_LENGTH];
128
129
  char *u = u64_to_str36(b, FRT_SEGMENT_NAME_MAX_LENGTH, generation);
129
130
  sprintf(buf, FRT_SEGMENTS_FILE_NAME"_%s", u);
@@ -201,8 +202,7 @@ FrtCacheObject *frt_co_create(FrtHash *ref_tab1, FrtHash *ref_tab2,
201
202
  return self;
202
203
  }
203
204
 
204
- FrtHash *frt_co_hash_create()
205
- {
205
+ FrtHash *frt_co_hash_create(void) {
206
206
  return frt_h_new(&co_hash, &co_eq, (frt_free_ft)NULL, (frt_free_ft)&co_destroy);
207
207
  }
208
208
 
@@ -212,8 +212,7 @@ FrtHash *frt_co_hash_create()
212
212
  *
213
213
  ****************************************************************************/
214
214
 
215
- static void fi_set_store(FrtFieldInfo *fi, int store)
216
- {
215
+ static void fi_set_store(FrtFieldInfo *fi, FrtStoreValue store) {
217
216
  switch (store) {
218
217
  case FRT_STORE_NO:
219
218
  break;
@@ -223,8 +222,23 @@ static void fi_set_store(FrtFieldInfo *fi, int store)
223
222
  }
224
223
  }
225
224
 
226
- static void fi_set_index(FrtFieldInfo *fi, int index)
227
- {
225
+ static void fi_set_compression(FrtFieldInfo *fi, FrtCompressionType compression) {
226
+ switch (compression) {
227
+ case FRT_COMPRESSION_NONE:
228
+ break;
229
+ case FRT_COMPRESSION_BROTLI:
230
+ fi->bits |= FRT_FI_IS_COMPRESSED_BM | FRT_FI_COMPRESSION_BROTLI_BM;
231
+ break;
232
+ case FRT_COMPRESSION_BZ2:
233
+ fi->bits |= FRT_FI_IS_COMPRESSED_BM | FRT_FI_COMPRESSION_BZ2_BM;
234
+ break;
235
+ case FRT_COMPRESSION_LZ4:
236
+ fi->bits |= FRT_FI_IS_COMPRESSED_BM | FRT_FI_COMPRESSION_LZ4_BM;
237
+ break;
238
+ }
239
+ }
240
+
241
+ static void fi_set_index(FrtFieldInfo *fi, FrtIndexValue index) {
228
242
  switch (index) {
229
243
  case FRT_INDEX_NO:
230
244
  break;
@@ -244,8 +258,7 @@ static void fi_set_index(FrtFieldInfo *fi, int index)
244
258
  }
245
259
  }
246
260
 
247
- static void fi_set_term_vector(FrtFieldInfo *fi, int term_vector)
248
- {
261
+ static void fi_set_term_vector(FrtFieldInfo *fi, FrtTermVectorValue term_vector) {
249
262
  switch (term_vector) {
250
263
  case FRT_TERM_VECTOR_NO:
251
264
  break;
@@ -265,33 +278,40 @@ static void fi_set_term_vector(FrtFieldInfo *fi, int term_vector)
265
278
  }
266
279
  }
267
280
 
268
- static void fi_check_params(int store, int index, int term_vector)
269
- {
281
+ static void fi_check_params(FrtStoreValue store, FrtCompressionType compression, FrtIndexValue index, FrtTermVectorValue term_vector) {
270
282
  (void)store;
271
283
  if ((index == FRT_INDEX_NO) && (term_vector != FRT_TERM_VECTOR_NO)) {
272
- FRT_RAISE(FRT_ARG_ERROR,
273
- "You can't store the term vectors of an unindexed field");
284
+ FRT_RAISE(FRT_ARG_ERROR, "You can't store the term vectors of an unindexed field.");
285
+ }
286
+ if ((compression != FRT_COMPRESSION_NONE) && (store == FRT_STORE_NO)) {
287
+ FRT_RAISE(FRT_ARG_ERROR, "Field must be stored for compression to be useful.");
274
288
  }
275
289
  }
276
290
 
277
- FrtFieldInfo *frt_fi_new(FrtSymbol name,
278
- FrtStoreValue store,
279
- FrtIndexValue index,
280
- FrtTermVectorValue term_vector)
281
- {
282
- FrtFieldInfo *fi = FRT_ALLOC(FrtFieldInfo);
291
+ FrtFieldInfo *frt_fi_alloc(void) {
292
+ return FRT_ALLOC(FrtFieldInfo);
293
+ }
294
+
295
+ FrtFieldInfo *frt_fi_init(FrtFieldInfo *fi, ID name, FrtStoreValue store, FrtCompressionType compression, FrtIndexValue index, FrtTermVectorValue term_vector) {
283
296
  assert(NULL != name);
284
- fi_check_params(store, index, term_vector);
297
+ fi_check_params(store, compression, index, term_vector);
285
298
  fi->name = name;
286
299
  fi->boost = 1.0f;
287
300
  fi->bits = 0;
288
301
  fi_set_store(fi, store);
302
+ fi_set_compression(fi, compression);
289
303
  fi_set_index(fi, index);
290
304
  fi_set_term_vector(fi, term_vector);
291
305
  fi->ref_cnt = 1;
306
+ fi->rfi = Qnil;
292
307
  return fi;
293
308
  }
294
309
 
310
+ FrtFieldInfo *frt_fi_new(ID name, FrtStoreValue store, FrtCompressionType compression, FrtIndexValue index, FrtTermVectorValue term_vector) {
311
+ FrtFieldInfo *fi = frt_fi_alloc();
312
+ return frt_fi_init(fi, name, store, compression, index, term_vector);
313
+ }
314
+
295
315
  void frt_fi_deref(FrtFieldInfo *fi)
296
316
  {
297
317
  if (0 == --(fi->ref_cnt)) {
@@ -299,13 +319,30 @@ void frt_fi_deref(FrtFieldInfo *fi)
299
319
  }
300
320
  }
301
321
 
322
+ FrtCompressionType frt_fi_get_compression(FrtFieldInfo *fi) {
323
+ if (fi_is_compressed(fi)) {
324
+ if (fi_is_compressed_brotli(fi)) {
325
+ return FRT_COMPRESSION_BROTLI;
326
+ } else if (fi_is_compressed_bz2(fi)) {
327
+ return FRT_COMPRESSION_BZ2;
328
+ } else if (fi_is_compressed_lz4(fi)) {
329
+ return FRT_COMPRESSION_LZ4;
330
+ } else {
331
+ return FRT_COMPRESSION_BROTLI;
332
+ }
333
+ } else {
334
+ return FRT_COMPRESSION_NONE;
335
+ }
336
+ }
337
+
302
338
  char *frt_fi_to_s(FrtFieldInfo *fi)
303
339
  {
304
340
  const char *fi_name = rb_id2name(fi->name);
305
341
  char *str = FRT_ALLOC_N(char, strlen(fi_name) + 200);
306
342
  char *s = str;
307
- s += sprintf(str, "[\"%s\":(%s%s%s%s%s%s%s", fi_name,
343
+ s += sprintf(str, "[\"%s\":(%s%s%s%s%s%s%s%s", fi_name,
308
344
  fi_is_stored(fi) ? "is_stored, " : "",
345
+ fi_is_compressed(fi) ? "is_compressed, " : "",
309
346
  fi_is_indexed(fi) ? "is_indexed, " : "",
310
347
  fi_is_tokenized(fi) ? "is_tokenized, " : "",
311
348
  fi_omit_norms(fi) ? "omit_norms, " : "",
@@ -327,24 +364,31 @@ char *frt_fi_to_s(FrtFieldInfo *fi)
327
364
  *
328
365
  ****************************************************************************/
329
366
 
330
- FrtFieldInfos *frt_fis_new(FrtStoreValue store, FrtIndexValue index,
331
- FrtTermVectorValue term_vector)
332
- {
333
- FrtFieldInfos *fis = FRT_ALLOC(FrtFieldInfos);
334
- fi_check_params(store, index, term_vector);
367
+ FrtFieldInfos *frt_fis_alloc(void) {
368
+ return FRT_ALLOC(FrtFieldInfos);
369
+ }
370
+
371
+ FrtFieldInfos *frt_fis_init(FrtFieldInfos *fis, FrtStoreValue store, FrtCompressionType compression, FrtIndexValue index, FrtTermVectorValue term_vector) {
372
+ fi_check_params(store, compression, index, term_vector);
335
373
  fis->field_dict = frt_h_new_ptr((frt_free_ft)&frt_fi_deref);
336
374
  fis->size = 0;
337
375
  fis->capa = FIELD_INFOS_INIT_CAPA;
338
376
  fis->fields = FRT_ALLOC_N(FrtFieldInfo *, fis->capa);
339
377
  fis->store = store;
378
+ fis->compression = compression;
340
379
  fis->index = index;
341
380
  fis->term_vector = term_vector;
342
381
  fis->ref_cnt = 1;
382
+ fis->rfis = Qnil;
343
383
  return fis;
344
384
  }
345
385
 
346
- FrtFieldInfo *frt_fis_add_field(FrtFieldInfos *fis, FrtFieldInfo *fi)
347
- {
386
+ FrtFieldInfos *frt_fis_new(FrtStoreValue store, FrtCompressionType compression, FrtIndexValue index, FrtTermVectorValue term_vector) {
387
+ FrtFieldInfos *fis = frt_fis_alloc();
388
+ return frt_fis_init(fis, store, compression, index, term_vector);
389
+ }
390
+
391
+ FrtFieldInfo *frt_fis_add_field(FrtFieldInfos *fis, FrtFieldInfo *fi) {
348
392
  if (fis->size == fis->capa) {
349
393
  fis->capa <<= 1;
350
394
  FRT_REALLOC_N(fis->fields, FrtFieldInfo *, fis->capa);
@@ -358,23 +402,20 @@ FrtFieldInfo *frt_fis_add_field(FrtFieldInfos *fis, FrtFieldInfo *fi)
358
402
  return fi;
359
403
  }
360
404
 
361
- FrtFieldInfo *frt_fis_get_field(FrtFieldInfos *fis, FrtSymbol name)
362
- {
405
+ FrtFieldInfo *frt_fis_get_field(FrtFieldInfos *fis, ID name) {
363
406
  return (FrtFieldInfo *)frt_h_get(fis->field_dict, (void *)name);
364
407
  }
365
408
 
366
- int frt_fis_get_field_num(FrtFieldInfos *fis, FrtSymbol name)
367
- {
409
+ int frt_fis_get_field_num(FrtFieldInfos *fis, ID name) {
368
410
  FrtFieldInfo *fi = (FrtFieldInfo *)frt_h_get(fis->field_dict, (void *)name);
369
411
  if (fi) { return fi->number; }
370
412
  else { return -1; }
371
413
  }
372
414
 
373
- FrtFieldInfo *frt_fis_get_or_add_field(FrtFieldInfos *fis, FrtSymbol name)
374
- {
415
+ FrtFieldInfo *frt_fis_get_or_add_field(FrtFieldInfos *fis, ID name) {
375
416
  FrtFieldInfo *fi = (FrtFieldInfo *)frt_h_get(fis->field_dict, (void *)name);
376
417
  if (!fi) {
377
- fi = (FrtFieldInfo*)frt_fi_new(name, fis->store, fis->index, fis->term_vector);
418
+ fi = (FrtFieldInfo*)frt_fi_new(name, fis->store, fis->compression, fis->index, fis->term_vector);
378
419
  frt_fis_add_field(fis, fi);
379
420
  }
380
421
  return fi;
@@ -386,16 +427,14 @@ FrtFieldInfos *frt_fis_read(FrtInStream *is)
386
427
  char *field_name;
387
428
  FRT_TRY
388
429
  do {
389
- FrtStoreValue store_val;
390
- FrtIndexValue index_val;
391
430
  FrtTermVectorValue term_vector_val;
392
431
  volatile int i;
393
432
  union { frt_u32 i; float f; } tmp;
394
433
  FrtFieldInfo *volatile fi;
395
- store_val = (FrtStoreValue)frt_is_read_vint(is);
396
- index_val = (FrtIndexValue)frt_is_read_vint(is);
434
+ FrtStoreValue store_val = (FrtStoreValue)frt_is_read_vint(is);
435
+ FrtIndexValue index_val = (FrtIndexValue)frt_is_read_vint(is);
397
436
  term_vector_val = (FrtTermVectorValue)frt_is_read_vint(is);
398
- fis = frt_fis_new(store_val, index_val, term_vector_val);
437
+ fis = frt_fis_new(store_val, FRT_COMPRESSION_NONE, index_val, term_vector_val); // TODO compression, read from store?
399
438
  for (i = frt_is_read_vint(is); i > 0; i--) {
400
439
  fi = FRT_ALLOC_AND_ZERO(FrtFieldInfo);
401
440
  FRT_TRY
@@ -443,7 +482,8 @@ void frt_fis_write(FrtFieldInfos *fis, FrtOutStream *os)
443
482
  static const char *store_str[] = {
444
483
  ":no",
445
484
  ":yes",
446
- ""
485
+ "",
486
+ ":compressed"
447
487
  };
448
488
 
449
489
  static const char *fi_store_str(FrtFieldInfo *fi)
@@ -796,8 +836,7 @@ static char *sis_next_seg_file_name(char *buf, FrtStore *store)
796
836
 
797
837
  #define GEN_FILE_RETRY_COUNT 10
798
838
  #define GEN_LOOK_AHEAD_COUNT 10
799
- static void sis_find_segments_file(FrtStore *store, FindSegmentsFile *fsf,
800
- void (*run)(FrtStore *store, FindSegmentsFile *fsf))
839
+ static void sis_find_segments_file(FrtStore *store, FindSegmentsFile *fsf, void (*run)(FrtStore *store, FindSegmentsFile *fsf, FrtIndexReader *ir), FrtIndexReader *ir)
801
840
  {
802
841
  volatile int i;
803
842
  volatile int gen_look_ahead_count = 0;
@@ -904,7 +943,7 @@ static void sis_find_segments_file(FrtStore *store, FindSegmentsFile *fsf,
904
943
  last_gen = gen;
905
944
  FRT_TRY
906
945
  fsf->generation = gen;
907
- run(store, fsf);
946
+ run(store, fsf, ir);
908
947
  FRT_RETURN_EARLY();
909
948
  return;
910
949
  case FRT_IO_ERROR: case FRT_FILE_NOT_FOUND_ERROR: case FRT_EOF_ERROR:
@@ -950,7 +989,7 @@ static void sis_find_segments_file(FrtStore *store, FindSegmentsFile *fsf,
950
989
  * prevSegmentFileName + "'" */
951
990
  FRT_TRY
952
991
  fsf->generation = gen - 1;
953
- run(store, fsf);
992
+ run(store, fsf, ir);
954
993
  /* TODO:LOG "success on fallback " +
955
994
  * prev_seg_file_name */
956
995
 
@@ -1033,7 +1072,7 @@ void frt_sis_del_from_to(FrtSegmentInfos *sis, int from, int to)
1033
1072
  }
1034
1073
  }
1035
1074
 
1036
- static void frt_sis_read_i(FrtStore *store, FindSegmentsFile *fsf)
1075
+ static void frt_sis_read_i(FrtStore *store, FindSegmentsFile *fsf, FrtIndexReader *ir_)
1037
1076
  {
1038
1077
  int seg_cnt;
1039
1078
  int i;
@@ -1072,7 +1111,7 @@ static void frt_sis_read_i(FrtStore *store, FindSegmentsFile *fsf)
1072
1111
  FrtSegmentInfos *frt_sis_read(FrtStore *store)
1073
1112
  {
1074
1113
  FindSegmentsFile fsf;
1075
- sis_find_segments_file(store, &fsf, &frt_sis_read_i);
1114
+ sis_find_segments_file(store, &fsf, &frt_sis_read_i, NULL);
1076
1115
  return fsf.ret.sis;
1077
1116
  }
1078
1117
 
@@ -1112,7 +1151,7 @@ void frt_sis_write(FrtSegmentInfos *sis, FrtStore *store, FrtDeleter *deleter)
1112
1151
  }
1113
1152
  }
1114
1153
 
1115
- static void frt_sis_read_ver_i(FrtStore *store, FindSegmentsFile *fsf)
1154
+ static void frt_sis_read_ver_i(FrtStore *store, FindSegmentsFile *fsf, FrtIndexReader *ir_)
1116
1155
  {
1117
1156
  FrtInStream *is;
1118
1157
  frt_u64 version;
@@ -1135,7 +1174,7 @@ static void frt_sis_read_ver_i(FrtStore *store, FindSegmentsFile *fsf)
1135
1174
  frt_u64 frt_sis_read_current_version(FrtStore *store)
1136
1175
  {
1137
1176
  FindSegmentsFile fsf;
1138
- sis_find_segments_file(store, &fsf, &frt_sis_read_ver_i);
1177
+ sis_find_segments_file(store, &fsf, &frt_sis_read_ver_i, NULL);
1139
1178
  return fsf.ret.uint64;
1140
1179
  }
1141
1180
 
@@ -1145,17 +1184,17 @@ frt_u64 frt_sis_read_current_version(FrtStore *store)
1145
1184
  *
1146
1185
  ****************************************************************************/
1147
1186
 
1148
- static FrtLazyDocField *lazy_df_new(FrtSymbol name, const int size)
1149
- {
1187
+ static FrtLazyDocField *lazy_df_new(ID name, const int size, FrtCompressionType compression) {
1150
1188
  FrtLazyDocField *self = FRT_ALLOC(FrtLazyDocField);
1151
1189
  self->name = name;
1152
1190
  self->size = size;
1153
1191
  self->data = FRT_ALLOC_AND_ZERO_N(FrtLazyDocFieldData, size);
1192
+ self->compression = compression;
1193
+ self->decompressed = false;
1154
1194
  return self;
1155
1195
  }
1156
1196
 
1157
- static void lazy_df_destroy(FrtLazyDocField *self)
1158
- {
1197
+ static void lazy_df_destroy(FrtLazyDocField *self) {
1159
1198
  int i;
1160
1199
  for (i = self->size - 1; i >= 0; i--) {
1161
1200
  if (self->data[i].text) {
@@ -1166,25 +1205,246 @@ static void lazy_df_destroy(FrtLazyDocField *self)
1166
1205
  free(self);
1167
1206
  }
1168
1207
 
1169
- char *frt_lazy_df_get_data(FrtLazyDocField *self, int i)
1170
- {
1208
+ static void comp_raise(void) {
1209
+ FRT_RAISE(EXCEPTION, "Compression error");
1210
+ }
1211
+
1212
+ static char *is_read_brotli_compressed_bytes(FrtInStream *is, int compressed_len, int *len) {
1213
+ int buf_out_idx = 0;
1214
+ int read_len;
1215
+ frt_uchar buf_in[FRT_COMPRESSION_BUFFER_SIZE];
1216
+ const frt_uchar *next_in;
1217
+ size_t available_in;
1218
+ frt_uchar *buf_out = NULL;
1219
+ frt_uchar *next_out;
1220
+ size_t available_out;
1221
+
1222
+ BrotliDecoderState *b_state = BrotliDecoderCreateInstance(NULL, NULL, NULL);
1223
+ BrotliDecoderResult b_result = BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT;
1224
+ if (!b_state) { comp_raise(); return NULL; }
1225
+
1226
+ do {
1227
+ read_len = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
1228
+ frt_is_read_bytes(is, buf_in, read_len);
1229
+ compressed_len -= read_len;
1230
+ available_in = read_len;
1231
+ next_in = buf_in;
1232
+ available_out = FRT_COMPRESSION_BUFFER_SIZE;
1233
+ do {
1234
+ FRT_REALLOC_N(buf_out, frt_uchar, buf_out_idx + FRT_COMPRESSION_BUFFER_SIZE);
1235
+ next_out = buf_out + buf_out_idx;
1236
+ b_result = BrotliDecoderDecompressStream(b_state,
1237
+ &available_in, &next_in,
1238
+ &available_out, &next_out, NULL);
1239
+ if (b_result == BROTLI_DECODER_RESULT_ERROR) { comp_raise(); return NULL; }
1240
+ buf_out_idx += FRT_COMPRESSION_BUFFER_SIZE - available_out;
1241
+ } while (b_result == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT);
1242
+ } while (b_result != BROTLI_DECODER_RESULT_SUCCESS && compressed_len > 0);
1243
+
1244
+ BrotliDecoderDestroyInstance(b_state);
1245
+
1246
+ FRT_REALLOC_N(buf_out, frt_uchar, buf_out_idx + 1);
1247
+ buf_out[buf_out_idx] = '\0';
1248
+ *len = buf_out_idx;
1249
+ return (char *)buf_out;
1250
+ }
1251
+
1252
+ static void zraise(int ret) {
1253
+ switch (ret) {
1254
+ case BZ_IO_ERROR:
1255
+ if (ferror(stdin))
1256
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: error reading stdin");
1257
+ if (ferror(stdout))
1258
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: error writing stdout");
1259
+ break;
1260
+ case BZ_CONFIG_ERROR:
1261
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: system configuration error");
1262
+ break;
1263
+ case BZ_SEQUENCE_ERROR: /* shouldn't occur if code is correct */
1264
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: !!BUG!! sequence error");
1265
+ break;
1266
+ case BZ_PARAM_ERROR: /* shouldn't occur if code is correct */
1267
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: !!BUG!! parameter error");
1268
+ break;
1269
+ case BZ_MEM_ERROR:
1270
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: memory error");
1271
+ break;
1272
+ case BZ_DATA_ERROR:
1273
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: data integrity check error");
1274
+ break;
1275
+ case BZ_DATA_ERROR_MAGIC:
1276
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: data integrity check - non-matching magic");
1277
+ break;
1278
+ case BZ_UNEXPECTED_EOF:
1279
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: unexpected end-of-file");
1280
+ break;
1281
+ case BZ_OUTBUFF_FULL:
1282
+ FRT_RAISE(FRT_IO_ERROR, "bzlib: output buffer full");
1283
+ break;
1284
+ default:
1285
+ FRT_RAISE(FRT_EXCEPTION, "bzlib: unknown error");
1286
+ }
1287
+ }
1288
+
1289
+ static char *is_read_bz2_compressed_bytes(FrtInStream *is, int compressed_len, int *len) {
1290
+ int buf_out_idx = 0, ret, read_len;
1291
+ char *buf_out = NULL;
1292
+ char buf_in[FRT_COMPRESSION_BUFFER_SIZE];
1293
+ bz_stream zstrm;
1294
+ zstrm.bzalloc = NULL;
1295
+ zstrm.bzfree = NULL;
1296
+ zstrm.opaque = NULL;
1297
+ zstrm.next_in = NULL;
1298
+ zstrm.avail_in = 0;
1299
+ if ((ret = BZ2_bzDecompressInit(&zstrm, 0, 0)) != BZ_OK) zraise(ret);
1300
+
1301
+ do {
1302
+ read_len = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
1303
+ frt_is_read_bytes(is, (frt_uchar *)buf_in, read_len);
1304
+ compressed_len -= read_len;
1305
+ zstrm.avail_in = read_len;
1306
+ zstrm.next_in = buf_in;
1307
+ zstrm.avail_out = FRT_COMPRESSION_BUFFER_SIZE;
1308
+
1309
+ do {
1310
+ REALLOC_N(buf_out, char, buf_out_idx + FRT_COMPRESSION_BUFFER_SIZE);
1311
+ zstrm.next_out = buf_out + buf_out_idx;
1312
+ ret = BZ2_bzDecompress(&zstrm);
1313
+ assert(ret != BZ_SEQUENCE_ERROR); /* state not clobbered */
1314
+ if (ret != BZ_OK && ret != BZ_STREAM_END) {
1315
+ (void)BZ2_bzDecompressEnd(&zstrm);
1316
+ zraise(ret);
1317
+ }
1318
+ buf_out_idx += FRT_COMPRESSION_BUFFER_SIZE - zstrm.avail_out;
1319
+ } while (zstrm.avail_out == 0);
1320
+ } while (ret != BZ_STREAM_END && compressed_len != 0);
1321
+
1322
+ (void)BZ2_bzDecompressEnd(&zstrm);
1323
+
1324
+ FRT_REALLOC_N(buf_out, char, buf_out_idx + 1);
1325
+ buf_out[buf_out_idx] = '\0';
1326
+
1327
+ *len = buf_out_idx;
1328
+ return (char *)buf_out;
1329
+ }
1330
+
1331
+ static char *is_read_lz4_compressed_bytes(FrtInStream *is, int compressed_len, int *length) {
1332
+ frt_uchar buf_in[FRT_COMPRESSION_BUFFER_SIZE];
1333
+ char *buf_out = NULL;
1334
+ int dc_length = 0;
1335
+ LZ4F_dctx *dctx;
1336
+ LZ4F_frameInfo_t frame_info;
1337
+ LZ4F_errorCode_t dctx_status = LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION);
1338
+ if (LZ4F_isError(dctx_status)) { *length = -1; return NULL; }
1339
+
1340
+ /* header and buffer */
1341
+ int read_length = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
1342
+ frt_is_read_bytes(is, buf_in, read_length);
1343
+ compressed_len -= read_length;
1344
+
1345
+ size_t consumed_size = read_length;
1346
+ size_t res = LZ4F_getFrameInfo(dctx, &frame_info, buf_in, &consumed_size);
1347
+ if (LZ4F_isError(res)) { *length = -1; return NULL; }
1348
+ size_t buf_out_length;
1349
+ switch(frame_info.blockSizeID) {
1350
+ case LZ4F_default:
1351
+ case LZ4F_max64KB:
1352
+ buf_out_length = 1 << 16;
1353
+ break;
1354
+ case LZ4F_max256KB:
1355
+ buf_out_length = 1 << 18;
1356
+ break;
1357
+ case LZ4F_max1MB:
1358
+ buf_out_length = 1 << 20;
1359
+ break;
1360
+ case LZ4F_max4MB:
1361
+ buf_out_length = 1 << 22;
1362
+ break;
1363
+ default:
1364
+ buf_out_length = 0;
1365
+ }
1366
+
1367
+ res = 1;
1368
+ int first_chunk = 1;
1369
+
1370
+ /* decompress data */
1371
+ while (res != 0) {
1372
+ if (!first_chunk) {
1373
+ read_length = (compressed_len > FRT_COMPRESSION_BUFFER_SIZE) ? FRT_COMPRESSION_BUFFER_SIZE : compressed_len;
1374
+ frt_is_read_bytes(is, buf_in, read_length);
1375
+ compressed_len -= read_length;
1376
+ consumed_size = 0;
1377
+ }
1378
+ first_chunk = 0;
1379
+
1380
+ char *src = (char *)(buf_in + consumed_size);
1381
+ char *src_end = (char *)buf_in + read_length;
1382
+
1383
+ while (src < src_end && res != 0){
1384
+ size_t dest_length = buf_out_length;
1385
+ size_t consumed_size = read_length;
1386
+ FRT_REALLOC_N(buf_out, char, dc_length + buf_out_length);
1387
+ res = LZ4F_decompress(dctx, buf_out + dc_length, &dest_length, src, &consumed_size, NULL);
1388
+ if (LZ4F_isError(res)) { *length = -1; return NULL; }
1389
+ dc_length += dest_length;
1390
+ src = src + consumed_size;
1391
+ }
1392
+ }
1393
+
1394
+ /* finish up */
1395
+ LZ4F_freeDecompressionContext(dctx);
1396
+
1397
+ FRT_REALLOC_N(buf_out, char, dc_length + 1);
1398
+ buf_out[dc_length] = '\0';
1399
+
1400
+ *length = dc_length;
1401
+ return buf_out;
1402
+ }
1403
+
1404
+ static char *is_read_compressed_bytes(FrtInStream *is, int compressed_len, int *len, FrtCompressionType compression) {
1405
+ switch (compression) {
1406
+ case FRT_COMPRESSION_BROTLI:
1407
+ return is_read_brotli_compressed_bytes(is, compressed_len, len);
1408
+ case FRT_COMPRESSION_BZ2:
1409
+ return is_read_bz2_compressed_bytes(is, compressed_len, len);
1410
+ case FRT_COMPRESSION_LZ4:
1411
+ return is_read_lz4_compressed_bytes(is, compressed_len, len);
1412
+ default:
1413
+ return NULL;
1414
+ }
1415
+ }
1416
+
1417
+ char *frt_lazy_df_get_data(FrtLazyDocField *self, int i) {
1171
1418
  char *text = NULL;
1172
1419
  if (i < self->size && i >= 0) {
1173
1420
  text = self->data[i].text;
1174
1421
  if (NULL == text) {
1175
1422
  const int read_len = self->data[i].length + 1;
1176
1423
  frt_is_seek(self->doc->fields_in, self->data[i].start);
1177
- self->data[i].text = text = FRT_ALLOC_N(char, read_len);
1178
- frt_is_read_bytes(self->doc->fields_in, (frt_uchar *)text, read_len);
1179
- text[read_len - 1] = '\0';
1424
+ if (self->data[i].compression != FRT_COMPRESSION_NONE) {
1425
+ self->data[i].text = text = is_read_compressed_bytes(self->doc->fields_in, read_len, &(self->data[i].length), self->data[i].compression);
1426
+ } else {
1427
+ self->data[i].text = text = FRT_ALLOC_N(char, read_len);
1428
+ frt_is_read_bytes(self->doc->fields_in, (frt_uchar *)text, read_len);
1429
+ text[read_len - 1] = '\0';
1430
+ }
1180
1431
  }
1181
1432
  }
1182
1433
 
1183
1434
  return text;
1184
1435
  }
1185
1436
 
1186
- void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len)
1187
- {
1437
+ void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len) {
1438
+ if (self->compression != FRT_COMPRESSION_NONE && !self->decompressed) {
1439
+ int i;
1440
+ self->len = 0;
1441
+ for (i = self->size-1; i >= 0; i--) {
1442
+ (void)frt_lazy_df_get_data(self, i);
1443
+ self->len += self->data[i].length + 1;
1444
+ }
1445
+ self->len--; /* each field separated by ' ' but no need to add to end */
1446
+ self->decompressed = true;
1447
+ }
1188
1448
  if (start < 0 || start >= self->len) {
1189
1449
  FRT_RAISE(FRT_IO_ERROR, "start out of range in LazyDocField#get_bytes. %d "
1190
1450
  "is not between 0 and %d", start, self->len);
@@ -1196,7 +1456,33 @@ void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len)
1196
1456
  FRT_RAISE(FRT_IO_ERROR, "Tried to read past end of field. Field is only %d "
1197
1457
  "bytes long but tried to read to %d", self->len, start + len);
1198
1458
  }
1199
- else {
1459
+ if (self->compression != FRT_COMPRESSION_NONE) {
1460
+ int cur_start = 0, buf_start = 0, cur_end, i, copy_start, copy_len;
1461
+ for (i = 0; i < self->size; i++) {
1462
+ cur_end = cur_start + self->data[i].length;
1463
+ if (start < cur_end) {
1464
+ copy_start = start > cur_start ? start - cur_start : 0;
1465
+ copy_len = cur_end - cur_start - copy_start;
1466
+ if (copy_len >= len) {
1467
+ copy_len = len;
1468
+ len = 0;
1469
+ }
1470
+ else {
1471
+ len -= copy_len;
1472
+ }
1473
+ memcpy(buf + buf_start,
1474
+ self->data[i].text + copy_start,
1475
+ copy_len);
1476
+ buf_start += copy_len;
1477
+ if (len > 0) {
1478
+ buf[buf_start++] = ' ';
1479
+ len--;
1480
+ }
1481
+ if (len == 0) break;
1482
+ }
1483
+ cur_start = cur_end + 1;
1484
+ }
1485
+ } else {
1200
1486
  frt_is_seek(self->doc->fields_in, self->data[0].start + start);
1201
1487
  frt_is_read_bytes(self->doc->fields_in, (frt_uchar *)buf, len);
1202
1488
  }
@@ -1234,21 +1520,17 @@ static void lazy_doc_add_field(FrtLazyDoc *self, FrtLazyDocField *lazy_df, int i
1234
1520
  lazy_df->doc = self;
1235
1521
  }
1236
1522
 
1237
- FrtLazyDocField *frt_lazy_doc_get(FrtLazyDoc *self, FrtSymbol field)
1238
- {
1523
+ FrtLazyDocField *frt_lazy_doc_get(FrtLazyDoc *self, ID field) {
1239
1524
  return (FrtLazyDocField *)frt_h_get(self->field_dictionary, (void *)field);
1240
1525
  }
1241
1526
 
1242
1527
  /****************************************************************************
1243
- *
1244
1528
  * FrtFieldsReader
1245
- *
1246
1529
  ****************************************************************************/
1247
1530
 
1248
1531
  #define FIELDS_IDX_PTR_SIZE 12
1249
1532
 
1250
- FrtFieldsReader *frt_fr_open(FrtStore *store, const char *segment, FrtFieldInfos *fis)
1251
- {
1533
+ FrtFieldsReader *frt_fr_open(FrtStore *store, const char *segment, FrtFieldInfos *fis) {
1252
1534
  FrtFieldsReader *fr = FRT_ALLOC(FrtFieldsReader);
1253
1535
  FrtInStream *fdx_in;
1254
1536
  char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
@@ -1268,8 +1550,7 @@ FrtFieldsReader *frt_fr_open(FrtStore *store, const char *segment, FrtFieldInfos
1268
1550
  return fr;
1269
1551
  }
1270
1552
 
1271
- FrtFieldsReader *frt_fr_clone(FrtFieldsReader *orig)
1272
- {
1553
+ FrtFieldsReader *frt_fr_clone(FrtFieldsReader *orig) {
1273
1554
  FrtFieldsReader *fr = FRT_ALLOC(FrtFieldsReader);
1274
1555
 
1275
1556
  memcpy(fr, orig, sizeof(FrtFieldsReader));
@@ -1279,25 +1560,36 @@ FrtFieldsReader *frt_fr_clone(FrtFieldsReader *orig)
1279
1560
  return fr;
1280
1561
  }
1281
1562
 
1282
- void frt_fr_close(FrtFieldsReader *fr)
1283
- {
1563
+ void frt_fr_close(FrtFieldsReader *fr) {
1284
1564
  frt_is_close(fr->fdt_in);
1285
1565
  frt_is_close(fr->fdx_in);
1286
1566
  free(fr);
1287
1567
  }
1288
1568
 
1289
- static FrtDocField *frt_fr_df_new(FrtSymbol name, int size)
1290
- {
1569
+ static FrtDocField *frt_fr_df_new(ID name, int size, FrtCompressionType compression) {
1291
1570
  FrtDocField *df = FRT_ALLOC(FrtDocField);
1292
1571
  df->name = name;
1293
1572
  df->capa = df->size = size;
1294
1573
  df->data = FRT_ALLOC_N(char *, df->capa);
1295
1574
  df->lengths = FRT_ALLOC_N(int, df->capa);
1575
+ df->encodings = FRT_ALLOC_N(rb_encoding *, df->capa);
1296
1576
  df->destroy_data = true;
1297
1577
  df->boost = 1.0f;
1578
+ df->compression = compression;
1298
1579
  return df;
1299
1580
  }
1300
1581
 
1582
+ static void frt_fr_read_compressed_fields(FrtFieldsReader *fr, FrtDocField *df, FrtCompressionType compression) {
1583
+ int i;
1584
+ const int df_size = df->size;
1585
+ FrtInStream *fdt_in = fr->fdt_in;
1586
+
1587
+ for (i = 0; i < df_size; i++) {
1588
+ const int compressed_len = df->lengths[i] + 1;
1589
+ df->data[i] = is_read_compressed_bytes(fdt_in, compressed_len, &(df->lengths[i]), compression);
1590
+ }
1591
+ }
1592
+
1301
1593
  FrtDocument *frt_fr_get_doc(FrtFieldsReader *fr, int doc_num)
1302
1594
  {
1303
1595
  int i, j;
@@ -1316,22 +1608,28 @@ FrtDocument *frt_fr_get_doc(FrtFieldsReader *fr, int doc_num)
1316
1608
  const int field_num = frt_is_read_vint(fdt_in);
1317
1609
  FrtFieldInfo *fi = fr->fis->fields[field_num];
1318
1610
  const int df_size = frt_is_read_vint(fdt_in);
1319
- FrtDocField *df = frt_fr_df_new(fi->name, df_size);
1611
+ FrtDocField *df = frt_fr_df_new(fi->name, df_size, frt_fi_get_compression(fi));
1320
1612
 
1321
1613
  for (j = 0; j < df_size; j++) {
1322
1614
  df->lengths[j] = frt_is_read_vint(fdt_in);
1615
+ df->encodings[j] = rb_enc_from_index(frt_is_read_vint(fdt_in));
1616
+ df->compression = frt_is_read_vint(fdt_in);
1323
1617
  }
1324
1618
 
1325
1619
  frt_doc_add_field(doc, df);
1326
1620
  }
1327
1621
  for (i = 0; i < stored_cnt; i++) {
1328
1622
  FrtDocField *df = doc->fields[i];
1329
- const int df_size = df->size;
1330
- for (j = 0; j < df_size; j++) {
1331
- const int read_len = df->lengths[j] + 1;
1332
- df->data[j] = FRT_ALLOC_N(char, read_len);
1333
- frt_is_read_bytes(fdt_in, (frt_uchar *)df->data[j], read_len);
1334
- df->data[j][read_len - 1] = '\0';
1623
+ if (df->compression != FRT_COMPRESSION_NONE) {
1624
+ frt_fr_read_compressed_fields(fr, df, df->compression);
1625
+ } else {
1626
+ const int df_size = df->size;
1627
+ for (j = 0; j < df_size; j++) {
1628
+ const int read_len = df->lengths[j] + 1;
1629
+ df->data[j] = FRT_ALLOC_N(char, read_len);
1630
+ frt_is_read_bytes(fdt_in, (frt_uchar *)df->data[j], read_len);
1631
+ df->data[j][read_len - 1] = '\0';
1632
+ }
1335
1633
  }
1336
1634
  }
1337
1635
 
@@ -1347,31 +1645,37 @@ FrtLazyDoc *frt_fr_get_lazy_doc(FrtFieldsReader *fr, int doc_num)
1347
1645
  FrtLazyDoc *lazy_doc;
1348
1646
  FrtInStream *fdx_in = fr->fdx_in;
1349
1647
  FrtInStream *fdt_in = fr->fdt_in;
1648
+
1350
1649
  frt_is_seek(fdx_in, doc_num * FIELDS_IDX_PTR_SIZE);
1351
1650
  pos = (off_t)frt_is_read_u64(fdx_in);
1352
1651
  frt_is_seek(fdt_in, pos);
1353
1652
  stored_cnt = frt_is_read_vint(fdt_in);
1653
+
1354
1654
  lazy_doc = lazy_doc_new(stored_cnt, fdt_in);
1355
1655
  for (i = 0; i < stored_cnt; i++) {
1356
1656
  FrtFieldInfo *fi = fr->fis->fields[frt_is_read_vint(fdt_in)];
1357
- const int data_cnt = frt_is_read_vint(fdt_in);
1358
- FrtLazyDocField *lazy_df = lazy_df_new(fi->name, data_cnt);
1657
+ const int df_size = frt_is_read_vint(fdt_in);
1658
+ FrtLazyDocField *lazy_df = lazy_df_new(fi->name, df_size, frt_fi_get_compression(fi));
1359
1659
  const int field_start = start;
1360
1660
  /* get the starts relative positions this time around */
1361
- for (j = 0; j < data_cnt; j++) {
1661
+
1662
+ for (j = 0; j < df_size; j++) {
1362
1663
  lazy_df->data[j].start = start;
1363
1664
  start += 1 + (lazy_df->data[j].length = frt_is_read_vint(fdt_in));
1665
+ lazy_df->data[j].encoding = rb_enc_from_index(frt_is_read_vint(fdt_in));
1666
+ lazy_df->data[j].compression = frt_is_read_vint(fdt_in);
1364
1667
  }
1668
+
1365
1669
  lazy_df->len = start - field_start - 1;
1366
1670
  lazy_doc_add_field(lazy_doc, lazy_df, i);
1367
1671
  }
1368
1672
  /* correct the starts to their correct absolute positions */
1673
+ const off_t abs_start = frt_is_pos(fdt_in);
1369
1674
  for (i = 0; i < stored_cnt; i++) {
1370
1675
  FrtLazyDocField *lazy_df = lazy_doc->fields[i];
1371
- const int data_cnt = lazy_df->size;
1372
- const off_t start = frt_is_pos(fdt_in);
1373
- for (j = 0; j < data_cnt; j++) {
1374
- lazy_df->data[j].start += start;
1676
+ const int df_size = lazy_df->size;
1677
+ for (j = 0; j < df_size; j++) {
1678
+ lazy_df->data[j].start += abs_start;
1375
1679
  }
1376
1680
  }
1377
1681
 
@@ -1549,11 +1853,150 @@ void frt_fw_close(FrtFieldsWriter *fw)
1549
1853
  free(fw);
1550
1854
  }
1551
1855
 
1552
- void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc)
1553
- {
1856
+ static int frt_os_write_brotli_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length) {
1857
+ size_t compressed_length = 0;
1858
+ const frt_uchar *next_in = data;
1859
+ size_t available_in = length;
1860
+ size_t available_out;
1861
+ frt_uchar compression_buffer[FRT_COMPRESSION_BUFFER_SIZE];
1862
+ frt_uchar *next_out;
1863
+ BrotliEncoderState *b_state = BrotliEncoderCreateInstance(NULL, NULL, NULL);
1864
+ if (!b_state) { comp_raise(); return -1; }
1865
+
1866
+ BrotliEncoderSetParameter(b_state, BROTLI_PARAM_QUALITY, FRT_BROTLI_COMPRESSION_LEVEL);
1867
+
1868
+ do {
1869
+ available_out = FRT_COMPRESSION_BUFFER_SIZE;
1870
+ next_out = compression_buffer;
1871
+ if (!BrotliEncoderCompressStream(b_state, BROTLI_OPERATION_FINISH,
1872
+ &available_in, &next_in,
1873
+ &available_out, &next_out, &compressed_length)) {
1874
+ BrotliEncoderDestroyInstance(b_state);
1875
+ comp_raise();
1876
+ return -1;
1877
+ }
1878
+ frt_os_write_bytes(out_stream, compression_buffer, FRT_COMPRESSION_BUFFER_SIZE - available_out);
1879
+ } while (!BrotliEncoderIsFinished(b_state));
1880
+
1881
+ BrotliEncoderDestroyInstance(b_state);
1882
+
1883
+ return (int)compressed_length;
1884
+ }
1885
+
1886
+ static int frt_os_write_bz2_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length) {
1887
+ int ret, buf_size, compressed_len = 0;
1888
+ char out_buffer[FRT_COMPRESSION_BUFFER_SIZE];
1889
+ bz_stream zstrm;
1890
+ zstrm.bzalloc = NULL;
1891
+ zstrm.bzfree = NULL;
1892
+ zstrm.opaque = NULL;
1893
+ if ((ret = BZ2_bzCompressInit(&zstrm, FRT_BZIP_COMPRESSION_LEVEL, 0, 0)) != BZ_OK) zraise(ret);
1894
+
1895
+ zstrm.avail_in = length;
1896
+ zstrm.next_in = (char *)data;
1897
+ zstrm.avail_out = FRT_COMPRESSION_BUFFER_SIZE;
1898
+ zstrm.next_out = out_buffer;
1899
+
1900
+ do {
1901
+ ret = BZ2_bzCompress(&zstrm, BZ_FINISH); /* no bad return value */
1902
+ assert(ret != BZ_SEQUENCE_ERROR); /* state not clobbered */
1903
+ compressed_len += buf_size = FRT_COMPRESSION_BUFFER_SIZE - zstrm.avail_out;
1904
+ frt_os_write_bytes(out_stream, (frt_uchar *)out_buffer, buf_size);
1905
+ } while (zstrm.avail_out == 0);
1906
+ assert(zstrm.avail_in == 0); /* all input will be used */
1907
+
1908
+ (void)BZ2_bzCompressEnd(&zstrm);
1909
+ return compressed_len;
1910
+ }
1911
+
1912
+ static const LZ4F_preferences_t lz4_prefs = {
1913
+ {
1914
+ LZ4F_default,
1915
+ LZ4F_blockLinked,
1916
+ LZ4F_noContentChecksum,
1917
+ LZ4F_frame,
1918
+ 0, /* unknown content size */
1919
+ 0, /* no dictID */
1920
+ LZ4F_noBlockChecksum
1921
+ },
1922
+ 0,
1923
+ 1,
1924
+ 1,
1925
+ {0,0,0}
1926
+ };
1927
+
1928
+ static int frt_os_write_lz4_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length) {
1929
+ int compressed_length = 0;
1930
+ int remaining_length = length;
1931
+ size_t ccmp_length = 0;
1932
+ LZ4F_compressionContext_t ctx;
1933
+ size_t out_buf_length = LZ4F_compressBound(FRT_COMPRESSION_BUFFER_SIZE, &lz4_prefs);
1934
+ frt_uchar *out_buf = frt_ecalloc(out_buf_length);
1935
+
1936
+ size_t ctx_creation = LZ4F_createCompressionContext(&ctx, LZ4F_VERSION);
1937
+ if (LZ4F_isError(ctx_creation)) {
1938
+ compressed_length = -1;
1939
+ goto finish;
1940
+ }
1941
+
1942
+ /* create header */
1943
+ ccmp_length = LZ4F_compressBegin(ctx, out_buf, out_buf_length, &lz4_prefs);
1944
+ if (LZ4F_isError(ccmp_length)) {
1945
+ compressed_length = -1;
1946
+ goto finish;
1947
+ }
1948
+ compressed_length = ccmp_length;
1949
+ frt_os_write_bytes(out_stream, out_buf, ccmp_length);
1950
+
1951
+ /* compress data */
1952
+ do {
1953
+ int read_length = (FRT_COMPRESSION_BUFFER_SIZE > remaining_length) ? remaining_length : FRT_COMPRESSION_BUFFER_SIZE;
1954
+ ccmp_length = LZ4F_compressUpdate(ctx, out_buf, out_buf_length, data + (length - remaining_length), read_length, NULL);
1955
+ if (LZ4F_isError(ccmp_length)) {
1956
+ compressed_length = -1;
1957
+ goto finish;
1958
+ }
1959
+ frt_os_write_bytes(out_stream, out_buf, ccmp_length);
1960
+ compressed_length += ccmp_length;
1961
+ remaining_length -= read_length;
1962
+ } while (remaining_length > 0);
1963
+
1964
+ /* finish up */
1965
+ ccmp_length = LZ4F_compressEnd(ctx, out_buf, out_buf_length, NULL);
1966
+ if (LZ4F_isError(ccmp_length)) {
1967
+ compressed_length = -1;
1968
+ goto finish;
1969
+ }
1970
+
1971
+ frt_os_write_bytes(out_stream, out_buf, ccmp_length);
1972
+ compressed_length += ccmp_length;
1973
+
1974
+ finish:
1975
+ LZ4F_freeCompressionContext(ctx);
1976
+ free(out_buf);
1977
+
1978
+ return compressed_length;
1979
+ }
1980
+
1981
+ static int frt_os_write_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length, FrtCompressionType compression) {
1982
+ switch (compression) {
1983
+ case FRT_COMPRESSION_BROTLI:
1984
+ return frt_os_write_brotli_compressed_bytes(out_stream, data, length);
1985
+ case FRT_COMPRESSION_BZ2:
1986
+ return frt_os_write_bz2_compressed_bytes(out_stream, data, length);
1987
+ case FRT_COMPRESSION_LZ4:
1988
+ return frt_os_write_lz4_compressed_bytes(out_stream, data, length);
1989
+ default:
1990
+ return -1;
1991
+ }
1992
+
1993
+ }
1994
+
1995
+ void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc) {
1554
1996
  int i, j, stored_cnt = 0;
1555
1997
  FrtDocField *df;
1556
1998
  FrtFieldInfo *fi;
1999
+ FrtCompressionType compression;
1557
2000
  FrtOutStream *fdt_out = fw->fdt_out, *fdx_out = fw->fdx_out;
1558
2001
  const int doc_size = doc->size;
1559
2002
 
@@ -1577,13 +2020,26 @@ void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc)
1577
2020
  const int df_size = df->size;
1578
2021
  frt_os_write_vint(fdt_out, fi->number);
1579
2022
  frt_os_write_vint(fdt_out, df_size);
1580
- for (j = 0; j < df_size; j++) {
1581
- const int length = df->lengths[j];
1582
- frt_os_write_vint(fdt_out, length);
1583
- frt_os_write_bytes(fw->buffer, (frt_uchar*)df->data[j], length);
1584
- /* leave a space between fields as that is how they are
1585
- * analyzed */
1586
- frt_os_write_byte(fw->buffer, ' ');
2023
+
2024
+ if (fi_is_compressed(fi)) {
2025
+ compression = frt_fi_get_compression(fi);
2026
+ for (j = 0; j < df_size; j++) {
2027
+ const int length = df->lengths[j];
2028
+ int compressed_len = frt_os_write_compressed_bytes(fw->buffer, (frt_uchar*)df->data[j], length, compression);
2029
+ frt_os_write_vint(fdt_out, compressed_len - 1);
2030
+ frt_os_write_vint(fdt_out, rb_enc_to_index(df->encodings[j]));
2031
+ frt_os_write_vint(fdt_out, compression);
2032
+ }
2033
+ } else {
2034
+ for (j = 0; j < df_size; j++) {
2035
+ const int length = df->lengths[j];
2036
+ frt_os_write_vint(fdt_out, length);
2037
+ frt_os_write_vint(fdt_out, rb_enc_to_index(df->encodings[j]));
2038
+ frt_os_write_vint(fdt_out, FRT_COMPRESSION_NONE);
2039
+ frt_os_write_bytes(fw->buffer, (frt_uchar*)df->data[j], length);
2040
+ /* leave a space between fields as that is how they are analyzed */
2041
+ frt_os_write_byte(fw->buffer, ' ');
2042
+ }
1587
2043
  }
1588
2044
  }
1589
2045
  }
@@ -1938,8 +2394,7 @@ static char *ste_scan_to(FrtTermEnum *te, const char *term)
1938
2394
  }
1939
2395
  }
1940
2396
 
1941
- static FrtSegmentTermEnum *ste_allocate()
1942
- {
2397
+ static FrtSegmentTermEnum *ste_allocate(void) {
1943
2398
  FrtSegmentTermEnum *ste = FRT_ALLOC_AND_ZERO(FrtSegmentTermEnum);
1944
2399
 
1945
2400
  TE(ste)->next = &ste_next;
@@ -1964,7 +2419,6 @@ void frt_ste_close(FrtTermEnum *te)
1964
2419
  free(te);
1965
2420
  }
1966
2421
 
1967
-
1968
2422
  static char *frt_ste_get_term(FrtTermEnum *te, int pos)
1969
2423
  {
1970
2424
  FrtSegmentTermEnum *ste = STE(te);
@@ -2079,9 +2533,7 @@ static void tew_destroy(TermEnumWrapper *tew)
2079
2533
  tew->te->close(tew->te);
2080
2534
  }
2081
2535
 
2082
- static TermEnumWrapper *tew_setup(TermEnumWrapper *tew, int index, FrtTermEnum *te,
2083
- FrtIndexReader *ir)
2084
- {
2536
+ static TermEnumWrapper *tew_setup(TermEnumWrapper *tew, int index, FrtTermEnum *te, FrtIndexReader *ir) {
2085
2537
  tew->index = index;
2086
2538
  tew->ir = ir;
2087
2539
  tew->te = te;
@@ -2090,9 +2542,7 @@ static TermEnumWrapper *tew_setup(TermEnumWrapper *tew, int index, FrtTermEnum *
2090
2542
  return tew;
2091
2543
  }
2092
2544
 
2093
-
2094
- static char *mte_next(FrtTermEnum *te)
2095
- {
2545
+ static char *mte_next(FrtTermEnum *te) {
2096
2546
  TermEnumWrapper *top =
2097
2547
  (TermEnumWrapper *)frt_pq_top(MTE(te)->tew_queue);
2098
2548
 
@@ -2122,8 +2572,7 @@ static char *mte_next(FrtTermEnum *te)
2122
2572
  return te->curr_term;
2123
2573
  }
2124
2574
 
2125
- static FrtTermEnum *mte_set_field(FrtTermEnum *te, int field_num)
2126
- {
2575
+ static FrtTermEnum *mte_set_field(FrtTermEnum *te, int field_num) {
2127
2576
  MultiTermEnum *mte = MTE(te);
2128
2577
  int i;
2129
2578
  const int size = mte->size;
@@ -2151,8 +2600,7 @@ static FrtTermEnum *mte_set_field(FrtTermEnum *te, int field_num)
2151
2600
  return te;
2152
2601
  }
2153
2602
 
2154
- static char *mte_skip_to(FrtTermEnum *te, const char *term)
2155
- {
2603
+ static char *mte_skip_to(FrtTermEnum *te, const char *term) {
2156
2604
  MultiTermEnum *mte = MTE(te);
2157
2605
  int i;
2158
2606
  const int size = mte->size;
@@ -2168,8 +2616,7 @@ static char *mte_skip_to(FrtTermEnum *te, const char *term)
2168
2616
  return mte_next(te);
2169
2617
  }
2170
2618
 
2171
- static void mte_close(FrtTermEnum *te)
2172
- {
2619
+ static void mte_close(FrtTermEnum *te) {
2173
2620
  int i;
2174
2621
  const int size = MTE(te)->size;
2175
2622
  for (i = 0; i < size; i++) {
@@ -2182,10 +2629,9 @@ static void mte_close(FrtTermEnum *te)
2182
2629
  free(te);
2183
2630
  }
2184
2631
 
2185
- FrtTermEnum *frt_mte_new(FrtMultiReader *mr, int field_num, const char *term)
2186
- {
2187
- FrtIndexReader **readers = mr->sub_readers;
2188
- int r_cnt = mr->r_cnt;
2632
+ FrtTermEnum *frt_mte_new(FrtMultiReader *mr, int field_num, const char *term) {
2633
+ FrtIndexReader **readers = mr->sub_readers;
2634
+ int r_cnt = mr->r_cnt;
2189
2635
  int i;
2190
2636
  FrtIndexReader *reader;
2191
2637
  MultiTermEnum *mte = FRT_ALLOC_AND_ZERO(MultiTermEnum);
@@ -2213,8 +2659,7 @@ FrtTermEnum *frt_mte_new(FrtMultiReader *mr, int field_num, const char *term)
2213
2659
 
2214
2660
  if (NULL != term) {
2215
2661
  sub_te = reader->terms_from(reader, fnum, term);
2216
- }
2217
- else {
2662
+ } else {
2218
2663
  sub_te = reader->terms(reader, fnum);
2219
2664
  }
2220
2665
 
@@ -2223,8 +2668,7 @@ FrtTermEnum *frt_mte_new(FrtMultiReader *mr, int field_num, const char *term)
2223
2668
  || (tew->term && (tew->term[0] != '\0'))) {
2224
2669
  frt_pq_push(mte->tew_queue, tew); /* initialize queue */
2225
2670
  }
2226
- }
2227
- else {
2671
+ } else {
2228
2672
  /* add the term_enum_wrapper just in case */
2229
2673
  sub_te = reader->terms(reader, 0);
2230
2674
  sub_te->field_num = -1;
@@ -2246,9 +2690,7 @@ FrtTermEnum *frt_mte_new(FrtMultiReader *mr, int field_num, const char *term)
2246
2690
  *
2247
2691
  ****************************************************************************/
2248
2692
 
2249
- FrtTermInfosReader *frt_tir_open(FrtStore *store,
2250
- FrtSegmentFieldIndex *sfi, const char *segment)
2251
- {
2693
+ FrtTermInfosReader *frt_tir_open(FrtStore *store, FrtSegmentFieldIndex *sfi, const char *segment) {
2252
2694
  FrtTermInfosReader *tir = FRT_ALLOC(FrtTermInfosReader);
2253
2695
  char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
2254
2696
 
@@ -2261,8 +2703,7 @@ FrtTermInfosReader *frt_tir_open(FrtStore *store,
2261
2703
  return tir;
2262
2704
  }
2263
2705
 
2264
- static FrtTermEnum *tir_enum(FrtTermInfosReader *tir)
2265
- {
2706
+ static FrtTermEnum *tir_enum(FrtTermInfosReader *tir) {
2266
2707
  FrtTermEnum *te;
2267
2708
  if (NULL == (te = (FrtTermEnum *)frt_thread_getspecific(tir->thread_te))) {
2268
2709
  te = frt_ste_clone(tir->orig_te);
@@ -2273,8 +2714,7 @@ static FrtTermEnum *tir_enum(FrtTermInfosReader *tir)
2273
2714
  return te;
2274
2715
  }
2275
2716
 
2276
- FrtTermInfosReader *frt_tir_set_field(FrtTermInfosReader *tir, int field_num)
2277
- {
2717
+ FrtTermInfosReader *frt_tir_set_field(FrtTermInfosReader *tir, int field_num) {
2278
2718
  if (field_num != tir->field_num) {
2279
2719
  ste_set_field(tir_enum(tir), field_num);
2280
2720
  tir->field_num = field_num;
@@ -2282,8 +2722,7 @@ FrtTermInfosReader *frt_tir_set_field(FrtTermInfosReader *tir, int field_num)
2282
2722
  return tir;
2283
2723
  }
2284
2724
 
2285
- FrtTermInfo *frt_tir_get_ti(FrtTermInfosReader *tir, const char *term)
2286
- {
2725
+ FrtTermInfo *frt_tir_get_ti(FrtTermInfosReader *tir, const char *term) {
2287
2726
  FrtTermEnum *te = tir_enum(tir);
2288
2727
  char *match;
2289
2728
 
@@ -2294,9 +2733,7 @@ FrtTermInfo *frt_tir_get_ti(FrtTermInfosReader *tir, const char *term)
2294
2733
  return NULL;
2295
2734
  }
2296
2735
 
2297
- static FrtTermInfo *tir_get_ti_field(FrtTermInfosReader *tir, int field_num,
2298
- const char *term)
2299
- {
2736
+ static FrtTermInfo *tir_get_ti_field(FrtTermInfosReader *tir, int field_num, const char *term) {
2300
2737
  FrtTermEnum *te = tir_enum(tir);
2301
2738
  char *match;
2302
2739
 
@@ -2312,19 +2749,16 @@ static FrtTermInfo *tir_get_ti_field(FrtTermInfosReader *tir, int field_num,
2312
2749
  return NULL;
2313
2750
  }
2314
2751
 
2315
- char *frt_tir_get_term(FrtTermInfosReader *tir, int pos)
2316
- {
2752
+ char *frt_tir_get_term(FrtTermInfosReader *tir, int pos) {
2317
2753
  if (pos < 0) {
2318
2754
  return NULL;
2319
- }
2320
- else {
2755
+ } else {
2321
2756
  return frt_ste_get_term(tir_enum(tir), pos);
2322
2757
  }
2323
2758
  }
2324
2759
 
2325
2760
 
2326
- void frt_tir_close(FrtTermInfosReader *tir)
2327
- {
2761
+ void frt_tir_close(FrtTermInfosReader *tir) {
2328
2762
  frt_ary_destroy(tir->te_bucket, (frt_free_ft)&frt_ste_close);
2329
2763
  frt_ste_close(tir->orig_te);
2330
2764
 
@@ -2341,25 +2775,19 @@ void frt_tir_close(FrtTermInfosReader *tir)
2341
2775
  *
2342
2776
  ****************************************************************************/
2343
2777
 
2344
- static FrtTermWriter *tw_new(FrtStore *store, char *file_name)
2345
- {
2778
+ static FrtTermWriter *tw_new(FrtStore *store, char *file_name) {
2346
2779
  FrtTermWriter *tw = FRT_ALLOC_AND_ZERO(FrtTermWriter);
2347
2780
  tw->os = store->new_output(store, file_name);
2348
2781
  tw->last_term = FRT_EMPTY_STRING;
2349
2782
  return tw;
2350
2783
  }
2351
2784
 
2352
- static void tw_close(FrtTermWriter *tw)
2353
- {
2785
+ static void tw_close(FrtTermWriter *tw) {
2354
2786
  frt_os_close(tw->os);
2355
2787
  free(tw);
2356
2788
  }
2357
2789
 
2358
- FrtTermInfosWriter *frt_tiw_open(FrtStore *store,
2359
- const char *segment,
2360
- int index_interval,
2361
- int skip_interval)
2362
- {
2790
+ FrtTermInfosWriter *frt_tiw_open(FrtStore *store, const char *segment, int index_interval, int skip_interval) {
2363
2791
  char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
2364
2792
  FrtTermInfosWriter *tiw = FRT_ALLOC(FrtTermInfosWriter);
2365
2793
  size_t segment_len = strlen(segment);
@@ -2388,11 +2816,7 @@ FrtTermInfosWriter *frt_tiw_open(FrtStore *store,
2388
2816
  return tiw;
2389
2817
  }
2390
2818
 
2391
- static void tw_write_term(FrtTermWriter *tw,
2392
- FrtOutStream *os,
2393
- const char *term,
2394
- int term_len)
2395
- {
2819
+ static void tw_write_term(FrtTermWriter *tw, FrtOutStream *os, const char *term, int term_len) {
2396
2820
  int start = frt_hlp_string_diff(tw->last_term, term);
2397
2821
  int length = term_len - start;
2398
2822
 
@@ -2403,12 +2827,7 @@ static void tw_write_term(FrtTermWriter *tw,
2403
2827
  tw->last_term = term;
2404
2828
  }
2405
2829
 
2406
- static void tw_add(FrtTermWriter *tw,
2407
- const char *term,
2408
- int term_len,
2409
- FrtTermInfo *ti,
2410
- int skip_interval)
2411
- {
2830
+ static void tw_add(FrtTermWriter *tw, const char *term, int term_len, FrtTermInfo *ti, int skip_interval) {
2412
2831
  FrtOutStream *os = tw->os;
2413
2832
 
2414
2833
  #ifdef DEBUG
@@ -2438,11 +2857,7 @@ static void tw_add(FrtTermWriter *tw,
2438
2857
  tw->counter++;
2439
2858
  }
2440
2859
 
2441
- void frt_tiw_add(FrtTermInfosWriter *tiw,
2442
- const char *term,
2443
- int term_len,
2444
- FrtTermInfo *ti)
2445
- {
2860
+ void frt_tiw_add(FrtTermInfosWriter *tiw, const char *term, int term_len, FrtTermInfo *ti) {
2446
2861
  off_t tis_pos;
2447
2862
 
2448
2863
  if (0 == (tiw->tis_writer->counter % tiw->index_interval)) {
@@ -2460,15 +2875,13 @@ void frt_tiw_add(FrtTermInfosWriter *tiw,
2460
2875
  tw_add(tiw->tis_writer, term, term_len, ti, tiw->skip_interval);
2461
2876
  }
2462
2877
 
2463
- static void tw_reset(FrtTermWriter *tw)
2464
- {
2878
+ static void tw_reset(FrtTermWriter *tw) {
2465
2879
  tw->counter = 0;
2466
2880
  tw->last_term = FRT_EMPTY_STRING;
2467
2881
  FRT_ZEROSET(&(tw->last_term_info), FrtTermInfo);
2468
2882
  }
2469
2883
 
2470
- void frt_tiw_start_field(FrtTermInfosWriter *tiw, int field_num)
2471
- {
2884
+ void frt_tiw_start_field(FrtTermInfosWriter *tiw, int field_num) {
2472
2885
  FrtOutStream *tfx_out = tiw->tfx_out;
2473
2886
  frt_os_write_vint(tfx_out, tiw->tix_writer->counter); /* write tix size */
2474
2887
  frt_os_write_vint(tfx_out, tiw->tis_writer->counter); /* write tis size */
@@ -2481,8 +2894,7 @@ void frt_tiw_start_field(FrtTermInfosWriter *tiw, int field_num)
2481
2894
  tiw->field_count++;
2482
2895
  }
2483
2896
 
2484
- void frt_tiw_close(FrtTermInfosWriter *tiw)
2485
- {
2897
+ void frt_tiw_close(FrtTermInfosWriter *tiw) {
2486
2898
  FrtOutStream *tfx_out = tiw->tfx_out;
2487
2899
  frt_os_write_vint(tfx_out, tiw->tix_writer->counter);
2488
2900
  frt_os_write_vint(tfx_out, tiw->tis_writer->counter);
@@ -2516,8 +2928,7 @@ void frt_tiw_close(FrtTermInfosWriter *tiw)
2516
2928
  }\
2517
2929
  } while (0)
2518
2930
 
2519
- static void stde_seek_ti(FrtSegmentTermDocEnum *stde, FrtTermInfo *ti)
2520
- {
2931
+ static void stde_seek_ti(FrtSegmentTermDocEnum *stde, FrtTermInfo *ti) {
2521
2932
  if (NULL == ti) {
2522
2933
  stde->doc_freq = 0;
2523
2934
  } else {
@@ -2535,14 +2946,12 @@ static void stde_seek_ti(FrtSegmentTermDocEnum *stde, FrtTermInfo *ti)
2535
2946
  }
2536
2947
  }
2537
2948
 
2538
- static void stde_seek(FrtTermDocEnum *tde, int field_num, const char *term)
2539
- {
2949
+ static void stde_seek(FrtTermDocEnum *tde, int field_num, const char *term) {
2540
2950
  FrtTermInfo *ti = tir_get_ti_field(STDE(tde)->tir, field_num, term);
2541
2951
  stde_seek_ti(STDE(tde), ti);
2542
2952
  }
2543
2953
 
2544
- static void stde_seek_te(FrtTermDocEnum *tde, FrtTermEnum *te)
2545
- {
2954
+ static void stde_seek_te(FrtTermDocEnum *tde, FrtTermEnum *te) {
2546
2955
  #ifdef DEBUG
2547
2956
  if (te->set_field != &ste_set_field) {
2548
2957
  FRT_RAISE(FRT_ARG_ERROR, "Passed an incorrect TermEnum type");
@@ -2551,20 +2960,17 @@ static void stde_seek_te(FrtTermDocEnum *tde, FrtTermEnum *te)
2551
2960
  stde_seek_ti(STDE(tde), &(te->curr_ti));
2552
2961
  }
2553
2962
 
2554
- static int stde_doc_num(FrtTermDocEnum *tde)
2555
- {
2963
+ static int stde_doc_num(FrtTermDocEnum *tde) {
2556
2964
  CHECK_STATE("doc_num");
2557
2965
  return STDE(tde)->doc_num;
2558
2966
  }
2559
2967
 
2560
- static int stde_freq(FrtTermDocEnum *tde)
2561
- {
2968
+ static int stde_freq(FrtTermDocEnum *tde) {
2562
2969
  CHECK_STATE("freq");
2563
2970
  return STDE(tde)->freq;
2564
2971
  }
2565
2972
 
2566
- static bool stde_next(FrtTermDocEnum *tde)
2567
- {
2973
+ static bool stde_next(FrtTermDocEnum *tde) {
2568
2974
  int doc_code;
2569
2975
  FrtSegmentTermDocEnum *stde = STDE(tde);
2570
2976
 
@@ -2592,8 +2998,7 @@ static bool stde_next(FrtTermDocEnum *tde)
2592
2998
  return true;
2593
2999
  }
2594
3000
 
2595
- static int stde_read(FrtTermDocEnum *tde, int *docs, int *freqs, int req_num)
2596
- {
3001
+ static int stde_read(FrtTermDocEnum *tde, int *docs, int *freqs, int req_num) {
2597
3002
  FrtSegmentTermDocEnum *stde = STDE(tde);
2598
3003
  int i = 0;
2599
3004
  int doc_code;
@@ -2620,8 +3025,7 @@ static int stde_read(FrtTermDocEnum *tde, int *docs, int *freqs, int req_num)
2620
3025
  return i;
2621
3026
  }
2622
3027
 
2623
- static bool stde_skip_to(FrtTermDocEnum *tde, int target_doc_num)
2624
- {
3028
+ static bool stde_skip_to(FrtTermDocEnum *tde, int target_doc_num) {
2625
3029
  FrtSegmentTermDocEnum *stde = STDE(tde);
2626
3030
 
2627
3031
  if (stde->doc_freq >= stde->skip_interval
@@ -2685,8 +3089,7 @@ static bool stde_skip_to(FrtTermDocEnum *tde, int target_doc_num)
2685
3089
  return true;
2686
3090
  }
2687
3091
 
2688
- static void stde_close(FrtTermDocEnum *tde)
2689
- {
3092
+ static void stde_close(FrtTermDocEnum *tde) {
2690
3093
  frt_is_close(STDE(tde)->frq_in);
2691
3094
 
2692
3095
  if (NULL != STDE(tde)->skip_in) {
@@ -2696,23 +3099,17 @@ static void stde_close(FrtTermDocEnum *tde)
2696
3099
  free(tde);
2697
3100
  }
2698
3101
 
2699
- static void stde_skip_prox(FrtSegmentTermDocEnum *stde)
2700
- {
3102
+ static void stde_skip_prox(FrtSegmentTermDocEnum *stde) {
2701
3103
  (void)stde;
2702
3104
  }
2703
3105
 
2704
- static void stde_seek_prox(FrtSegmentTermDocEnum *stde, off_t prx_ptr)
2705
- {
3106
+ static void stde_seek_prox(FrtSegmentTermDocEnum *stde, off_t prx_ptr) {
2706
3107
  (void)stde;
2707
3108
  (void)prx_ptr;
2708
3109
  }
2709
3110
 
2710
3111
 
2711
- FrtTermDocEnum *frt_stde_new(FrtTermInfosReader *tir,
2712
- FrtInStream *frq_in,
2713
- FrtBitVector *deleted_docs,
2714
- int skip_interval)
2715
- {
3112
+ FrtTermDocEnum *frt_stde_new(FrtTermInfosReader *tir, FrtInStream *frq_in, FrtBitVector *deleted_docs, int skip_interval) {
2716
3113
  FrtSegmentTermDocEnum *stde = FRT_ALLOC_AND_ZERO(FrtSegmentTermDocEnum);
2717
3114
  FrtTermDocEnum *tde = (FrtTermDocEnum *)stde;
2718
3115
 
@@ -2744,27 +3141,23 @@ FrtTermDocEnum *frt_stde_new(FrtTermInfosReader *tir,
2744
3141
  * SegmentTermPosEnum
2745
3142
  ****************************************************************************/
2746
3143
 
2747
- static void stpe_seek_ti(FrtSegmentTermDocEnum *stde, FrtTermInfo *ti)
2748
- {
3144
+ static void stpe_seek_ti(FrtSegmentTermDocEnum *stde, FrtTermInfo *ti) {
2749
3145
  if (NULL == ti) {
2750
3146
  stde->doc_freq = 0;
2751
- }
2752
- else {
3147
+ } else {
2753
3148
  stde_seek_ti(stde, ti);
2754
3149
  frt_is_seek(stde->prx_in, ti->prx_ptr);
2755
3150
  }
2756
3151
  }
2757
3152
 
2758
- static void stpe_seek(FrtTermDocEnum *tde, int field_num, const char *term)
2759
- {
3153
+ static void stpe_seek(FrtTermDocEnum *tde, int field_num, const char *term) {
2760
3154
  FrtSegmentTermDocEnum *stde = STDE(tde);
2761
3155
  FrtTermInfo *ti = tir_get_ti_field(stde->tir, field_num, term);
2762
3156
  stpe_seek_ti(stde, ti);
2763
3157
  stde->prx_cnt = 0;
2764
3158
  }
2765
3159
 
2766
- static bool stpe_next(FrtTermDocEnum *tde)
2767
- {
3160
+ static bool stpe_next(FrtTermDocEnum *tde) {
2768
3161
  FrtSegmentTermDocEnum *stde = STDE(tde);
2769
3162
  frt_is_skip_vints(stde->prx_in, stde->prx_cnt);
2770
3163
 
@@ -3238,8 +3631,8 @@ FrtTermDocEnum *frt_mtdpe_new(FrtIndexReader *ir, int field_num, char **terms, i
3238
3631
  ****************************************************************************/
3239
3632
 
3240
3633
  static FrtHash *fn_extensions = NULL;
3241
- static void file_name_filter_init()
3242
- {
3634
+
3635
+ static void file_name_filter_init(void) {
3243
3636
  int i;
3244
3637
  fn_extensions = frt_h_new_str((frt_free_ft)NULL, (frt_free_ft)NULL);
3245
3638
  for (i = 0; i < FRT_NELEMS(INDEX_EXTENSIONS); i++) {
@@ -3538,9 +3931,8 @@ static void ir_acquire_write_lock(FrtIndexReader *ir)
3538
3931
  }
3539
3932
  }
3540
3933
 
3541
- static FrtIndexReader *ir_setup(FrtIndexReader *ir, FrtStore *store, FrtSegmentInfos *sis,
3542
- FrtFieldInfos *fis, int is_owner)
3543
- {
3934
+ static FrtIndexReader *ir_setup(FrtIndexReader *ir, FrtStore *store, FrtSegmentInfos *sis, FrtFieldInfos *fis, int is_owner) {
3935
+ ir->type = FRT_INDEX_READER;
3544
3936
  frt_mutex_init(&ir->mutex, NULL);
3545
3937
  frt_mutex_init(&ir->field_index_mutex, NULL);
3546
3938
 
@@ -3563,8 +3955,7 @@ static FrtIndexReader *ir_setup(FrtIndexReader *ir, FrtStore *store, FrtSegmentI
3563
3955
  return ir;
3564
3956
  }
3565
3957
 
3566
- int frt_ir_doc_freq(FrtIndexReader *ir, FrtSymbol field, const char *term)
3567
- {
3958
+ int frt_ir_doc_freq(FrtIndexReader *ir, ID field, const char *term) {
3568
3959
  int field_num = frt_fis_get_field_num(ir->fis, field);
3569
3960
  if (field_num >= 0) {
3570
3961
  return ir->doc_freq(ir, field_num, term);
@@ -3574,8 +3965,7 @@ int frt_ir_doc_freq(FrtIndexReader *ir, FrtSymbol field, const char *term)
3574
3965
  }
3575
3966
  }
3576
3967
 
3577
- static void ir_set_norm_i(FrtIndexReader *ir, int doc_num, int field_num, frt_uchar val)
3578
- {
3968
+ static void ir_set_norm_i(FrtIndexReader *ir, int doc_num, int field_num, frt_uchar val) {
3579
3969
  frt_mutex_lock(&ir->mutex);
3580
3970
  ir->acquire_write_lock(ir);
3581
3971
  ir->set_norm_i(ir, doc_num, field_num, val);
@@ -3583,8 +3973,7 @@ static void ir_set_norm_i(FrtIndexReader *ir, int doc_num, int field_num, frt_uc
3583
3973
  frt_mutex_unlock(&ir->mutex);
3584
3974
  }
3585
3975
 
3586
- void frt_ir_set_norm(FrtIndexReader *ir, int doc_num, FrtSymbol field, frt_uchar val)
3587
- {
3976
+ void frt_ir_set_norm(FrtIndexReader *ir, int doc_num, ID field, frt_uchar val) {
3588
3977
  int field_num = frt_fis_get_field_num(ir->fis, field);
3589
3978
  if (field_num >= 0) {
3590
3979
  ir_set_norm_i(ir, doc_num, field_num, val);
@@ -3606,14 +3995,12 @@ frt_uchar *frt_ir_get_norms_i(FrtIndexReader *ir, int field_num)
3606
3995
  return norms;
3607
3996
  }
3608
3997
 
3609
- frt_uchar *frt_ir_get_norms(FrtIndexReader *ir, FrtSymbol field)
3610
- {
3998
+ frt_uchar *frt_ir_get_norms(FrtIndexReader *ir, ID field) {
3611
3999
  int field_num = frt_fis_get_field_num(ir->fis, field);
3612
4000
  return frt_ir_get_norms_i(ir, field_num);
3613
4001
  }
3614
4002
 
3615
- frt_uchar *frt_ir_get_norms_into(FrtIndexReader *ir, FrtSymbol field, frt_uchar *buf)
3616
- {
4003
+ frt_uchar *frt_ir_get_norms_into(FrtIndexReader *ir, ID field, frt_uchar *buf) {
3617
4004
  int field_num = frt_fis_get_field_num(ir->fis, field);
3618
4005
  if (field_num >= 0) {
3619
4006
  ir->get_norms_into(ir, field_num, buf);
@@ -3644,7 +4031,7 @@ void frt_ir_delete_doc(FrtIndexReader *ir, int doc_num)
3644
4031
  }
3645
4032
  }
3646
4033
 
3647
- FrtDocument *frt_ir_get_doc_with_term(FrtIndexReader *ir, FrtSymbol field, const char *term) {
4034
+ FrtDocument *frt_ir_get_doc_with_term(FrtIndexReader *ir, ID field, const char *term) {
3648
4035
  FrtTermDocEnum *tde = ir_term_docs_for(ir, field, term);
3649
4036
  FrtDocument *doc = NULL;
3650
4037
 
@@ -3657,8 +4044,7 @@ FrtDocument *frt_ir_get_doc_with_term(FrtIndexReader *ir, FrtSymbol field, const
3657
4044
  return doc;
3658
4045
  }
3659
4046
 
3660
- FrtTermEnum *frt_ir_terms(FrtIndexReader *ir, FrtSymbol field)
3661
- {
4047
+ FrtTermEnum *frt_ir_terms(FrtIndexReader *ir, ID field) {
3662
4048
  FrtTermEnum *te = NULL;
3663
4049
  int field_num = frt_fis_get_field_num(ir->fis, field);
3664
4050
  if (field_num >= 0) {
@@ -3667,9 +4053,7 @@ FrtTermEnum *frt_ir_terms(FrtIndexReader *ir, FrtSymbol field)
3667
4053
  return te;
3668
4054
  }
3669
4055
 
3670
- FrtTermEnum *frt_ir_terms_from(FrtIndexReader *ir, FrtSymbol field,
3671
- const char *term)
3672
- {
4056
+ FrtTermEnum *frt_ir_terms_from(FrtIndexReader *ir, ID field, const char *term) {
3673
4057
  FrtTermEnum *te = NULL;
3674
4058
  int field_num = frt_fis_get_field_num(ir->fis, field);
3675
4059
  if (field_num >= 0) {
@@ -3678,9 +4062,7 @@ FrtTermEnum *frt_ir_terms_from(FrtIndexReader *ir, FrtSymbol field,
3678
4062
  return te;
3679
4063
  }
3680
4064
 
3681
- FrtTermDocEnum *ir_term_docs_for(FrtIndexReader *ir, FrtSymbol field,
3682
- const char *term)
3683
- {
4065
+ FrtTermDocEnum *ir_term_docs_for(FrtIndexReader *ir, ID field, const char *term) {
3684
4066
  int field_num = frt_fis_get_field_num(ir->fis, field);
3685
4067
  FrtTermDocEnum *tde = ir->term_docs(ir);
3686
4068
  if (field_num >= 0) {
@@ -3689,9 +4071,7 @@ FrtTermDocEnum *ir_term_docs_for(FrtIndexReader *ir, FrtSymbol field,
3689
4071
  return tde;
3690
4072
  }
3691
4073
 
3692
- FrtTermDocEnum *frt_ir_term_positions_for(FrtIndexReader *ir, FrtSymbol field,
3693
- const char *term)
3694
- {
4074
+ FrtTermDocEnum *frt_ir_term_positions_for(FrtIndexReader *ir, ID field, const char *term) {
3695
4075
  int field_num = frt_fis_get_field_num(ir->fis, field);
3696
4076
  FrtTermDocEnum *tde = ir->term_positions(ir);
3697
4077
  if (field_num >= 0) {
@@ -3705,7 +4085,7 @@ static void ir_commit_i(FrtIndexReader *ir)
3705
4085
  if (ir->has_changes) {
3706
4086
  if (NULL == ir->deleter && NULL != ir->store) {
3707
4087
  /* In the MultiReader case, we share this deleter across all
3708
- * SegmentReaders: */
4088
+ * FrtSegmentReaders: */
3709
4089
  ir->set_deleter_i(ir, frt_deleter_new(ir->sis, ir->store));
3710
4090
  }
3711
4091
  if (ir->is_owner) {
@@ -3841,34 +4221,14 @@ static void norm_rewrite(Norm *norm, FrtStore *store, FrtDeleter *dlr,
3841
4221
  }
3842
4222
 
3843
4223
  /****************************************************************************
3844
- * SegmentReader
4224
+ * FrtSegmentReader
3845
4225
  ****************************************************************************/
3846
4226
 
3847
- typedef struct SegmentReader {
3848
- FrtIndexReader ir;
3849
- FrtSegmentInfo *si;
3850
- char *segment;
3851
- FrtFieldsReader *fr;
3852
- FrtBitVector *deleted_docs;
3853
- FrtInStream *frq_in;
3854
- FrtInStream *prx_in;
3855
- FrtSegmentFieldIndex *sfi;
3856
- FrtTermInfosReader *tir;
3857
- frt_thread_key_t thread_fr;
3858
- void **fr_bucket;
3859
- FrtHash *norms;
3860
- FrtStore *cfs_store;
3861
- bool deleted_docs_dirty : 1;
3862
- bool undelete_all : 1;
3863
- bool norms_dirty : 1;
3864
- } SegmentReader;
3865
-
3866
4227
  #define IR(ir) ((FrtIndexReader *)(ir))
3867
-
3868
- #define SR(ir) ((SegmentReader *)(ir))
4228
+ #define SR(ir) ((FrtSegmentReader *)(ir))
3869
4229
  #define SR_SIZE(ir) (SR(ir)->fr->size)
3870
4230
 
3871
- static FrtFieldsReader *sr_fr(SegmentReader *sr)
4231
+ static FrtFieldsReader *sr_fr(FrtSegmentReader *sr)
3872
4232
  {
3873
4233
  FrtFieldsReader *fr;
3874
4234
 
@@ -3880,12 +4240,12 @@ static FrtFieldsReader *sr_fr(SegmentReader *sr)
3880
4240
  return fr;
3881
4241
  }
3882
4242
 
3883
- static bool sr_is_deleted_i(SegmentReader *sr, int doc_num)
4243
+ static bool sr_is_deleted_i(FrtSegmentReader *sr, int doc_num)
3884
4244
  {
3885
4245
  return (NULL != sr->deleted_docs && frt_bv_get(sr->deleted_docs, doc_num));
3886
4246
  }
3887
4247
 
3888
- static void sr_get_norms_into_i(SegmentReader *sr, int field_num,
4248
+ static void sr_get_norms_into_i(FrtSegmentReader *sr, int field_num,
3889
4249
  frt_uchar *buf)
3890
4250
  {
3891
4251
  Norm *norm = (Norm *)frt_h_get_int(sr->norms, field_num);
@@ -3904,7 +4264,7 @@ static void sr_get_norms_into_i(SegmentReader *sr, int field_num,
3904
4264
  }
3905
4265
  }
3906
4266
 
3907
- static frt_uchar *sr_get_norms_i(SegmentReader *sr, int field_num)
4267
+ static frt_uchar *sr_get_norms_i(FrtSegmentReader *sr, int field_num)
3908
4268
  {
3909
4269
  Norm *norm = (Norm *)frt_h_get_int(sr->norms, field_num);
3910
4270
  if (NULL == norm) { /* not an indexed field */
@@ -4040,7 +4400,7 @@ static void sr_commit_i(FrtIndexReader *ir)
4040
4400
 
4041
4401
  static void sr_close_i(FrtIndexReader *ir)
4042
4402
  {
4043
- SegmentReader *sr = SR(ir);
4403
+ FrtSegmentReader *sr = SR(ir);
4044
4404
 
4045
4405
  if (sr->fr) frt_fr_close(sr->fr);
4046
4406
  if (sr->tir) frt_tir_close(sr->tir);
@@ -4149,14 +4509,12 @@ static FrtTermDocEnum *sr_term_docs(FrtIndexReader *ir)
4149
4509
 
4150
4510
  static FrtTermDocEnum *sr_term_positions(FrtIndexReader *ir)
4151
4511
  {
4152
- SegmentReader *sr = SR(ir);
4512
+ FrtSegmentReader *sr = SR(ir);
4153
4513
  return frt_stpe_new(sr->tir, sr->frq_in, sr->prx_in, sr->deleted_docs,
4154
4514
  STE(sr->tir->orig_te)->skip_interval);
4155
4515
  }
4156
4516
 
4157
- static FrtTermVector *sr_term_vector(FrtIndexReader *ir, int doc_num,
4158
- FrtSymbol field)
4159
- {
4517
+ static FrtTermVector *sr_term_vector(FrtIndexReader *ir, int doc_num, ID field) {
4160
4518
  FrtFieldInfo *fi = (FrtFieldInfo *)frt_h_get(ir->fis->field_dict, (void *)field);
4161
4519
  FrtFieldsReader *fr;
4162
4520
 
@@ -4211,7 +4569,7 @@ static void sr_open_norms(FrtIndexReader *ir, FrtStore *cfs_store)
4211
4569
  SR(ir)->norms_dirty = false;
4212
4570
  }
4213
4571
 
4214
- static FrtIndexReader *sr_setup_i(SegmentReader *sr)
4572
+ static FrtIndexReader *sr_setup_i(FrtSegmentReader *sr)
4215
4573
  {
4216
4574
  FrtStore *volatile store = sr->si->store;
4217
4575
  FrtIndexReader *ir = IR(sr);
@@ -4242,6 +4600,8 @@ static FrtIndexReader *sr_setup_i(SegmentReader *sr)
4242
4600
  ir->commit_i = &sr_commit_i;
4243
4601
  ir->close_i = &sr_close_i;
4244
4602
 
4603
+ ir->type = FRT_SEGMENT_READER;
4604
+
4245
4605
  sr->cfs_store = NULL;
4246
4606
 
4247
4607
  FRT_TRY
@@ -4281,10 +4641,13 @@ static FrtIndexReader *sr_setup_i(SegmentReader *sr)
4281
4641
  return ir;
4282
4642
  }
4283
4643
 
4284
- static FrtIndexReader *sr_open(FrtSegmentInfos *sis, FrtFieldInfos *fis, int si_num,
4285
- bool is_owner)
4286
- {
4287
- SegmentReader *sr = FRT_ALLOC_AND_ZERO(SegmentReader);
4644
+ FrtSegmentReader *frt_sr_alloc(void) {
4645
+ return FRT_ALLOC_AND_ZERO(FrtSegmentReader);
4646
+ }
4647
+
4648
+ static FrtIndexReader *sr_open(FrtSegmentInfos *sis, FrtFieldInfos *fis, int si_num, bool is_owner, FrtSegmentReader *sr) {
4649
+ if (sr == NULL)
4650
+ sr = frt_sr_alloc();
4288
4651
  sr->si = sis->segs[si_num];
4289
4652
  ir_setup(IR(sr), sr->si->store, sis, fis, is_owner);
4290
4653
  return sr_setup_i(sr);
@@ -4455,9 +4818,7 @@ static FrtTermDocEnum *mr_term_positions(FrtIndexReader *ir)
4455
4818
  return mtpe_new(MR(ir));
4456
4819
  }
4457
4820
 
4458
- static FrtTermVector *mr_term_vector(FrtIndexReader *ir, int doc_num,
4459
- FrtSymbol field)
4460
- {
4821
+ static FrtTermVector *mr_term_vector(FrtIndexReader *ir, int doc_num, ID field) {
4461
4822
  GET_READER();
4462
4823
  return reader->term_vector(reader, doc_num - MR(ir)->starts[i], field);
4463
4824
  }
@@ -4561,10 +4922,12 @@ static void mr_close_i(FrtIndexReader *ir)
4561
4922
  free(MR(ir)->starts);
4562
4923
  }
4563
4924
 
4564
- static FrtIndexReader *mr_new(FrtIndexReader **sub_readers, const int r_cnt)
4565
- {
4925
+ FrtMultiReader *frt_mr_alloc(void) {
4926
+ return FRT_ALLOC_AND_ZERO(FrtMultiReader);
4927
+ }
4928
+
4929
+ FrtMultiReader *frt_mr_init(FrtMultiReader *mr, FrtIndexReader **sub_readers, const int r_cnt) {
4566
4930
  int i;
4567
- FrtMultiReader *mr = FRT_ALLOC_AND_ZERO(FrtMultiReader);
4568
4931
  FrtIndexReader *ir = IR(mr);
4569
4932
 
4570
4933
  mr->sub_readers = sub_readers;
@@ -4611,21 +4974,19 @@ static FrtIndexReader *mr_new(FrtIndexReader **sub_readers, const int r_cnt)
4611
4974
  ir->commit_i = &mr_commit_i;
4612
4975
  ir->close_i = &mr_close_i;
4613
4976
 
4614
- return ir;
4977
+ ir->type = FRT_MULTI_READER;
4978
+
4979
+ return mr;
4615
4980
  }
4616
4981
 
4617
- static FrtIndexReader *frt_mr_open_i(FrtStore *store,
4618
- FrtSegmentInfos *sis,
4619
- FrtFieldInfos *fis,
4620
- FrtIndexReader **sub_readers,
4621
- const int r_cnt)
4622
- {
4623
- FrtIndexReader *ir = mr_new(sub_readers, r_cnt);
4982
+ static FrtIndexReader *frt_mr_open_i(FrtStore *store, FrtSegmentInfos *sis, FrtFieldInfos *fis, FrtIndexReader **sub_readers, const int r_cnt, FrtIndexReader *ir) {
4983
+ if (ir == NULL)
4984
+ ir = (FrtIndexReader *)frt_mr_alloc();
4985
+ ir = (FrtIndexReader *)frt_mr_init((FrtMultiReader *)ir, sub_readers, r_cnt);
4624
4986
  return ir_setup(ir, store, sis, fis, true);
4625
4987
  }
4626
4988
 
4627
- static void mr_close_ext_i(FrtIndexReader *ir)
4628
- {
4989
+ static void mr_close_ext_i(FrtIndexReader *ir) {
4629
4990
  int **field_num_map = MR(ir)->field_num_map;
4630
4991
  if (field_num_map) {
4631
4992
  int i;
@@ -4638,12 +4999,13 @@ static void mr_close_ext_i(FrtIndexReader *ir)
4638
4999
  mr_close_i(ir);
4639
5000
  }
4640
5001
 
4641
- FrtIndexReader *frt_mr_open(FrtIndexReader **sub_readers, const int r_cnt)
4642
- {
4643
- FrtIndexReader *ir = mr_new(sub_readers, r_cnt);
5002
+ FrtIndexReader *frt_mr_open(FrtIndexReader *ir, FrtIndexReader **sub_readers, const int r_cnt) {
5003
+ if (ir == NULL)
5004
+ ir = (FrtIndexReader *)frt_mr_alloc();
5005
+ ir = (FrtIndexReader *)frt_mr_init((FrtMultiReader *)ir, sub_readers, r_cnt);
4644
5006
  FrtMultiReader *mr = MR(ir);
4645
5007
  /* defaults don't matter, this is just for reading fields, not adding */
4646
- FrtFieldInfos *fis = frt_fis_new(FRT_STORE_NO, FRT_INDEX_NO, FRT_TERM_VECTOR_NO);
5008
+ FrtFieldInfos *fis = frt_fis_new(FRT_STORE_NO, FRT_COMPRESSION_NONE, FRT_INDEX_NO, FRT_TERM_VECTOR_NO);
4647
5009
  int i, j;
4648
5010
  bool need_field_map = false;
4649
5011
 
@@ -4678,12 +5040,10 @@ FrtIndexReader *frt_mr_open(FrtIndexReader **sub_readers, const int r_cnt)
4678
5040
  mr->field_num_map[i][j] = fi_sub ? fi_sub->number : -1;
4679
5041
  }
4680
5042
  }
4681
- }
4682
- else {
5043
+ } else {
4683
5044
  mr->field_num_map = NULL;
4684
5045
  }
4685
5046
 
4686
-
4687
5047
  ir->close_i = &mr_close_ext_i;
4688
5048
 
4689
5049
  return ir_setup(ir, NULL, NULL, fis, false);
@@ -4693,21 +5053,19 @@ FrtIndexReader *frt_mr_open(FrtIndexReader **sub_readers, const int r_cnt)
4693
5053
  * IndexReader
4694
5054
  ****************************************************************************/
4695
5055
 
4696
-
4697
- static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf)
4698
- {
5056
+ static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf, FrtIndexReader *ir) {
4699
5057
  volatile bool success = false;
4700
- FrtIndexReader *volatile ir = NULL;
5058
+ // FrtIndexReader *volatile ir = NULL;
4701
5059
  FrtSegmentInfos *volatile sis = NULL;
4702
5060
  FRT_TRY
4703
5061
  do {
4704
5062
  FrtFieldInfos *fis;
4705
5063
  frt_mutex_lock(&store->mutex);
4706
- frt_sis_read_i(store, fsf);
5064
+ frt_sis_read_i(store, fsf, NULL);
4707
5065
  sis = fsf->ret.sis;
4708
5066
  fis = sis->fis;
4709
5067
  if (sis->size == 1) {
4710
- ir = sr_open(sis, fis, 0, true);
5068
+ ir = sr_open(sis, fis, 0, true, (FrtSegmentReader *)ir);
4711
5069
  }
4712
5070
  else {
4713
5071
  volatile int i;
@@ -4715,7 +5073,7 @@ static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf)
4715
5073
  int num_segments = sis->size;
4716
5074
  for (i = num_segments - 1; i >= 0; i--) {
4717
5075
  FRT_TRY
4718
- readers[i] = sr_open(sis, fis, i, false);
5076
+ readers[i] = sr_open(sis, fis, i, false, NULL);
4719
5077
  FRT_XCATCHALL
4720
5078
  for (i++; i < num_segments; i++) {
4721
5079
  frt_ir_close(readers[i]);
@@ -4723,7 +5081,7 @@ static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf)
4723
5081
  free(readers);
4724
5082
  FRT_XENDTRY
4725
5083
  }
4726
- ir = frt_mr_open_i(store, sis, fis, readers, sis->size);
5084
+ ir = frt_mr_open_i(store, sis, fis, readers, sis->size, ir);
4727
5085
  }
4728
5086
  fsf->ret.ir = ir;
4729
5087
  success = true;
@@ -4732,8 +5090,7 @@ static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf)
4732
5090
  if (!success) {
4733
5091
  if (ir) {
4734
5092
  frt_ir_close(ir);
4735
- }
4736
- else if (sis) {
5093
+ } else if (sis) {
4737
5094
  frt_sis_destroy(sis);
4738
5095
  }
4739
5096
  }
@@ -4745,15 +5102,12 @@ static void ir_open_i(FrtStore *store, FindSegmentsFile *fsf)
4745
5102
  * Will keep a reference to the store. To let this method delete the store
4746
5103
  * make sure you deref the store that you pass to it
4747
5104
  */
4748
- FrtIndexReader *frt_ir_open(FrtStore *store)
4749
- {
5105
+ FrtIndexReader *frt_ir_open(FrtIndexReader *ir, FrtStore *store) {
4750
5106
  FindSegmentsFile fsf;
4751
- sis_find_segments_file(store, &fsf, &ir_open_i);
5107
+ sis_find_segments_file(store, &fsf, &ir_open_i, ir);
4752
5108
  return fsf.ret.ir;
4753
5109
  }
4754
5110
 
4755
-
4756
-
4757
5111
  /****************************************************************************
4758
5112
  *
4759
5113
  * Occurence
@@ -5143,10 +5497,7 @@ static void dw_add_offsets(FrtDocWriter *dw, int pos, off_t start, off_t end)
5143
5497
  dw->offsets_size = pos + 1;
5144
5498
  }
5145
5499
 
5146
- FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
5147
- FrtFieldInverter *fld_inv,
5148
- FrtDocField *df)
5149
- {
5500
+ FrtHash *frt_dw_invert_field(FrtDocWriter *dw, FrtFieldInverter *fld_inv, FrtDocField *df) {
5150
5501
  FrtMemoryPool *mp = dw->mp;
5151
5502
  FrtAnalyzer *a = dw->analyzer;
5152
5503
  FrtHash *curr_plists = dw->curr_plists;
@@ -5162,7 +5513,7 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
5162
5513
  int pos = -1, num_terms = 0;
5163
5514
 
5164
5515
  for (i = 0; i < df_size; i++) {
5165
- FrtTokenStream *ts = frt_a_get_ts(a, df->name, df->data[i]);
5516
+ FrtTokenStream *ts = frt_a_get_ts(a, df->name, df->data[i], df->encodings[i]);
5166
5517
  /* ts->reset(ts, df->data[i]); no longer being called */
5167
5518
  if (store_offsets) {
5168
5519
  while (NULL != (tk = ts->next(ts))) {
@@ -5172,21 +5523,16 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
5172
5523
  if (pos < 0) {
5173
5524
  pos = 0;
5174
5525
  }
5175
- dw_add_posting(mp, curr_plists, fld_plists, doc_num,
5176
- tk->text, tk->len, pos);
5177
- dw_add_offsets(dw, pos,
5178
- start_offset + tk->start,
5179
- start_offset + tk->end);
5526
+ dw_add_posting(mp, curr_plists, fld_plists, doc_num, tk->text, tk->len, pos);
5527
+ dw_add_offsets(dw, pos, start_offset + tk->start, start_offset + tk->end);
5180
5528
  if (num_terms++ >= dw->max_field_length) {
5181
5529
  break;
5182
5530
  }
5183
5531
  }
5184
- }
5185
- else {
5532
+ } else {
5186
5533
  while (NULL != (tk = ts->next(ts))) {
5187
5534
  pos += tk->pos_inc;
5188
- dw_add_posting(mp, curr_plists, fld_plists, doc_num,
5189
- tk->text, tk->len, pos);
5535
+ dw_add_posting(mp, curr_plists, fld_plists, doc_num, tk->text, tk->len, pos);
5190
5536
  if (num_terms++ >= dw->max_field_length) {
5191
5537
  break;
5192
5538
  }
@@ -5196,8 +5542,7 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
5196
5542
  start_offset += df->lengths[i] + 1;
5197
5543
  }
5198
5544
  fld_inv->length = num_terms;
5199
- }
5200
- else {
5545
+ } else {
5201
5546
  char buf[FRT_MAX_WORD_SIZE];
5202
5547
  buf[FRT_MAX_WORD_SIZE - 1] = '\0';
5203
5548
  for (i = 0; i < df_size; i++) {
@@ -5207,11 +5552,9 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
5207
5552
  len = FRT_MAX_WORD_SIZE - 1;
5208
5553
  data_ptr = (char *)memcpy(buf, df->data[i], len);
5209
5554
  }
5210
- dw_add_posting(mp, curr_plists, fld_plists, doc_num, data_ptr,
5211
- len, i);
5555
+ dw_add_posting(mp, curr_plists, fld_plists, doc_num, data_ptr, len, i);
5212
5556
  if (store_offsets) {
5213
- dw_add_offsets(dw, i, start_offset,
5214
- start_offset + df->lengths[i]);
5557
+ dw_add_offsets(dw, i, start_offset, start_offset + df->lengths[i]);
5215
5558
  }
5216
5559
  start_offset += df->lengths[i] + 1;
5217
5560
  }
@@ -5220,14 +5563,12 @@ FrtHash *frt_dw_invert_field(FrtDocWriter *dw,
5220
5563
  return curr_plists;
5221
5564
  }
5222
5565
 
5223
- void frt_dw_reset_postings(FrtHash *postings)
5224
- {
5566
+ void frt_dw_reset_postings(FrtHash *postings) {
5225
5567
  FRT_ZEROSET_N(postings->table, FrtHashEntry, postings->mask + 1);
5226
5568
  postings->fill = postings->size = 0;
5227
5569
  }
5228
5570
 
5229
- void frt_dw_add_doc(FrtDocWriter *dw, FrtDocument *doc)
5230
- {
5571
+ void frt_dw_add_doc(FrtDocWriter *dw, FrtDocument *doc) {
5231
5572
  int i;
5232
5573
  float boost;
5233
5574
  FrtDocField *df;
@@ -5249,16 +5590,12 @@ void frt_dw_add_doc(FrtDocWriter *dw, FrtDocument *doc)
5249
5590
 
5250
5591
  postings = frt_dw_invert_field(dw, fld_inv, df);
5251
5592
  if (fld_inv->store_term_vector) {
5252
- frt_fw_add_postings(dw->fw, fld_inv->fi->number,
5253
- dw_sort_postings(postings), postings->size,
5254
- dw->offsets, dw->offsets_size);
5593
+ frt_fw_add_postings(dw->fw, fld_inv->fi->number, dw_sort_postings(postings), postings->size, dw->offsets, dw->offsets_size);
5255
5594
  }
5256
5595
 
5257
5596
  if (fld_inv->has_norms) {
5258
- boost = fld_inv->fi->boost * doc->boost * df->boost *
5259
- frt_sim_length_norm(dw->similarity, fi->name, fld_inv->length);
5260
- fld_inv->norms[dw->doc_num] =
5261
- frt_sim_encode_norm(dw->similarity, boost);
5597
+ boost = fld_inv->fi->boost * doc->boost * df->boost * frt_sim_length_norm(dw->similarity, fi->name, fld_inv->length);
5598
+ fld_inv->norms[dw->doc_num] = frt_sim_encode_norm(dw->similarity, boost);
5262
5599
  }
5263
5600
  frt_dw_reset_postings(postings);
5264
5601
  if (dw->offsets_size > 0) {
@@ -5811,15 +6148,12 @@ static void iw_commit_compound_file(FrtIndexWriter *iw, FrtSegmentInfo *si)
5811
6148
  iw_create_compound_file(iw->store, iw->fis, si, cfs_name, iw->deleter);
5812
6149
  }
5813
6150
 
5814
- static void iw_merge_segments(FrtIndexWriter *iw, const int min_seg,
5815
- const int max_seg)
5816
- {
6151
+ static void iw_merge_segments(FrtIndexWriter *iw, const int min_seg, const int max_seg) {
5817
6152
  int i;
5818
6153
  FrtSegmentInfos *sis = iw->sis;
5819
6154
  FrtSegmentInfo *si = frt_sis_new_segment(sis, 0, iw->store);
5820
6155
 
5821
- SegmentMerger *merger = sm_create(iw, si, &sis->segs[min_seg],
5822
- max_seg - min_seg);
6156
+ SegmentMerger *merger = sm_create(iw, si, &sis->segs[min_seg], max_seg - min_seg);
5823
6157
 
5824
6158
  /* This is where all the action happens. */
5825
6159
  si->doc_cnt = sm_merge(merger);
@@ -5931,8 +6265,7 @@ void frt_iw_commit(FrtIndexWriter *iw)
5931
6265
  frt_mutex_unlock(&iw->mutex);
5932
6266
  }
5933
6267
 
5934
- void frt_iw_delete_term(FrtIndexWriter *iw, FrtSymbol field, const char *term)
5935
- {
6268
+ void frt_iw_delete_term(FrtIndexWriter *iw, ID field, const char *term) {
5936
6269
  int field_num = frt_fis_get_field_num(iw->fis, field);
5937
6270
  if (field_num >= 0) {
5938
6271
  int i;
@@ -5943,7 +6276,7 @@ void frt_iw_delete_term(FrtIndexWriter *iw, FrtSymbol field, const char *term)
5943
6276
  const int seg_cnt = sis->size;
5944
6277
  bool did_delete = false;
5945
6278
  for (i = 0; i < seg_cnt; i++) {
5946
- FrtIndexReader *ir = sr_open(sis, iw->fis, i, false);
6279
+ FrtIndexReader *ir = sr_open(sis, iw->fis, i, false, NULL);
5947
6280
  FrtTermDocEnum *tde = ir->term_docs(ir);
5948
6281
  ir->deleter = iw->deleter;
5949
6282
  stde_seek(tde, field_num, term);
@@ -5965,9 +6298,7 @@ void frt_iw_delete_term(FrtIndexWriter *iw, FrtSymbol field, const char *term)
5965
6298
  }
5966
6299
  }
5967
6300
 
5968
- void frt_iw_delete_terms(FrtIndexWriter *iw, FrtSymbol field,
5969
- char **terms, const int term_cnt)
5970
- {
6301
+ void frt_iw_delete_terms(FrtIndexWriter *iw, ID field, char **terms, const int term_cnt) {
5971
6302
  int field_num = frt_fis_get_field_num(iw->fis, field);
5972
6303
  if (field_num >= 0) {
5973
6304
  int i;
@@ -5978,7 +6309,7 @@ void frt_iw_delete_terms(FrtIndexWriter *iw, FrtSymbol field,
5978
6309
  const int seg_cnt = sis->size;
5979
6310
  bool did_delete = false;
5980
6311
  for (i = 0; i < seg_cnt; i++) {
5981
- FrtIndexReader *ir = sr_open(sis, iw->fis, i, false);
6312
+ FrtIndexReader *ir = sr_open(sis, iw->fis, i, false, NULL);
5982
6313
  FrtTermDocEnum *tde = ir->term_docs(ir);
5983
6314
  int j;
5984
6315
  for (j = 0 ; j < term_cnt; j++) {
@@ -6047,10 +6378,13 @@ void frt_iw_close(FrtIndexWriter *iw)
6047
6378
  free(iw);
6048
6379
  }
6049
6380
 
6050
- FrtIndexWriter *frt_iw_open(FrtStore *store, FrtAnalyzer *volatile analyzer,
6051
- const FrtConfig *config)
6052
- {
6053
- FrtIndexWriter *iw = FRT_ALLOC_AND_ZERO(FrtIndexWriter);
6381
+ FrtIndexWriter *frt_iw_alloc(void) {
6382
+ return FRT_ALLOC_AND_ZERO(FrtIndexWriter);
6383
+ }
6384
+
6385
+ FrtIndexWriter *frt_iw_open(FrtIndexWriter *iw, FrtStore *store, FrtAnalyzer *volatile analyzer, const FrtConfig *config) {
6386
+ if (iw == NULL)
6387
+ iw = frt_iw_alloc();
6054
6388
  frt_mutex_init(&iw->mutex, NULL);
6055
6389
  iw->store = store;
6056
6390
  if (!config) {
@@ -6081,7 +6415,7 @@ FrtIndexWriter *frt_iw_open(FrtStore *store, FrtAnalyzer *volatile analyzer,
6081
6415
 
6082
6416
  iw->similarity = frt_sim_create_default();
6083
6417
  iw->analyzer = analyzer ? (FrtAnalyzer *)analyzer
6084
- : frt_mb_standard_analyzer_new(true);
6418
+ : frt_standard_analyzer_new(true);
6085
6419
 
6086
6420
  iw->deleter = frt_deleter_new(iw->sis, store);
6087
6421
  deleter_delete_deletable_files(iw->deleter);
@@ -6093,9 +6427,7 @@ FrtIndexWriter *frt_iw_open(FrtStore *store, FrtAnalyzer *volatile analyzer,
6093
6427
  /*******************/
6094
6428
  /*** Add Indexes ***/
6095
6429
  /*******************/
6096
- static void iw_cp_fields(FrtIndexWriter *iw, SegmentReader *sr,
6097
- const char *segment, int *map)
6098
- {
6430
+ static void iw_cp_fields(FrtIndexWriter *iw, FrtSegmentReader *sr, const char *segment, int *map) {
6099
6431
  char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
6100
6432
  FrtOutStream *fdt_out, *fdx_out;
6101
6433
  FrtInStream *fdt_in, *fdx_in;
@@ -6122,7 +6454,6 @@ static void iw_cp_fields(FrtIndexWriter *iw, SegmentReader *sr,
6122
6454
  frt_is2os_copy_bytes(del_in, del_out, frt_is_length(del_in));
6123
6455
  }
6124
6456
 
6125
-
6126
6457
  if (map) {
6127
6458
  int i;
6128
6459
  const int max_doc = sr_max_doc(IR(sr));
@@ -6143,10 +6474,14 @@ static void iw_cp_fields(FrtIndexWriter *iw, SegmentReader *sr,
6143
6474
  frt_os_write_vint(fdt_out, df_size);
6144
6475
  /* sum total lengths of FrtDocField */
6145
6476
  for (k = 0; k < df_size; k++) {
6146
- /* Each field has one ' ' byte so add 1 */
6147
- const int flen = frt_is_read_vint(fdt_in);
6477
+ const int flen = frt_is_read_vint(fdt_in); /* length */
6478
+ const int fenc = frt_is_read_vint(fdt_in); /* encoding */
6479
+ const int fcmp = frt_is_read_vint(fdt_in); /* compression */
6148
6480
  frt_os_write_vint(fdt_out, flen);
6149
- data_len += flen + 1;
6481
+ frt_os_write_vint(fdt_out, fenc);
6482
+ frt_os_write_vint(fdt_out, fcmp);
6483
+ /* Each field has one ' ' byte so add 1 */
6484
+ data_len += flen + 1;
6150
6485
  }
6151
6486
  }
6152
6487
  frt_is2os_copy_bytes(fdt_in, fdt_out, data_len);
@@ -6169,8 +6504,7 @@ static void iw_cp_fields(FrtIndexWriter *iw, SegmentReader *sr,
6169
6504
  frt_os_write_vint(fdt_out, tv_size);
6170
6505
  }
6171
6506
  }
6172
- }
6173
- else {
6507
+ } else {
6174
6508
  frt_is2os_copy_bytes(fdt_in, fdt_out, frt_is_length(fdt_in));
6175
6509
  frt_is2os_copy_bytes(fdx_in, fdx_out, frt_is_length(fdx_in));
6176
6510
  }
@@ -6180,7 +6514,7 @@ static void iw_cp_fields(FrtIndexWriter *iw, SegmentReader *sr,
6180
6514
  frt_os_close(fdx_out);
6181
6515
  }
6182
6516
 
6183
- static void iw_cp_terms(FrtIndexWriter *iw, SegmentReader *sr,
6517
+ static void iw_cp_terms(FrtIndexWriter *iw, FrtSegmentReader *sr,
6184
6518
  const char *segment, int *map)
6185
6519
  {
6186
6520
  char file_name[FRT_SEGMENT_NAME_MAX_LENGTH];
@@ -6249,7 +6583,7 @@ static void iw_cp_terms(FrtIndexWriter *iw, SegmentReader *sr,
6249
6583
  frt_os_close(prx_out);
6250
6584
  }
6251
6585
 
6252
- static void iw_cp_norms(FrtIndexWriter *iw, SegmentReader *sr,
6586
+ static void iw_cp_norms(FrtIndexWriter *iw, FrtSegmentReader *sr,
6253
6587
  FrtSegmentInfo *si, int *map)
6254
6588
  {
6255
6589
  int i;
@@ -6280,9 +6614,7 @@ static void iw_cp_norms(FrtIndexWriter *iw, SegmentReader *sr,
6280
6614
  }
6281
6615
  }
6282
6616
 
6283
- static void iw_cp_map_files(FrtIndexWriter *iw, SegmentReader *sr,
6284
- FrtSegmentInfo *si)
6285
- {
6617
+ static void iw_cp_map_files(FrtIndexWriter *iw, FrtSegmentReader *sr, FrtSegmentInfo *si) {
6286
6618
  int i;
6287
6619
  FrtFieldInfos *from_fis = IR(sr)->fis;
6288
6620
  FrtFieldInfos *to_fis = iw->fis;
@@ -6300,15 +6632,13 @@ static void iw_cp_map_files(FrtIndexWriter *iw, SegmentReader *sr,
6300
6632
  free(field_map);
6301
6633
  }
6302
6634
 
6303
- static void iw_cp_files(FrtIndexWriter *iw, SegmentReader *sr,
6304
- FrtSegmentInfo *si)
6305
- {
6635
+ static void iw_cp_files(FrtIndexWriter *iw, FrtSegmentReader *sr, FrtSegmentInfo *si) {
6306
6636
  iw_cp_fields(iw, sr, si->name, NULL);
6307
6637
  iw_cp_terms( iw, sr, si->name, NULL);
6308
6638
  iw_cp_norms( iw, sr, si, NULL);
6309
6639
  }
6310
6640
 
6311
- static void iw_add_segment(FrtIndexWriter *iw, SegmentReader *sr)
6641
+ static void iw_add_segment(FrtIndexWriter *iw, FrtSegmentReader *sr)
6312
6642
  {
6313
6643
  FrtSegmentInfo *si = frt_sis_new_segment(iw->sis, 0, iw->store);
6314
6644
  FrtFieldInfos *fis = iw->fis;
@@ -6323,7 +6653,7 @@ static void iw_add_segment(FrtIndexWriter *iw, SegmentReader *sr)
6323
6653
  FrtFieldInfo *fi = sub_fis->fields[j];
6324
6654
  FrtFieldInfo *new_fi = frt_fis_get_field(fis, fi->name);
6325
6655
  if (NULL == new_fi) {
6326
- new_fi = frt_fi_new(fi->name, FRT_STORE_NO, FRT_INDEX_NO, FRT_TERM_VECTOR_NO);
6656
+ new_fi = frt_fi_new(fi->name, FRT_STORE_NO, FRT_COMPRESSION_NONE, FRT_INDEX_NO, FRT_TERM_VECTOR_NO);
6327
6657
  new_fi->bits = fi->bits;
6328
6658
  frt_fis_add_field(fis, new_fi);
6329
6659
  }