isomorfeus-ferret 0.12.4 → 0.12.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (216) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +612 -612
  3. data/README.md +77 -48
  4. data/ext/isomorfeus_ferret_ext/bm_hash.c +9 -6
  5. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +4 -2
  6. data/ext/isomorfeus_ferret_ext/brotli_common_constants.c +15 -0
  7. data/ext/isomorfeus_ferret_ext/brotli_common_constants.h +200 -0
  8. data/ext/isomorfeus_ferret_ext/brotli_common_context.c +156 -0
  9. data/ext/isomorfeus_ferret_ext/brotli_common_context.h +113 -0
  10. data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +5914 -0
  11. data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.h +64 -0
  12. data/ext/isomorfeus_ferret_ext/brotli_common_platform.c +22 -0
  13. data/ext/isomorfeus_ferret_ext/brotli_common_platform.h +594 -0
  14. data/ext/isomorfeus_ferret_ext/brotli_common_transform.c +291 -0
  15. data/ext/isomorfeus_ferret_ext/brotli_common_transform.h +85 -0
  16. data/ext/isomorfeus_ferret_ext/brotli_common_version.h +26 -0
  17. data/ext/isomorfeus_ferret_ext/brotli_dec_bit_reader.c +76 -0
  18. data/ext/isomorfeus_ferret_ext/brotli_dec_bit_reader.h +351 -0
  19. data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +2608 -0
  20. data/ext/isomorfeus_ferret_ext/brotli_dec_huffman.c +339 -0
  21. data/ext/isomorfeus_ferret_ext/brotli_dec_huffman.h +121 -0
  22. data/ext/isomorfeus_ferret_ext/brotli_dec_prefix.h +732 -0
  23. data/ext/isomorfeus_ferret_ext/brotli_dec_state.c +159 -0
  24. data/ext/isomorfeus_ferret_ext/brotli_dec_state.h +365 -0
  25. data/ext/isomorfeus_ferret_ext/brotli_decode.h +344 -0
  26. data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references.c +145 -0
  27. data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references.h +39 -0
  28. data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_hq.c +843 -0
  29. data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_hq.h +95 -0
  30. data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_inc.h +163 -0
  31. data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost.c +35 -0
  32. data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost.h +63 -0
  33. data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost_inc.h +127 -0
  34. data/ext/isomorfeus_ferret_ext/brotli_enc_block_encoder_inc.h +34 -0
  35. data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter.c +194 -0
  36. data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter.h +51 -0
  37. data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter_inc.h +440 -0
  38. data/ext/isomorfeus_ferret_ext/brotli_enc_brotli_bit_stream.c +1314 -0
  39. data/ext/isomorfeus_ferret_ext/brotli_enc_brotli_bit_stream.h +84 -0
  40. data/ext/isomorfeus_ferret_ext/brotli_enc_cluster.c +56 -0
  41. data/ext/isomorfeus_ferret_ext/brotli_enc_cluster.h +48 -0
  42. data/ext/isomorfeus_ferret_ext/brotli_enc_cluster_inc.h +320 -0
  43. data/ext/isomorfeus_ferret_ext/brotli_enc_command.c +28 -0
  44. data/ext/isomorfeus_ferret_ext/brotli_enc_command.h +190 -0
  45. data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment.c +790 -0
  46. data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment.h +61 -0
  47. data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment_two_pass.c +645 -0
  48. data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment_two_pass.h +54 -0
  49. data/ext/isomorfeus_ferret_ext/brotli_enc_dictionary_hash.c +1846 -0
  50. data/ext/isomorfeus_ferret_ext/brotli_enc_dictionary_hash.h +25 -0
  51. data/ext/isomorfeus_ferret_ext/brotli_enc_encode.c +1927 -0
  52. data/ext/isomorfeus_ferret_ext/brotli_enc_encoder_dict.c +33 -0
  53. data/ext/isomorfeus_ferret_ext/brotli_enc_encoder_dict.h +43 -0
  54. data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode.c +503 -0
  55. data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode.h +122 -0
  56. data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode_static.h +539 -0
  57. data/ext/isomorfeus_ferret_ext/brotli_enc_fast_log.c +105 -0
  58. data/ext/isomorfeus_ferret_ext/brotli_enc_fast_log.h +66 -0
  59. data/ext/isomorfeus_ferret_ext/brotli_enc_find_match_length.h +79 -0
  60. data/ext/isomorfeus_ferret_ext/brotli_enc_hash.h +488 -0
  61. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_composite_inc.h +125 -0
  62. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_forgetful_chain_inc.h +293 -0
  63. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match64_inc.h +267 -0
  64. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match_inc.h +262 -0
  65. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match_quickly_inc.h +266 -0
  66. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_rolling_inc.h +212 -0
  67. data/ext/isomorfeus_ferret_ext/brotli_enc_hash_to_binary_tree_inc.h +329 -0
  68. data/ext/isomorfeus_ferret_ext/brotli_enc_histogram.c +100 -0
  69. data/ext/isomorfeus_ferret_ext/brotli_enc_histogram.h +63 -0
  70. data/ext/isomorfeus_ferret_ext/brotli_enc_histogram_inc.h +51 -0
  71. data/ext/isomorfeus_ferret_ext/brotli_enc_literal_cost.c +175 -0
  72. data/ext/isomorfeus_ferret_ext/brotli_enc_literal_cost.h +30 -0
  73. data/ext/isomorfeus_ferret_ext/brotli_enc_memory.c +170 -0
  74. data/ext/isomorfeus_ferret_ext/brotli_enc_memory.h +114 -0
  75. data/ext/isomorfeus_ferret_ext/brotli_enc_metablock.c +663 -0
  76. data/ext/isomorfeus_ferret_ext/brotli_enc_metablock.h +105 -0
  77. data/ext/isomorfeus_ferret_ext/brotli_enc_metablock_inc.h +183 -0
  78. data/ext/isomorfeus_ferret_ext/brotli_enc_params.h +46 -0
  79. data/ext/isomorfeus_ferret_ext/brotli_enc_prefix.h +53 -0
  80. data/ext/isomorfeus_ferret_ext/brotli_enc_quality.h +165 -0
  81. data/ext/isomorfeus_ferret_ext/brotli_enc_ringbuffer.h +167 -0
  82. data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict.c +486 -0
  83. data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict.h +40 -0
  84. data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict_lut.h +5864 -0
  85. data/ext/isomorfeus_ferret_ext/brotli_enc_utf8_util.c +85 -0
  86. data/ext/isomorfeus_ferret_ext/brotli_enc_utf8_util.h +32 -0
  87. data/ext/isomorfeus_ferret_ext/brotli_enc_write_bits.h +87 -0
  88. data/ext/isomorfeus_ferret_ext/brotli_encode.h +448 -0
  89. data/ext/isomorfeus_ferret_ext/brotli_port.h +288 -0
  90. data/ext/isomorfeus_ferret_ext/brotli_types.h +83 -0
  91. data/ext/isomorfeus_ferret_ext/frb_index.c +35 -4
  92. data/ext/isomorfeus_ferret_ext/frb_store.c +34 -5
  93. data/ext/isomorfeus_ferret_ext/frt_document.h +1 -0
  94. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +1 -0
  95. data/ext/isomorfeus_ferret_ext/frt_index.c +174 -25
  96. data/ext/isomorfeus_ferret_ext/frt_index.h +6 -3
  97. data/ext/isomorfeus_ferret_ext/frt_posh.h +11 -19
  98. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +1844 -1911
  99. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +12 -15
  100. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +1 -0
  101. data/ext/isomorfeus_ferret_ext/frt_scanner.c +1 -0
  102. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +1 -0
  103. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +1 -0
  104. data/ext/isomorfeus_ferret_ext/frt_search.h +1 -1
  105. data/ext/isomorfeus_ferret_ext/libstemmer.c +14 -11
  106. data/ext/isomorfeus_ferret_ext/libstemmer.h +4 -9
  107. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +1167 -0
  108. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +6 -0
  109. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +1433 -0
  110. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +6 -0
  111. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +120 -143
  112. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +1 -2
  113. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +217 -237
  114. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +1 -1
  115. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +377 -432
  116. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +1 -1
  117. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +298 -342
  118. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +1 -2
  119. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +530 -524
  120. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +1 -1
  121. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +201 -214
  122. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +1 -1
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +1 -1
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +394 -0
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +6 -0
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +457 -0
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +6 -0
  128. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +396 -439
  129. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +1 -1
  130. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +104 -128
  131. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +1 -1
  132. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +242 -273
  133. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +1 -1
  134. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +406 -461
  135. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +1 -2
  136. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +405 -456
  137. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +1 -1
  138. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +108 -126
  139. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +1 -1
  140. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +849 -0
  141. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +6 -0
  142. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +373 -405
  143. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +1 -1
  144. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +288 -305
  145. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +1 -1
  146. data/ext/isomorfeus_ferret_ext/stem_UTF_8_arabic.c +1651 -0
  147. data/ext/isomorfeus_ferret_ext/stem_UTF_8_arabic.h +6 -0
  148. data/ext/isomorfeus_ferret_ext/stem_UTF_8_armenian.c +546 -0
  149. data/ext/isomorfeus_ferret_ext/stem_UTF_8_armenian.h +6 -0
  150. data/ext/isomorfeus_ferret_ext/stem_UTF_8_basque.c +1171 -0
  151. data/ext/isomorfeus_ferret_ext/stem_UTF_8_basque.h +6 -0
  152. data/ext/isomorfeus_ferret_ext/stem_UTF_8_catalan.c +1436 -0
  153. data/ext/isomorfeus_ferret_ext/stem_UTF_8_catalan.h +6 -0
  154. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.c +121 -141
  155. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.h +1 -1
  156. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.c +221 -241
  157. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.h +1 -1
  158. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.c +381 -431
  159. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.h +1 -1
  160. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.c +300 -345
  161. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.h +1 -1
  162. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.c +518 -511
  163. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.h +1 -1
  164. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.c +201 -209
  165. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.h +1 -1
  166. data/ext/isomorfeus_ferret_ext/stem_UTF_8_greek.c +3660 -0
  167. data/ext/isomorfeus_ferret_ext/stem_UTF_8_greek.h +6 -0
  168. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hindi.c +309 -0
  169. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hindi.h +6 -0
  170. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.c +306 -671
  171. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.h +1 -1
  172. data/ext/isomorfeus_ferret_ext/stem_UTF_8_indonesian.c +394 -0
  173. data/ext/isomorfeus_ferret_ext/stem_UTF_8_indonesian.h +6 -0
  174. data/ext/isomorfeus_ferret_ext/stem_UTF_8_irish.c +457 -0
  175. data/ext/isomorfeus_ferret_ext/stem_UTF_8_irish.h +6 -0
  176. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.c +400 -442
  177. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.h +1 -1
  178. data/ext/isomorfeus_ferret_ext/stem_UTF_8_lithuanian.c +824 -0
  179. data/ext/isomorfeus_ferret_ext/stem_UTF_8_lithuanian.h +6 -0
  180. data/ext/isomorfeus_ferret_ext/stem_UTF_8_nepali.c +408 -0
  181. data/ext/isomorfeus_ferret_ext/stem_UTF_8_nepali.h +6 -0
  182. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.c +105 -127
  183. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.h +1 -1
  184. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.c +245 -276
  185. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.h +1 -1
  186. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.c +409 -464
  187. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.h +1 -1
  188. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.c +376 -408
  189. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.h +1 -1
  190. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.c +272 -287
  191. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.h +1 -1
  192. data/ext/isomorfeus_ferret_ext/stem_UTF_8_serbian.c +6530 -0
  193. data/ext/isomorfeus_ferret_ext/stem_UTF_8_serbian.h +6 -0
  194. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.c +407 -458
  195. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.h +1 -1
  196. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.c +110 -125
  197. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.h +1 -1
  198. data/ext/isomorfeus_ferret_ext/stem_UTF_8_tamil.c +1865 -0
  199. data/ext/isomorfeus_ferret_ext/stem_UTF_8_tamil.h +6 -0
  200. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.c +698 -806
  201. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.h +1 -1
  202. data/ext/isomorfeus_ferret_ext/stem_UTF_8_yiddish.c +1220 -0
  203. data/ext/isomorfeus_ferret_ext/stem_UTF_8_yiddish.h +6 -0
  204. data/ext/isomorfeus_ferret_ext/stem_api.c +1 -9
  205. data/ext/isomorfeus_ferret_ext/stem_api.h +1 -3
  206. data/ext/isomorfeus_ferret_ext/stem_header.h +30 -26
  207. data/ext/isomorfeus_ferret_ext/stem_modules.h +113 -26
  208. data/ext/isomorfeus_ferret_ext/stem_modules.txt +18 -5
  209. data/ext/isomorfeus_ferret_ext/stem_utilities.c +167 -132
  210. data/ext/isomorfeus_ferret_ext/test.c +7 -1
  211. data/ext/isomorfeus_ferret_ext/test_fields.c +57 -45
  212. data/ext/isomorfeus_ferret_ext/test_index.c +4 -1
  213. data/ext/isomorfeus_ferret_ext/test_search.c +0 -1
  214. data/lib/isomorfeus/ferret/version.rb +1 -1
  215. metadata +125 -5
  216. data/ext/isomorfeus_ferret_ext/q_parser.y +0 -1366
@@ -6,6 +6,8 @@
6
6
  #include <string.h>
7
7
  #include <limits.h>
8
8
  #include <ctype.h>
9
+ #include "brotli_decode.h"
10
+ #include "brotli_encode.h"
9
11
 
10
12
  extern void frt_micro_sleep(const int micro_seconds);
11
13
 
@@ -39,8 +41,8 @@ static char *ste_next(FrtTermEnum *te);
39
41
  #define FORMAT 0
40
42
  #define SEGMENTS_GEN_FILE_NAME "segments"
41
43
  #define MAX_EXT_LEN 10
42
- #define ZIP_BUFFER_SIZE 16348
43
- #define ZIP_LEVEL 9
44
+ #define COMPRESSION_BUFFER_SIZE 16348
45
+ #define COMPRESSION_LEVEL 9
44
46
 
45
47
  /* *** Must be three characters *** */
46
48
  static const char *INDEX_EXTENSIONS[] = {
@@ -220,6 +222,9 @@ static void fi_set_store(FrtFieldInfo *fi, int store)
220
222
  case FRT_STORE_YES:
221
223
  fi->bits |= FRT_FI_IS_STORED_BM;
222
224
  break;
225
+ case FRT_STORE_COMPRESS:
226
+ fi->bits |= FRT_FI_IS_COMPRESSED_BM | FRT_FI_IS_STORED_BM;
227
+ break;
223
228
  }
224
229
  }
225
230
 
@@ -304,8 +309,9 @@ char *frt_fi_to_s(FrtFieldInfo *fi)
304
309
  const char *fi_name = rb_id2name(fi->name);
305
310
  char *str = FRT_ALLOC_N(char, strlen(fi_name) + 200);
306
311
  char *s = str;
307
- s += sprintf(str, "[\"%s\":(%s%s%s%s%s%s%s", fi_name,
312
+ s += sprintf(str, "[\"%s\":(%s%s%s%s%s%s%s%s", fi_name,
308
313
  fi_is_stored(fi) ? "is_stored, " : "",
314
+ fi_is_compressed(fi) ? "is_compressed, " : "",
309
315
  fi_is_indexed(fi) ? "is_indexed, " : "",
310
316
  fi_is_tokenized(fi) ? "is_tokenized, " : "",
311
317
  fi_omit_norms(fi) ? "omit_norms, " : "",
@@ -443,7 +449,8 @@ void frt_fis_write(FrtFieldInfos *fis, FrtOutStream *os)
443
449
  static const char *store_str[] = {
444
450
  ":no",
445
451
  ":yes",
446
- ""
452
+ "",
453
+ ":compressed"
447
454
  };
448
455
 
449
456
  static const char *fi_store_str(FrtFieldInfo *fi)
@@ -1145,12 +1152,13 @@ frt_u64 frt_sis_read_current_version(FrtStore *store)
1145
1152
  *
1146
1153
  ****************************************************************************/
1147
1154
 
1148
- static FrtLazyDocField *lazy_df_new(FrtSymbol name, const int size)
1155
+ static FrtLazyDocField *lazy_df_new(FrtSymbol name, const int size, bool is_compressed)
1149
1156
  {
1150
1157
  FrtLazyDocField *self = FRT_ALLOC(FrtLazyDocField);
1151
1158
  self->name = name;
1152
1159
  self->size = size;
1153
1160
  self->data = FRT_ALLOC_AND_ZERO_N(FrtLazyDocFieldData, size);
1161
+ self->is_compressed = is_compressed;
1154
1162
  return self;
1155
1163
  }
1156
1164
 
@@ -1166,6 +1174,52 @@ static void lazy_df_destroy(FrtLazyDocField *self)
1166
1174
  free(self);
1167
1175
  }
1168
1176
 
1177
+ static void comp_raise()
1178
+ {
1179
+ FRT_RAISE(EXCEPTION, "Compression error");
1180
+ }
1181
+
1182
+ static char *is_read_compressed_bytes(FrtInStream *is, int compressed_len, int *len)
1183
+ {
1184
+ int buf_out_idx = 0;
1185
+ int read_len;
1186
+ frt_uchar buf_in[COMPRESSION_BUFFER_SIZE];
1187
+ const frt_uchar *next_in;
1188
+ size_t available_in;
1189
+ frt_uchar *buf_out = NULL;
1190
+ frt_uchar *next_out;
1191
+ size_t available_out;
1192
+
1193
+ BrotliDecoderState *b_state = BrotliDecoderCreateInstance(NULL, NULL, NULL);
1194
+ BrotliDecoderResult b_result = BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT;
1195
+ if (!b_state) { comp_raise(); return NULL; }
1196
+
1197
+ do {
1198
+ read_len = compressed_len > COMPRESSION_BUFFER_SIZE ? COMPRESSION_BUFFER_SIZE : compressed_len;
1199
+ frt_is_read_bytes(is, buf_in, read_len);
1200
+ compressed_len -= read_len;
1201
+ available_in = read_len;
1202
+ next_in = buf_in;
1203
+ available_out = COMPRESSION_BUFFER_SIZE;
1204
+ do {
1205
+ FRT_REALLOC_N(buf_out, frt_uchar, buf_out_idx + COMPRESSION_BUFFER_SIZE);
1206
+ next_out = buf_out + buf_out_idx;
1207
+ b_result = BrotliDecoderDecompressStream(b_state,
1208
+ &available_in, &next_in,
1209
+ &available_out, &next_out, NULL);
1210
+ if (b_result == BROTLI_DECODER_RESULT_ERROR) { comp_raise(); return NULL; }
1211
+ buf_out_idx += COMPRESSION_BUFFER_SIZE - available_out;
1212
+ } while (b_result == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT);
1213
+ } while (b_result != BROTLI_DECODER_RESULT_SUCCESS && compressed_len > 0);
1214
+
1215
+ BrotliDecoderDestroyInstance(b_state);
1216
+
1217
+ FRT_REALLOC_N(buf_out, frt_uchar, buf_out_idx + 1);
1218
+ buf_out[buf_out_idx] = '\0';
1219
+ *len = buf_out_idx;
1220
+ return (char *)buf_out;
1221
+ }
1222
+
1169
1223
  char *frt_lazy_df_get_data(FrtLazyDocField *self, int i)
1170
1224
  {
1171
1225
  char *text = NULL;
@@ -1174,9 +1228,13 @@ char *frt_lazy_df_get_data(FrtLazyDocField *self, int i)
1174
1228
  if (NULL == text) {
1175
1229
  const int read_len = self->data[i].length + 1;
1176
1230
  frt_is_seek(self->doc->fields_in, self->data[i].start);
1177
- self->data[i].text = text = FRT_ALLOC_N(char, read_len);
1178
- frt_is_read_bytes(self->doc->fields_in, (frt_uchar *)text, read_len);
1179
- text[read_len - 1] = '\0';
1231
+ if (self->is_compressed) {
1232
+ self->data[i].text = text = is_read_compressed_bytes(self->doc->fields_in, read_len, &(self->data[i].length));
1233
+ } else {
1234
+ self->data[i].text = text = FRT_ALLOC_N(char, read_len);
1235
+ frt_is_read_bytes(self->doc->fields_in, (frt_uchar *)text, read_len);
1236
+ text[read_len - 1] = '\0';
1237
+ }
1180
1238
  }
1181
1239
  }
1182
1240
 
@@ -1185,6 +1243,16 @@ char *frt_lazy_df_get_data(FrtLazyDocField *self, int i)
1185
1243
 
1186
1244
  void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len)
1187
1245
  {
1246
+ if (self->is_compressed == 1) {
1247
+ int i;
1248
+ self->len = 0;
1249
+ for (i = self->size-1; i >= 0; i--) {
1250
+ (void)frt_lazy_df_get_data(self, i);
1251
+ self->len += self->data[i].length + 1;
1252
+ }
1253
+ self->len--; /* each field separated by ' ' but no need to add to end */
1254
+ self->is_compressed = 2;
1255
+ }
1188
1256
  if (start < 0 || start >= self->len) {
1189
1257
  FRT_RAISE(FRT_IO_ERROR, "start out of range in LazyDocField#get_bytes. %d "
1190
1258
  "is not between 0 and %d", start, self->len);
@@ -1196,7 +1264,33 @@ void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len)
1196
1264
  FRT_RAISE(FRT_IO_ERROR, "Tried to read past end of field. Field is only %d "
1197
1265
  "bytes long but tried to read to %d", self->len, start + len);
1198
1266
  }
1199
- else {
1267
+ if (self->is_compressed) {
1268
+ int cur_start = 0, buf_start = 0, cur_end, i, copy_start, copy_len;
1269
+ for (i = 0; i < self->size; i++) {
1270
+ cur_end = cur_start + self->data[i].length;
1271
+ if (start < cur_end) {
1272
+ copy_start = start > cur_start ? start - cur_start : 0;
1273
+ copy_len = cur_end - cur_start - copy_start;
1274
+ if (copy_len >= len) {
1275
+ copy_len = len;
1276
+ len = 0;
1277
+ }
1278
+ else {
1279
+ len -= copy_len;
1280
+ }
1281
+ memcpy(buf + buf_start,
1282
+ self->data[i].text + copy_start,
1283
+ copy_len);
1284
+ buf_start += copy_len;
1285
+ if (len > 0) {
1286
+ buf[buf_start++] = ' ';
1287
+ len--;
1288
+ }
1289
+ if (len == 0) break;
1290
+ }
1291
+ cur_start = cur_end + 1;
1292
+ }
1293
+ } else {
1200
1294
  frt_is_seek(self->doc->fields_in, self->data[0].start + start);
1201
1295
  frt_is_read_bytes(self->doc->fields_in, (frt_uchar *)buf, len);
1202
1296
  }
@@ -1286,7 +1380,7 @@ void frt_fr_close(FrtFieldsReader *fr)
1286
1380
  free(fr);
1287
1381
  }
1288
1382
 
1289
- static FrtDocField *frt_fr_df_new(FrtSymbol name, int size)
1383
+ static FrtDocField *frt_fr_df_new(FrtSymbol name, int size, bool is_compressed)
1290
1384
  {
1291
1385
  FrtDocField *df = FRT_ALLOC(FrtDocField);
1292
1386
  df->name = name;
@@ -1295,9 +1389,22 @@ static FrtDocField *frt_fr_df_new(FrtSymbol name, int size)
1295
1389
  df->lengths = FRT_ALLOC_N(int, df->capa);
1296
1390
  df->destroy_data = true;
1297
1391
  df->boost = 1.0f;
1392
+ df->is_compressed = is_compressed;
1298
1393
  return df;
1299
1394
  }
1300
1395
 
1396
+ static void frt_fr_read_compressed_fields(FrtFieldsReader *fr, FrtDocField *df)
1397
+ {
1398
+ int i;
1399
+ const int df_size = df->size;
1400
+ FrtInStream *fdt_in = fr->fdt_in;
1401
+
1402
+ for (i = 0; i < df_size; i++) {
1403
+ const int compressed_len = df->lengths[i] + 1;
1404
+ df->data[i] = is_read_compressed_bytes(fdt_in, compressed_len, &(df->lengths[i]));
1405
+ }
1406
+ }
1407
+
1301
1408
  FrtDocument *frt_fr_get_doc(FrtFieldsReader *fr, int doc_num)
1302
1409
  {
1303
1410
  int i, j;
@@ -1316,7 +1423,7 @@ FrtDocument *frt_fr_get_doc(FrtFieldsReader *fr, int doc_num)
1316
1423
  const int field_num = frt_is_read_vint(fdt_in);
1317
1424
  FrtFieldInfo *fi = fr->fis->fields[field_num];
1318
1425
  const int df_size = frt_is_read_vint(fdt_in);
1319
- FrtDocField *df = frt_fr_df_new(fi->name, df_size);
1426
+ FrtDocField *df = frt_fr_df_new(fi->name, df_size, fi_is_compressed(fi));
1320
1427
 
1321
1428
  for (j = 0; j < df_size; j++) {
1322
1429
  df->lengths[j] = frt_is_read_vint(fdt_in);
@@ -1326,12 +1433,16 @@ FrtDocument *frt_fr_get_doc(FrtFieldsReader *fr, int doc_num)
1326
1433
  }
1327
1434
  for (i = 0; i < stored_cnt; i++) {
1328
1435
  FrtDocField *df = doc->fields[i];
1329
- const int df_size = df->size;
1330
- for (j = 0; j < df_size; j++) {
1331
- const int read_len = df->lengths[j] + 1;
1332
- df->data[j] = FRT_ALLOC_N(char, read_len);
1333
- frt_is_read_bytes(fdt_in, (frt_uchar *)df->data[j], read_len);
1334
- df->data[j][read_len - 1] = '\0';
1436
+ if (df->is_compressed) {
1437
+ frt_fr_read_compressed_fields(fr, df);
1438
+ } else {
1439
+ const int df_size = df->size;
1440
+ for (j = 0; j < df_size; j++) {
1441
+ const int read_len = df->lengths[j] + 1;
1442
+ df->data[j] = FRT_ALLOC_N(char, read_len);
1443
+ frt_is_read_bytes(fdt_in, (frt_uchar *)df->data[j], read_len);
1444
+ df->data[j][read_len - 1] = '\0';
1445
+ }
1335
1446
  }
1336
1447
  }
1337
1448
 
@@ -1355,7 +1466,7 @@ FrtLazyDoc *frt_fr_get_lazy_doc(FrtFieldsReader *fr, int doc_num)
1355
1466
  for (i = 0; i < stored_cnt; i++) {
1356
1467
  FrtFieldInfo *fi = fr->fis->fields[frt_is_read_vint(fdt_in)];
1357
1468
  const int data_cnt = frt_is_read_vint(fdt_in);
1358
- FrtLazyDocField *lazy_df = lazy_df_new(fi->name, data_cnt);
1469
+ FrtLazyDocField *lazy_df = lazy_df_new(fi->name, data_cnt, fi_is_compressed(fi));
1359
1470
  const int field_start = start;
1360
1471
  /* get the starts relative positions this time around */
1361
1472
  for (j = 0; j < data_cnt; j++) {
@@ -1549,6 +1660,37 @@ void frt_fw_close(FrtFieldsWriter *fw)
1549
1660
  free(fw);
1550
1661
  }
1551
1662
 
1663
+ static int frt_os_write_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length)
1664
+ {
1665
+ size_t compressed_len = 0;
1666
+ const frt_uchar *next_in = data;
1667
+ size_t available_in = length;
1668
+ size_t available_out;
1669
+ frt_uchar compression_buffer[COMPRESSION_BUFFER_SIZE];
1670
+ frt_uchar *next_out;
1671
+ BrotliEncoderState *b_state = BrotliEncoderCreateInstance(NULL, NULL, NULL);
1672
+ if (!b_state) { comp_raise(); return -1; }
1673
+
1674
+ BrotliEncoderSetParameter(b_state, BROTLI_PARAM_QUALITY, COMPRESSION_LEVEL);
1675
+
1676
+ do {
1677
+ available_out = COMPRESSION_BUFFER_SIZE;
1678
+ next_out = compression_buffer;
1679
+ if (!BrotliEncoderCompressStream(b_state, BROTLI_OPERATION_FINISH,
1680
+ &available_in, &next_in,
1681
+ &available_out, &next_out, &compressed_len)) {
1682
+ BrotliEncoderDestroyInstance(b_state);
1683
+ comp_raise();
1684
+ return -1;
1685
+ }
1686
+ frt_os_write_bytes(out_stream, compression_buffer, COMPRESSION_BUFFER_SIZE - available_out);
1687
+ } while (!BrotliEncoderIsFinished(b_state));
1688
+
1689
+ BrotliEncoderDestroyInstance(b_state);
1690
+ // fprintf(stderr, "Compressed: %i -> %i\n", length, (int)compressed_len);
1691
+ return (int)compressed_len;
1692
+ }
1693
+
1552
1694
  void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc)
1553
1695
  {
1554
1696
  int i, j, stored_cnt = 0;
@@ -1577,13 +1719,20 @@ void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc)
1577
1719
  const int df_size = df->size;
1578
1720
  frt_os_write_vint(fdt_out, fi->number);
1579
1721
  frt_os_write_vint(fdt_out, df_size);
1580
- for (j = 0; j < df_size; j++) {
1581
- const int length = df->lengths[j];
1582
- frt_os_write_vint(fdt_out, length);
1583
- frt_os_write_bytes(fw->buffer, (frt_uchar*)df->data[j], length);
1584
- /* leave a space between fields as that is how they are
1585
- * analyzed */
1586
- frt_os_write_byte(fw->buffer, ' ');
1722
+ if (fi_is_compressed(fi)) {
1723
+ for (j = 0; j < df_size; j++) {
1724
+ const int length = df->lengths[j];
1725
+ int compressed_len = frt_os_write_compressed_bytes(fw->buffer, (frt_uchar*)df->data[j], length);
1726
+ frt_os_write_vint(fdt_out, compressed_len - 1);
1727
+ }
1728
+ } else {
1729
+ for (j = 0; j < df_size; j++) {
1730
+ const int length = df->lengths[j];
1731
+ frt_os_write_vint(fdt_out, length);
1732
+ frt_os_write_bytes(fw->buffer, (frt_uchar*)df->data[j], length);
1733
+ /* leave a space between fields as that is how they are analyzed */
1734
+ frt_os_write_byte(fw->buffer, ' ');
1735
+ }
1587
1736
  }
1588
1737
  }
1589
1738
  }
@@ -66,7 +66,8 @@ extern FrtHash *frt_co_hash_create();
66
66
  typedef enum
67
67
  {
68
68
  FRT_STORE_NO = 0,
69
- FRT_STORE_YES = 1
69
+ FRT_STORE_YES = 1,
70
+ FRT_STORE_COMPRESS = 2
70
71
  } FrtStoreValue;
71
72
 
72
73
  typedef enum
@@ -88,6 +89,7 @@ typedef enum
88
89
  } FrtTermVectorValue;
89
90
 
90
91
  #define FRT_FI_IS_STORED_BM 0x001
92
+ #define FRT_FI_IS_COMPRESSED_BM 0x002
91
93
  #define FRT_FI_IS_INDEXED_BM 0x004
92
94
  #define FRT_FI_IS_TOKENIZED_BM 0x008
93
95
  #define FRT_FI_OMIT_NORMS_BM 0x010
@@ -112,6 +114,7 @@ extern char *frt_fi_to_s(FrtFieldInfo *fi);
112
114
  extern void frt_fi_deref(FrtFieldInfo *fi);
113
115
 
114
116
  #define fi_is_stored(fi) (((fi)->bits & FRT_FI_IS_STORED_BM) != 0)
117
+ #define fi_is_compressed(fi) (((fi)->bits & FRT_FI_IS_COMPRESSED_BM) != 0)
115
118
  #define fi_is_indexed(fi) (((fi)->bits & FRT_FI_IS_INDEXED_BM) != 0)
116
119
  #define fi_is_tokenized(fi) (((fi)->bits & FRT_FI_IS_TOKENIZED_BM) != 0)
117
120
  #define fi_omit_norms(fi) (((fi)->bits & FRT_FI_OMIT_NORMS_BM) != 0)
@@ -575,11 +578,11 @@ typedef struct FrtLazyDocField
575
578
  FrtLazyDoc *doc;
576
579
  int size; /* number of data elements */
577
580
  int len; /* length of data elements concatenated */
581
+ int is_compressed : 2; /* set to 2 after all data is loaded */
578
582
  } FrtLazyDocField;
579
583
 
580
584
  extern char *frt_lazy_df_get_data(FrtLazyDocField *self, int i);
581
- extern void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf,
582
- int start, int len);
585
+ extern void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len);
583
586
 
584
587
  /* * * FrtLazyDoc * * */
585
588
  struct FrtLazyDoc
@@ -236,7 +236,11 @@ Metrowerks:
236
236
  #endif
237
237
 
238
238
  #if defined __GNUC__
239
- # define POSH_COMPILER_STRING "Gnu GCC"
239
+ # if defined __MINGW32__
240
+ # define POSH_COMPILER_STRING "MingW Gnu GCC"
241
+ # else
242
+ # define POSH_COMPILER_STRING "Gnu GCC"
243
+ # endif
240
244
  # define POSH_COMPILER_GCC 1
241
245
  #endif
242
246
 
@@ -307,9 +311,13 @@ Metrowerks:
307
311
 
308
312
  #if defined __MINGW32__
309
313
  # define POSH_OS_MINGW 1
310
- # define POSH_OS_STRING "MinGW"
311
314
  # if defined _WIN64
312
315
  # define POSH_OS_WIN64 1
316
+ # define POSH_OS_STRING "Win64"
317
+ # elif defined _WIN32
318
+ # define POSH_OS_STRING "Win32"
319
+ # else
320
+ # define POSH_OS_STRING "MinGW"
313
321
  # endif
314
322
  #endif
315
323
 
@@ -474,7 +482,7 @@ Metrowerks:
474
482
  # define POSH_CPU_SPARC 1
475
483
  #endif
476
484
 
477
- #if defined ARM || defined __arm__ || defined _ARM || __aarch64__
485
+ #if defined ARM || defined __arm__ || defined _ARM || defined __aarch64__
478
486
  # define POSH_CPU_STRONGARM 1
479
487
  # define POSH_CPU_STRING "ARM"
480
488
  #endif
@@ -690,16 +698,6 @@ typedef unsigned long long posh_u64_t;
690
698
  # define POSH_I64_PRINTF_PREFIX "ll"
691
699
  #endif
692
700
 
693
- /* hack */
694
- #ifdef __MINGW32__
695
- #undef POSH_I64
696
- #undef POSH_U64
697
- #undef POSH_I64_PRINTF_PREFIX
698
- #define POSH_I64( x ) ((posh_i64_t)x)
699
- #define POSH_U64( x ) ((posh_u64_t)x)
700
- #define POSH_I64_PRINTF_PREFIX "I64"
701
- #endif
702
-
703
701
  /** Minimum value for a 64-bit signed integer */
704
702
  #define POSH_I64_MIN POSH_I64(0x8000000000000000)
705
703
  /** Maximum value for a 64-bit signed integer */
@@ -965,9 +963,3 @@ extern posh_i64_t POSH_ReadI64FromBig( const void *src );
965
963
  # endif /* POSH_64BIT_INTEGER */
966
964
 
967
965
  #endif
968
-
969
- #ifdef __cplusplus
970
- }
971
- #endif
972
-
973
-