isomorfeus-ferret 0.12.4 → 0.12.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +612 -612
- data/README.md +77 -48
- data/ext/isomorfeus_ferret_ext/bm_hash.c +9 -6
- data/ext/isomorfeus_ferret_ext/bm_micro_string.c +4 -2
- data/ext/isomorfeus_ferret_ext/brotli_common_constants.c +15 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_constants.h +200 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_context.c +156 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_context.h +113 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +5914 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.h +64 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_platform.c +22 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_platform.h +594 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_transform.c +291 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_transform.h +85 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_version.h +26 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_bit_reader.c +76 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_bit_reader.h +351 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +2608 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_huffman.c +339 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_huffman.h +121 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_prefix.h +732 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_state.c +159 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_state.h +365 -0
- data/ext/isomorfeus_ferret_ext/brotli_decode.h +344 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references.c +145 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references.h +39 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_hq.c +843 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_hq.h +95 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_inc.h +163 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost.c +35 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost.h +63 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost_inc.h +127 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_block_encoder_inc.h +34 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter.c +194 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter.h +51 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter_inc.h +440 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_brotli_bit_stream.c +1314 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_brotli_bit_stream.h +84 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_cluster.c +56 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_cluster.h +48 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_cluster_inc.h +320 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_command.c +28 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_command.h +190 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment.c +790 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment.h +61 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment_two_pass.c +645 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment_two_pass.h +54 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_dictionary_hash.c +1846 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_dictionary_hash.h +25 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_encode.c +1927 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_encoder_dict.c +33 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_encoder_dict.h +43 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode.c +503 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode.h +122 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode_static.h +539 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_fast_log.c +105 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_fast_log.h +66 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_find_match_length.h +79 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash.h +488 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_composite_inc.h +125 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_forgetful_chain_inc.h +293 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match64_inc.h +267 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match_inc.h +262 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match_quickly_inc.h +266 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_rolling_inc.h +212 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_to_binary_tree_inc.h +329 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_histogram.c +100 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_histogram.h +63 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_histogram_inc.h +51 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_literal_cost.c +175 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_literal_cost.h +30 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_memory.c +170 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_memory.h +114 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_metablock.c +663 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_metablock.h +105 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_metablock_inc.h +183 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_params.h +46 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_prefix.h +53 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_quality.h +165 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_ringbuffer.h +167 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict.c +486 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict.h +40 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict_lut.h +5864 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_utf8_util.c +85 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_utf8_util.h +32 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_write_bits.h +87 -0
- data/ext/isomorfeus_ferret_ext/brotli_encode.h +448 -0
- data/ext/isomorfeus_ferret_ext/brotli_port.h +288 -0
- data/ext/isomorfeus_ferret_ext/brotli_types.h +83 -0
- data/ext/isomorfeus_ferret_ext/frb_index.c +35 -4
- data/ext/isomorfeus_ferret_ext/frb_store.c +34 -5
- data/ext/isomorfeus_ferret_ext/frt_document.h +1 -0
- data/ext/isomorfeus_ferret_ext/frt_fs_store.c +1 -0
- data/ext/isomorfeus_ferret_ext/frt_index.c +174 -25
- data/ext/isomorfeus_ferret_ext/frt_index.h +6 -3
- data/ext/isomorfeus_ferret_ext/frt_posh.h +11 -19
- data/ext/isomorfeus_ferret_ext/frt_q_parser.c +1844 -1911
- data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +12 -15
- data/ext/isomorfeus_ferret_ext/frt_ram_store.c +1 -0
- data/ext/isomorfeus_ferret_ext/frt_scanner.c +1 -0
- data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +1 -0
- data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +1 -0
- data/ext/isomorfeus_ferret_ext/frt_search.h +1 -1
- data/ext/isomorfeus_ferret_ext/libstemmer.c +14 -11
- data/ext/isomorfeus_ferret_ext/libstemmer.h +4 -9
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +1167 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +1433 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +120 -143
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +1 -2
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +217 -237
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +377 -432
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +298 -342
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +1 -2
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +530 -524
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +201 -214
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +394 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +457 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +396 -439
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +104 -128
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +242 -273
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +406 -461
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +1 -2
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +405 -456
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +108 -126
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +849 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +373 -405
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +288 -305
- data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_arabic.c +1651 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_arabic.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_armenian.c +546 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_armenian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_basque.c +1171 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_basque.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_catalan.c +1436 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_catalan.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.c +121 -141
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.c +221 -241
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.c +381 -431
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.c +300 -345
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.c +518 -511
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.c +201 -209
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_greek.c +3660 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_greek.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_hindi.c +309 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_hindi.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.c +306 -671
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_indonesian.c +394 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_indonesian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_irish.c +457 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_irish.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.c +400 -442
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_lithuanian.c +824 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_lithuanian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_nepali.c +408 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_nepali.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.c +105 -127
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.c +245 -276
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.c +409 -464
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.c +376 -408
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.c +272 -287
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_serbian.c +6530 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_serbian.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.c +407 -458
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.c +110 -125
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_tamil.c +1865 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_tamil.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.c +698 -806
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.h +1 -1
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_yiddish.c +1220 -0
- data/ext/isomorfeus_ferret_ext/stem_UTF_8_yiddish.h +6 -0
- data/ext/isomorfeus_ferret_ext/stem_api.c +1 -9
- data/ext/isomorfeus_ferret_ext/stem_api.h +1 -3
- data/ext/isomorfeus_ferret_ext/stem_header.h +30 -26
- data/ext/isomorfeus_ferret_ext/stem_modules.h +113 -26
- data/ext/isomorfeus_ferret_ext/stem_modules.txt +18 -5
- data/ext/isomorfeus_ferret_ext/stem_utilities.c +167 -132
- data/ext/isomorfeus_ferret_ext/test.c +7 -1
- data/ext/isomorfeus_ferret_ext/test_fields.c +57 -45
- data/ext/isomorfeus_ferret_ext/test_index.c +4 -1
- data/ext/isomorfeus_ferret_ext/test_search.c +0 -1
- data/lib/isomorfeus/ferret/version.rb +1 -1
- metadata +125 -5
- data/ext/isomorfeus_ferret_ext/q_parser.y +0 -1366
@@ -6,6 +6,8 @@
|
|
6
6
|
#include <string.h>
|
7
7
|
#include <limits.h>
|
8
8
|
#include <ctype.h>
|
9
|
+
#include "brotli_decode.h"
|
10
|
+
#include "brotli_encode.h"
|
9
11
|
|
10
12
|
extern void frt_micro_sleep(const int micro_seconds);
|
11
13
|
|
@@ -39,8 +41,8 @@ static char *ste_next(FrtTermEnum *te);
|
|
39
41
|
#define FORMAT 0
|
40
42
|
#define SEGMENTS_GEN_FILE_NAME "segments"
|
41
43
|
#define MAX_EXT_LEN 10
|
42
|
-
#define
|
43
|
-
#define
|
44
|
+
#define COMPRESSION_BUFFER_SIZE 16348
|
45
|
+
#define COMPRESSION_LEVEL 9
|
44
46
|
|
45
47
|
/* *** Must be three characters *** */
|
46
48
|
static const char *INDEX_EXTENSIONS[] = {
|
@@ -220,6 +222,9 @@ static void fi_set_store(FrtFieldInfo *fi, int store)
|
|
220
222
|
case FRT_STORE_YES:
|
221
223
|
fi->bits |= FRT_FI_IS_STORED_BM;
|
222
224
|
break;
|
225
|
+
case FRT_STORE_COMPRESS:
|
226
|
+
fi->bits |= FRT_FI_IS_COMPRESSED_BM | FRT_FI_IS_STORED_BM;
|
227
|
+
break;
|
223
228
|
}
|
224
229
|
}
|
225
230
|
|
@@ -304,8 +309,9 @@ char *frt_fi_to_s(FrtFieldInfo *fi)
|
|
304
309
|
const char *fi_name = rb_id2name(fi->name);
|
305
310
|
char *str = FRT_ALLOC_N(char, strlen(fi_name) + 200);
|
306
311
|
char *s = str;
|
307
|
-
s += sprintf(str, "[\"%s\":(%s%s%s%s%s%s%s", fi_name,
|
312
|
+
s += sprintf(str, "[\"%s\":(%s%s%s%s%s%s%s%s", fi_name,
|
308
313
|
fi_is_stored(fi) ? "is_stored, " : "",
|
314
|
+
fi_is_compressed(fi) ? "is_compressed, " : "",
|
309
315
|
fi_is_indexed(fi) ? "is_indexed, " : "",
|
310
316
|
fi_is_tokenized(fi) ? "is_tokenized, " : "",
|
311
317
|
fi_omit_norms(fi) ? "omit_norms, " : "",
|
@@ -443,7 +449,8 @@ void frt_fis_write(FrtFieldInfos *fis, FrtOutStream *os)
|
|
443
449
|
static const char *store_str[] = {
|
444
450
|
":no",
|
445
451
|
":yes",
|
446
|
-
""
|
452
|
+
"",
|
453
|
+
":compressed"
|
447
454
|
};
|
448
455
|
|
449
456
|
static const char *fi_store_str(FrtFieldInfo *fi)
|
@@ -1145,12 +1152,13 @@ frt_u64 frt_sis_read_current_version(FrtStore *store)
|
|
1145
1152
|
*
|
1146
1153
|
****************************************************************************/
|
1147
1154
|
|
1148
|
-
static FrtLazyDocField *lazy_df_new(FrtSymbol name, const int size)
|
1155
|
+
static FrtLazyDocField *lazy_df_new(FrtSymbol name, const int size, bool is_compressed)
|
1149
1156
|
{
|
1150
1157
|
FrtLazyDocField *self = FRT_ALLOC(FrtLazyDocField);
|
1151
1158
|
self->name = name;
|
1152
1159
|
self->size = size;
|
1153
1160
|
self->data = FRT_ALLOC_AND_ZERO_N(FrtLazyDocFieldData, size);
|
1161
|
+
self->is_compressed = is_compressed;
|
1154
1162
|
return self;
|
1155
1163
|
}
|
1156
1164
|
|
@@ -1166,6 +1174,52 @@ static void lazy_df_destroy(FrtLazyDocField *self)
|
|
1166
1174
|
free(self);
|
1167
1175
|
}
|
1168
1176
|
|
1177
|
+
static void comp_raise()
|
1178
|
+
{
|
1179
|
+
FRT_RAISE(EXCEPTION, "Compression error");
|
1180
|
+
}
|
1181
|
+
|
1182
|
+
static char *is_read_compressed_bytes(FrtInStream *is, int compressed_len, int *len)
|
1183
|
+
{
|
1184
|
+
int buf_out_idx = 0;
|
1185
|
+
int read_len;
|
1186
|
+
frt_uchar buf_in[COMPRESSION_BUFFER_SIZE];
|
1187
|
+
const frt_uchar *next_in;
|
1188
|
+
size_t available_in;
|
1189
|
+
frt_uchar *buf_out = NULL;
|
1190
|
+
frt_uchar *next_out;
|
1191
|
+
size_t available_out;
|
1192
|
+
|
1193
|
+
BrotliDecoderState *b_state = BrotliDecoderCreateInstance(NULL, NULL, NULL);
|
1194
|
+
BrotliDecoderResult b_result = BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT;
|
1195
|
+
if (!b_state) { comp_raise(); return NULL; }
|
1196
|
+
|
1197
|
+
do {
|
1198
|
+
read_len = compressed_len > COMPRESSION_BUFFER_SIZE ? COMPRESSION_BUFFER_SIZE : compressed_len;
|
1199
|
+
frt_is_read_bytes(is, buf_in, read_len);
|
1200
|
+
compressed_len -= read_len;
|
1201
|
+
available_in = read_len;
|
1202
|
+
next_in = buf_in;
|
1203
|
+
available_out = COMPRESSION_BUFFER_SIZE;
|
1204
|
+
do {
|
1205
|
+
FRT_REALLOC_N(buf_out, frt_uchar, buf_out_idx + COMPRESSION_BUFFER_SIZE);
|
1206
|
+
next_out = buf_out + buf_out_idx;
|
1207
|
+
b_result = BrotliDecoderDecompressStream(b_state,
|
1208
|
+
&available_in, &next_in,
|
1209
|
+
&available_out, &next_out, NULL);
|
1210
|
+
if (b_result == BROTLI_DECODER_RESULT_ERROR) { comp_raise(); return NULL; }
|
1211
|
+
buf_out_idx += COMPRESSION_BUFFER_SIZE - available_out;
|
1212
|
+
} while (b_result == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT);
|
1213
|
+
} while (b_result != BROTLI_DECODER_RESULT_SUCCESS && compressed_len > 0);
|
1214
|
+
|
1215
|
+
BrotliDecoderDestroyInstance(b_state);
|
1216
|
+
|
1217
|
+
FRT_REALLOC_N(buf_out, frt_uchar, buf_out_idx + 1);
|
1218
|
+
buf_out[buf_out_idx] = '\0';
|
1219
|
+
*len = buf_out_idx;
|
1220
|
+
return (char *)buf_out;
|
1221
|
+
}
|
1222
|
+
|
1169
1223
|
char *frt_lazy_df_get_data(FrtLazyDocField *self, int i)
|
1170
1224
|
{
|
1171
1225
|
char *text = NULL;
|
@@ -1174,9 +1228,13 @@ char *frt_lazy_df_get_data(FrtLazyDocField *self, int i)
|
|
1174
1228
|
if (NULL == text) {
|
1175
1229
|
const int read_len = self->data[i].length + 1;
|
1176
1230
|
frt_is_seek(self->doc->fields_in, self->data[i].start);
|
1177
|
-
self->
|
1178
|
-
|
1179
|
-
|
1231
|
+
if (self->is_compressed) {
|
1232
|
+
self->data[i].text = text = is_read_compressed_bytes(self->doc->fields_in, read_len, &(self->data[i].length));
|
1233
|
+
} else {
|
1234
|
+
self->data[i].text = text = FRT_ALLOC_N(char, read_len);
|
1235
|
+
frt_is_read_bytes(self->doc->fields_in, (frt_uchar *)text, read_len);
|
1236
|
+
text[read_len - 1] = '\0';
|
1237
|
+
}
|
1180
1238
|
}
|
1181
1239
|
}
|
1182
1240
|
|
@@ -1185,6 +1243,16 @@ char *frt_lazy_df_get_data(FrtLazyDocField *self, int i)
|
|
1185
1243
|
|
1186
1244
|
void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len)
|
1187
1245
|
{
|
1246
|
+
if (self->is_compressed == 1) {
|
1247
|
+
int i;
|
1248
|
+
self->len = 0;
|
1249
|
+
for (i = self->size-1; i >= 0; i--) {
|
1250
|
+
(void)frt_lazy_df_get_data(self, i);
|
1251
|
+
self->len += self->data[i].length + 1;
|
1252
|
+
}
|
1253
|
+
self->len--; /* each field separated by ' ' but no need to add to end */
|
1254
|
+
self->is_compressed = 2;
|
1255
|
+
}
|
1188
1256
|
if (start < 0 || start >= self->len) {
|
1189
1257
|
FRT_RAISE(FRT_IO_ERROR, "start out of range in LazyDocField#get_bytes. %d "
|
1190
1258
|
"is not between 0 and %d", start, self->len);
|
@@ -1196,7 +1264,33 @@ void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len)
|
|
1196
1264
|
FRT_RAISE(FRT_IO_ERROR, "Tried to read past end of field. Field is only %d "
|
1197
1265
|
"bytes long but tried to read to %d", self->len, start + len);
|
1198
1266
|
}
|
1199
|
-
|
1267
|
+
if (self->is_compressed) {
|
1268
|
+
int cur_start = 0, buf_start = 0, cur_end, i, copy_start, copy_len;
|
1269
|
+
for (i = 0; i < self->size; i++) {
|
1270
|
+
cur_end = cur_start + self->data[i].length;
|
1271
|
+
if (start < cur_end) {
|
1272
|
+
copy_start = start > cur_start ? start - cur_start : 0;
|
1273
|
+
copy_len = cur_end - cur_start - copy_start;
|
1274
|
+
if (copy_len >= len) {
|
1275
|
+
copy_len = len;
|
1276
|
+
len = 0;
|
1277
|
+
}
|
1278
|
+
else {
|
1279
|
+
len -= copy_len;
|
1280
|
+
}
|
1281
|
+
memcpy(buf + buf_start,
|
1282
|
+
self->data[i].text + copy_start,
|
1283
|
+
copy_len);
|
1284
|
+
buf_start += copy_len;
|
1285
|
+
if (len > 0) {
|
1286
|
+
buf[buf_start++] = ' ';
|
1287
|
+
len--;
|
1288
|
+
}
|
1289
|
+
if (len == 0) break;
|
1290
|
+
}
|
1291
|
+
cur_start = cur_end + 1;
|
1292
|
+
}
|
1293
|
+
} else {
|
1200
1294
|
frt_is_seek(self->doc->fields_in, self->data[0].start + start);
|
1201
1295
|
frt_is_read_bytes(self->doc->fields_in, (frt_uchar *)buf, len);
|
1202
1296
|
}
|
@@ -1286,7 +1380,7 @@ void frt_fr_close(FrtFieldsReader *fr)
|
|
1286
1380
|
free(fr);
|
1287
1381
|
}
|
1288
1382
|
|
1289
|
-
static FrtDocField *frt_fr_df_new(FrtSymbol name, int size)
|
1383
|
+
static FrtDocField *frt_fr_df_new(FrtSymbol name, int size, bool is_compressed)
|
1290
1384
|
{
|
1291
1385
|
FrtDocField *df = FRT_ALLOC(FrtDocField);
|
1292
1386
|
df->name = name;
|
@@ -1295,9 +1389,22 @@ static FrtDocField *frt_fr_df_new(FrtSymbol name, int size)
|
|
1295
1389
|
df->lengths = FRT_ALLOC_N(int, df->capa);
|
1296
1390
|
df->destroy_data = true;
|
1297
1391
|
df->boost = 1.0f;
|
1392
|
+
df->is_compressed = is_compressed;
|
1298
1393
|
return df;
|
1299
1394
|
}
|
1300
1395
|
|
1396
|
+
static void frt_fr_read_compressed_fields(FrtFieldsReader *fr, FrtDocField *df)
|
1397
|
+
{
|
1398
|
+
int i;
|
1399
|
+
const int df_size = df->size;
|
1400
|
+
FrtInStream *fdt_in = fr->fdt_in;
|
1401
|
+
|
1402
|
+
for (i = 0; i < df_size; i++) {
|
1403
|
+
const int compressed_len = df->lengths[i] + 1;
|
1404
|
+
df->data[i] = is_read_compressed_bytes(fdt_in, compressed_len, &(df->lengths[i]));
|
1405
|
+
}
|
1406
|
+
}
|
1407
|
+
|
1301
1408
|
FrtDocument *frt_fr_get_doc(FrtFieldsReader *fr, int doc_num)
|
1302
1409
|
{
|
1303
1410
|
int i, j;
|
@@ -1316,7 +1423,7 @@ FrtDocument *frt_fr_get_doc(FrtFieldsReader *fr, int doc_num)
|
|
1316
1423
|
const int field_num = frt_is_read_vint(fdt_in);
|
1317
1424
|
FrtFieldInfo *fi = fr->fis->fields[field_num];
|
1318
1425
|
const int df_size = frt_is_read_vint(fdt_in);
|
1319
|
-
FrtDocField *df = frt_fr_df_new(fi->name, df_size);
|
1426
|
+
FrtDocField *df = frt_fr_df_new(fi->name, df_size, fi_is_compressed(fi));
|
1320
1427
|
|
1321
1428
|
for (j = 0; j < df_size; j++) {
|
1322
1429
|
df->lengths[j] = frt_is_read_vint(fdt_in);
|
@@ -1326,12 +1433,16 @@ FrtDocument *frt_fr_get_doc(FrtFieldsReader *fr, int doc_num)
|
|
1326
1433
|
}
|
1327
1434
|
for (i = 0; i < stored_cnt; i++) {
|
1328
1435
|
FrtDocField *df = doc->fields[i];
|
1329
|
-
|
1330
|
-
|
1331
|
-
|
1332
|
-
|
1333
|
-
|
1334
|
-
|
1436
|
+
if (df->is_compressed) {
|
1437
|
+
frt_fr_read_compressed_fields(fr, df);
|
1438
|
+
} else {
|
1439
|
+
const int df_size = df->size;
|
1440
|
+
for (j = 0; j < df_size; j++) {
|
1441
|
+
const int read_len = df->lengths[j] + 1;
|
1442
|
+
df->data[j] = FRT_ALLOC_N(char, read_len);
|
1443
|
+
frt_is_read_bytes(fdt_in, (frt_uchar *)df->data[j], read_len);
|
1444
|
+
df->data[j][read_len - 1] = '\0';
|
1445
|
+
}
|
1335
1446
|
}
|
1336
1447
|
}
|
1337
1448
|
|
@@ -1355,7 +1466,7 @@ FrtLazyDoc *frt_fr_get_lazy_doc(FrtFieldsReader *fr, int doc_num)
|
|
1355
1466
|
for (i = 0; i < stored_cnt; i++) {
|
1356
1467
|
FrtFieldInfo *fi = fr->fis->fields[frt_is_read_vint(fdt_in)];
|
1357
1468
|
const int data_cnt = frt_is_read_vint(fdt_in);
|
1358
|
-
FrtLazyDocField *lazy_df = lazy_df_new(fi->name, data_cnt);
|
1469
|
+
FrtLazyDocField *lazy_df = lazy_df_new(fi->name, data_cnt, fi_is_compressed(fi));
|
1359
1470
|
const int field_start = start;
|
1360
1471
|
/* get the starts relative positions this time around */
|
1361
1472
|
for (j = 0; j < data_cnt; j++) {
|
@@ -1549,6 +1660,37 @@ void frt_fw_close(FrtFieldsWriter *fw)
|
|
1549
1660
|
free(fw);
|
1550
1661
|
}
|
1551
1662
|
|
1663
|
+
static int frt_os_write_compressed_bytes(FrtOutStream* out_stream, frt_uchar *data, int length)
|
1664
|
+
{
|
1665
|
+
size_t compressed_len = 0;
|
1666
|
+
const frt_uchar *next_in = data;
|
1667
|
+
size_t available_in = length;
|
1668
|
+
size_t available_out;
|
1669
|
+
frt_uchar compression_buffer[COMPRESSION_BUFFER_SIZE];
|
1670
|
+
frt_uchar *next_out;
|
1671
|
+
BrotliEncoderState *b_state = BrotliEncoderCreateInstance(NULL, NULL, NULL);
|
1672
|
+
if (!b_state) { comp_raise(); return -1; }
|
1673
|
+
|
1674
|
+
BrotliEncoderSetParameter(b_state, BROTLI_PARAM_QUALITY, COMPRESSION_LEVEL);
|
1675
|
+
|
1676
|
+
do {
|
1677
|
+
available_out = COMPRESSION_BUFFER_SIZE;
|
1678
|
+
next_out = compression_buffer;
|
1679
|
+
if (!BrotliEncoderCompressStream(b_state, BROTLI_OPERATION_FINISH,
|
1680
|
+
&available_in, &next_in,
|
1681
|
+
&available_out, &next_out, &compressed_len)) {
|
1682
|
+
BrotliEncoderDestroyInstance(b_state);
|
1683
|
+
comp_raise();
|
1684
|
+
return -1;
|
1685
|
+
}
|
1686
|
+
frt_os_write_bytes(out_stream, compression_buffer, COMPRESSION_BUFFER_SIZE - available_out);
|
1687
|
+
} while (!BrotliEncoderIsFinished(b_state));
|
1688
|
+
|
1689
|
+
BrotliEncoderDestroyInstance(b_state);
|
1690
|
+
// fprintf(stderr, "Compressed: %i -> %i\n", length, (int)compressed_len);
|
1691
|
+
return (int)compressed_len;
|
1692
|
+
}
|
1693
|
+
|
1552
1694
|
void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc)
|
1553
1695
|
{
|
1554
1696
|
int i, j, stored_cnt = 0;
|
@@ -1577,13 +1719,20 @@ void frt_fw_add_doc(FrtFieldsWriter *fw, FrtDocument *doc)
|
|
1577
1719
|
const int df_size = df->size;
|
1578
1720
|
frt_os_write_vint(fdt_out, fi->number);
|
1579
1721
|
frt_os_write_vint(fdt_out, df_size);
|
1580
|
-
|
1581
|
-
|
1582
|
-
|
1583
|
-
|
1584
|
-
|
1585
|
-
|
1586
|
-
|
1722
|
+
if (fi_is_compressed(fi)) {
|
1723
|
+
for (j = 0; j < df_size; j++) {
|
1724
|
+
const int length = df->lengths[j];
|
1725
|
+
int compressed_len = frt_os_write_compressed_bytes(fw->buffer, (frt_uchar*)df->data[j], length);
|
1726
|
+
frt_os_write_vint(fdt_out, compressed_len - 1);
|
1727
|
+
}
|
1728
|
+
} else {
|
1729
|
+
for (j = 0; j < df_size; j++) {
|
1730
|
+
const int length = df->lengths[j];
|
1731
|
+
frt_os_write_vint(fdt_out, length);
|
1732
|
+
frt_os_write_bytes(fw->buffer, (frt_uchar*)df->data[j], length);
|
1733
|
+
/* leave a space between fields as that is how they are analyzed */
|
1734
|
+
frt_os_write_byte(fw->buffer, ' ');
|
1735
|
+
}
|
1587
1736
|
}
|
1588
1737
|
}
|
1589
1738
|
}
|
@@ -66,7 +66,8 @@ extern FrtHash *frt_co_hash_create();
|
|
66
66
|
typedef enum
|
67
67
|
{
|
68
68
|
FRT_STORE_NO = 0,
|
69
|
-
FRT_STORE_YES = 1
|
69
|
+
FRT_STORE_YES = 1,
|
70
|
+
FRT_STORE_COMPRESS = 2
|
70
71
|
} FrtStoreValue;
|
71
72
|
|
72
73
|
typedef enum
|
@@ -88,6 +89,7 @@ typedef enum
|
|
88
89
|
} FrtTermVectorValue;
|
89
90
|
|
90
91
|
#define FRT_FI_IS_STORED_BM 0x001
|
92
|
+
#define FRT_FI_IS_COMPRESSED_BM 0x002
|
91
93
|
#define FRT_FI_IS_INDEXED_BM 0x004
|
92
94
|
#define FRT_FI_IS_TOKENIZED_BM 0x008
|
93
95
|
#define FRT_FI_OMIT_NORMS_BM 0x010
|
@@ -112,6 +114,7 @@ extern char *frt_fi_to_s(FrtFieldInfo *fi);
|
|
112
114
|
extern void frt_fi_deref(FrtFieldInfo *fi);
|
113
115
|
|
114
116
|
#define fi_is_stored(fi) (((fi)->bits & FRT_FI_IS_STORED_BM) != 0)
|
117
|
+
#define fi_is_compressed(fi) (((fi)->bits & FRT_FI_IS_COMPRESSED_BM) != 0)
|
115
118
|
#define fi_is_indexed(fi) (((fi)->bits & FRT_FI_IS_INDEXED_BM) != 0)
|
116
119
|
#define fi_is_tokenized(fi) (((fi)->bits & FRT_FI_IS_TOKENIZED_BM) != 0)
|
117
120
|
#define fi_omit_norms(fi) (((fi)->bits & FRT_FI_OMIT_NORMS_BM) != 0)
|
@@ -575,11 +578,11 @@ typedef struct FrtLazyDocField
|
|
575
578
|
FrtLazyDoc *doc;
|
576
579
|
int size; /* number of data elements */
|
577
580
|
int len; /* length of data elements concatenated */
|
581
|
+
int is_compressed : 2; /* set to 2 after all data is loaded */
|
578
582
|
} FrtLazyDocField;
|
579
583
|
|
580
584
|
extern char *frt_lazy_df_get_data(FrtLazyDocField *self, int i);
|
581
|
-
extern void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf,
|
582
|
-
int start, int len);
|
585
|
+
extern void frt_lazy_df_get_bytes(FrtLazyDocField *self, char *buf, int start, int len);
|
583
586
|
|
584
587
|
/* * * FrtLazyDoc * * */
|
585
588
|
struct FrtLazyDoc
|
@@ -236,7 +236,11 @@ Metrowerks:
|
|
236
236
|
#endif
|
237
237
|
|
238
238
|
#if defined __GNUC__
|
239
|
-
#
|
239
|
+
# if defined __MINGW32__
|
240
|
+
# define POSH_COMPILER_STRING "MingW Gnu GCC"
|
241
|
+
# else
|
242
|
+
# define POSH_COMPILER_STRING "Gnu GCC"
|
243
|
+
# endif
|
240
244
|
# define POSH_COMPILER_GCC 1
|
241
245
|
#endif
|
242
246
|
|
@@ -307,9 +311,13 @@ Metrowerks:
|
|
307
311
|
|
308
312
|
#if defined __MINGW32__
|
309
313
|
# define POSH_OS_MINGW 1
|
310
|
-
# define POSH_OS_STRING "MinGW"
|
311
314
|
# if defined _WIN64
|
312
315
|
# define POSH_OS_WIN64 1
|
316
|
+
# define POSH_OS_STRING "Win64"
|
317
|
+
# elif defined _WIN32
|
318
|
+
# define POSH_OS_STRING "Win32"
|
319
|
+
# else
|
320
|
+
# define POSH_OS_STRING "MinGW"
|
313
321
|
# endif
|
314
322
|
#endif
|
315
323
|
|
@@ -474,7 +482,7 @@ Metrowerks:
|
|
474
482
|
# define POSH_CPU_SPARC 1
|
475
483
|
#endif
|
476
484
|
|
477
|
-
#if defined ARM || defined __arm__ || defined _ARM || __aarch64__
|
485
|
+
#if defined ARM || defined __arm__ || defined _ARM || defined __aarch64__
|
478
486
|
# define POSH_CPU_STRONGARM 1
|
479
487
|
# define POSH_CPU_STRING "ARM"
|
480
488
|
#endif
|
@@ -690,16 +698,6 @@ typedef unsigned long long posh_u64_t;
|
|
690
698
|
# define POSH_I64_PRINTF_PREFIX "ll"
|
691
699
|
#endif
|
692
700
|
|
693
|
-
/* hack */
|
694
|
-
#ifdef __MINGW32__
|
695
|
-
#undef POSH_I64
|
696
|
-
#undef POSH_U64
|
697
|
-
#undef POSH_I64_PRINTF_PREFIX
|
698
|
-
#define POSH_I64( x ) ((posh_i64_t)x)
|
699
|
-
#define POSH_U64( x ) ((posh_u64_t)x)
|
700
|
-
#define POSH_I64_PRINTF_PREFIX "I64"
|
701
|
-
#endif
|
702
|
-
|
703
701
|
/** Minimum value for a 64-bit signed integer */
|
704
702
|
#define POSH_I64_MIN POSH_I64(0x8000000000000000)
|
705
703
|
/** Maximum value for a 64-bit signed integer */
|
@@ -965,9 +963,3 @@ extern posh_i64_t POSH_ReadI64FromBig( const void *src );
|
|
965
963
|
# endif /* POSH_64BIT_INTEGER */
|
966
964
|
|
967
965
|
#endif
|
968
|
-
|
969
|
-
#ifdef __cplusplus
|
970
|
-
}
|
971
|
-
#endif
|
972
|
-
|
973
|
-
|