isomorfeus-ferret 0.12.5 → 0.13.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +101 -19
- data/README.md +54 -4
- data/ext/isomorfeus_ferret_ext/bm_bitvector.c +22 -30
- data/ext/isomorfeus_ferret_ext/bm_hash.c +6 -12
- data/ext/isomorfeus_ferret_ext/bm_micro_string.c +3 -6
- data/ext/isomorfeus_ferret_ext/bm_store.c +11 -22
- data/ext/isomorfeus_ferret_ext/brotli_common_constants.c +15 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_constants.h +200 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_context.c +156 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_context.h +113 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +5914 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.h +64 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_platform.c +22 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_platform.h +594 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_transform.c +291 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_transform.h +85 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_version.h +26 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_bit_reader.c +76 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_bit_reader.h +351 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +2608 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_huffman.c +339 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_huffman.h +121 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_prefix.h +732 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_state.c +159 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_state.h +365 -0
- data/ext/isomorfeus_ferret_ext/brotli_decode.h +344 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references.c +145 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references.h +39 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_hq.c +843 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_hq.h +95 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_inc.h +163 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost.c +35 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost.h +63 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost_inc.h +127 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_block_encoder_inc.h +34 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter.c +194 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter.h +51 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter_inc.h +440 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_brotli_bit_stream.c +1314 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_brotli_bit_stream.h +84 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_cluster.c +56 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_cluster.h +48 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_cluster_inc.h +320 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_command.c +28 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_command.h +190 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment.c +790 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment.h +61 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment_two_pass.c +645 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment_two_pass.h +54 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_dictionary_hash.c +1846 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_dictionary_hash.h +25 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_encode.c +1927 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_encoder_dict.c +33 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_encoder_dict.h +43 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode.c +503 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode.h +122 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode_static.h +539 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_fast_log.c +105 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_fast_log.h +66 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_find_match_length.h +79 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash.h +488 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_composite_inc.h +125 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_forgetful_chain_inc.h +293 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match64_inc.h +267 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match_inc.h +262 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match_quickly_inc.h +266 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_rolling_inc.h +212 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_to_binary_tree_inc.h +329 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_histogram.c +100 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_histogram.h +63 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_histogram_inc.h +51 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_literal_cost.c +175 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_literal_cost.h +30 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_memory.c +170 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_memory.h +114 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_metablock.c +663 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_metablock.h +105 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_metablock_inc.h +183 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_params.h +46 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_prefix.h +53 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_quality.h +165 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_ringbuffer.h +167 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict.c +486 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict.h +40 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict_lut.h +5864 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_utf8_util.c +85 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_utf8_util.h +32 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_write_bits.h +87 -0
- data/ext/isomorfeus_ferret_ext/brotli_encode.h +448 -0
- data/ext/isomorfeus_ferret_ext/brotli_port.h +288 -0
- data/ext/isomorfeus_ferret_ext/brotli_types.h +83 -0
- data/ext/isomorfeus_ferret_ext/bzip_blocksort.c +1094 -0
- data/ext/isomorfeus_ferret_ext/bzip_huffman.c +205 -0
- data/ext/isomorfeus_ferret_ext/bzlib.c +1572 -0
- data/ext/isomorfeus_ferret_ext/bzlib.h +282 -0
- data/ext/isomorfeus_ferret_ext/bzlib_compress.c +672 -0
- data/ext/isomorfeus_ferret_ext/bzlib_crctable.c +104 -0
- data/ext/isomorfeus_ferret_ext/bzlib_decompress.c +652 -0
- data/ext/isomorfeus_ferret_ext/bzlib_private.h +509 -0
- data/ext/isomorfeus_ferret_ext/bzlib_randtable.c +84 -0
- data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -53
- data/ext/isomorfeus_ferret_ext/frb_analysis.c +785 -1192
- data/ext/isomorfeus_ferret_ext/frb_index.c +513 -464
- data/ext/isomorfeus_ferret_ext/frb_qparser.c +48 -60
- data/ext/isomorfeus_ferret_ext/frb_search.c +1520 -1002
- data/ext/isomorfeus_ferret_ext/frb_store.c +96 -96
- data/ext/isomorfeus_ferret_ext/frb_threading.h +0 -1
- data/ext/isomorfeus_ferret_ext/frb_utils.c +147 -196
- data/ext/isomorfeus_ferret_ext/frt_analysis.c +695 -1090
- data/ext/isomorfeus_ferret_ext/frt_analysis.h +174 -170
- data/ext/isomorfeus_ferret_ext/frt_array.c +2 -4
- data/ext/isomorfeus_ferret_ext/frt_bitvector.c +9 -16
- data/ext/isomorfeus_ferret_ext/frt_bitvector.h +32 -81
- data/ext/isomorfeus_ferret_ext/frt_document.c +15 -20
- data/ext/isomorfeus_ferret_ext/frt_document.h +10 -9
- data/ext/isomorfeus_ferret_ext/frt_except.c +5 -12
- data/ext/isomorfeus_ferret_ext/frt_field_index.c +3 -3
- data/ext/isomorfeus_ferret_ext/frt_field_index.h +6 -7
- data/ext/isomorfeus_ferret_ext/frt_filter.c +35 -46
- data/ext/isomorfeus_ferret_ext/frt_fs_store.c +2 -0
- data/ext/isomorfeus_ferret_ext/frt_global.c +105 -63
- data/ext/isomorfeus_ferret_ext/frt_global.h +7 -3
- data/ext/isomorfeus_ferret_ext/frt_hash.c +1 -2
- data/ext/isomorfeus_ferret_ext/frt_ind.c +32 -35
- data/ext/isomorfeus_ferret_ext/frt_ind.h +9 -9
- data/ext/isomorfeus_ferret_ext/frt_index.c +714 -384
- data/ext/isomorfeus_ferret_ext/frt_index.h +274 -290
- data/ext/isomorfeus_ferret_ext/frt_mempool.c +1 -2
- data/ext/isomorfeus_ferret_ext/frt_multimapper.c +4 -7
- data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +67 -91
- data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +35 -38
- data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +53 -72
- data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +25 -32
- data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +21 -23
- data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +66 -103
- data/ext/isomorfeus_ferret_ext/frt_q_parser.c +207 -195
- data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +26 -25
- data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +17 -14
- data/ext/isomorfeus_ferret_ext/frt_q_range.c +102 -131
- data/ext/isomorfeus_ferret_ext/frt_q_span.c +179 -178
- data/ext/isomorfeus_ferret_ext/frt_q_term.c +47 -60
- data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +18 -16
- data/ext/isomorfeus_ferret_ext/frt_ram_store.c +46 -84
- data/ext/isomorfeus_ferret_ext/frt_search.c +105 -146
- data/ext/isomorfeus_ferret_ext/frt_search.h +331 -320
- data/ext/isomorfeus_ferret_ext/frt_similarity.c +5 -13
- data/ext/isomorfeus_ferret_ext/frt_similarity.h +7 -12
- data/ext/isomorfeus_ferret_ext/frt_sort.c +105 -149
- data/ext/isomorfeus_ferret_ext/frt_store.c +13 -7
- data/ext/isomorfeus_ferret_ext/frt_store.h +10 -2
- data/ext/isomorfeus_ferret_ext/frt_threading.h +0 -1
- data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +21 -109
- data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +2 -32
- data/ext/isomorfeus_ferret_ext/lz4.c +2495 -0
- data/ext/isomorfeus_ferret_ext/lz4.h +774 -0
- data/ext/isomorfeus_ferret_ext/lz4frame.c +1899 -0
- data/ext/isomorfeus_ferret_ext/lz4frame.h +623 -0
- data/ext/isomorfeus_ferret_ext/lz4hc.c +1615 -0
- data/ext/isomorfeus_ferret_ext/lz4hc.h +413 -0
- data/ext/isomorfeus_ferret_ext/lz4xxhash.c +1030 -0
- data/ext/isomorfeus_ferret_ext/lz4xxhash.h +328 -0
- data/ext/isomorfeus_ferret_ext/stem_modules.h +0 -86
- data/ext/isomorfeus_ferret_ext/test.c +1 -2
- data/ext/isomorfeus_ferret_ext/test_1710.c +11 -12
- data/ext/isomorfeus_ferret_ext/test_analysis.c +590 -583
- data/ext/isomorfeus_ferret_ext/test_compound_io.c +1 -1
- data/ext/isomorfeus_ferret_ext/test_document.c +19 -15
- data/ext/isomorfeus_ferret_ext/test_except.c +1 -2
- data/ext/isomorfeus_ferret_ext/test_fields.c +111 -100
- data/ext/isomorfeus_ferret_ext/test_file_deleter.c +10 -27
- data/ext/isomorfeus_ferret_ext/test_filter.c +11 -8
- data/ext/isomorfeus_ferret_ext/test_hash.c +2 -2
- data/ext/isomorfeus_ferret_ext/test_hashset.c +1 -1
- data/ext/isomorfeus_ferret_ext/test_highlighter.c +15 -11
- data/ext/isomorfeus_ferret_ext/test_index.c +373 -363
- data/ext/isomorfeus_ferret_ext/test_q_const_score.c +5 -3
- data/ext/isomorfeus_ferret_ext/test_q_filtered.c +5 -3
- data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +13 -10
- data/ext/isomorfeus_ferret_ext/test_q_parser.c +45 -7
- data/ext/isomorfeus_ferret_ext/test_q_span.c +15 -12
- data/ext/isomorfeus_ferret_ext/test_ram_store.c +3 -3
- data/ext/isomorfeus_ferret_ext/test_search.c +60 -62
- data/ext/isomorfeus_ferret_ext/test_segments.c +5 -4
- data/ext/isomorfeus_ferret_ext/test_sort.c +17 -14
- data/ext/isomorfeus_ferret_ext/test_store.c +2 -0
- data/ext/isomorfeus_ferret_ext/test_term.c +3 -1
- data/ext/isomorfeus_ferret_ext/test_term_vectors.c +9 -10
- data/ext/isomorfeus_ferret_ext/test_test.c +1 -2
- data/ext/isomorfeus_ferret_ext/test_threading.c +9 -10
- data/ext/isomorfeus_ferret_ext/testhelper.c +1 -2
- data/lib/isomorfeus/ferret/version.rb +1 -1
- metadata +113 -58
- data/ext/isomorfeus_ferret_ext/email.rl +0 -21
- data/ext/isomorfeus_ferret_ext/frt_scanner.c +0 -900
- data/ext/isomorfeus_ferret_ext/frt_scanner.h +0 -28
- data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +0 -6706
- data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +0 -4420
- data/ext/isomorfeus_ferret_ext/scanner.h +0 -28
- data/ext/isomorfeus_ferret_ext/scanner.in +0 -43
- data/ext/isomorfeus_ferret_ext/scanner.rl +0 -84
- data/ext/isomorfeus_ferret_ext/scanner_mb.rl +0 -200
- data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +0 -85
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +0 -1167
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +0 -1433
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +0 -301
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +0 -590
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +0 -1049
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +0 -705
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +0 -1239
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +0 -477
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +0 -1217
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +0 -7
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +0 -394
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +0 -457
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +0 -1009
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +0 -259
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +0 -704
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +0 -948
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +0 -1028
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +0 -275
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +0 -849
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +0 -952
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +0 -669
- data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_modules.txt +0 -63
- data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +0 -1854
- data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +0 -1999
- data/ext/isomorfeus_ferret_ext/url.rl +0 -27
@@ -1,130 +1,63 @@
|
|
1
1
|
#ifndef FRT_ANALYSIS_H
|
2
2
|
#define FRT_ANALYSIS_H
|
3
3
|
|
4
|
-
#include <wchar.h>
|
5
4
|
#include "frt_global.h"
|
6
5
|
#include "frt_hash.h"
|
7
6
|
#include "frt_multimapper.h"
|
7
|
+
#include <ruby/encoding.h>
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
*
|
13
|
-
****************************************************************************/
|
9
|
+
/*****************************************************************************/
|
10
|
+
/*** FrtToken ****************************************************************/
|
11
|
+
/*****************************************************************************/
|
14
12
|
|
15
|
-
typedef struct FrtToken
|
16
|
-
|
17
|
-
|
18
|
-
int len;
|
13
|
+
typedef struct FrtToken {
|
14
|
+
char text[FRT_MAX_WORD_SIZE];
|
15
|
+
int len;
|
19
16
|
off_t start;
|
20
17
|
off_t end;
|
21
|
-
int
|
18
|
+
int pos_inc;
|
22
19
|
} FrtToken;
|
23
20
|
|
24
21
|
extern FrtToken *frt_tk_new();
|
25
|
-
extern void
|
26
|
-
extern FrtToken *frt_tk_set(FrtToken *tk, char *text, int tlen, off_t start, off_t end, int pos_inc);
|
27
|
-
extern FrtToken *frt_tk_set_no_len(FrtToken *tk, char *text, off_t start, off_t end, int pos_inc);
|
28
|
-
extern int
|
29
|
-
extern int
|
30
|
-
|
31
|
-
/****************************************************************************
|
32
|
-
*
|
33
|
-
* FrtTokenStream
|
34
|
-
*
|
35
|
-
****************************************************************************/
|
22
|
+
extern void frt_tk_destroy(void *p);
|
23
|
+
extern FrtToken *frt_tk_set(FrtToken *tk, char *text, int tlen, off_t start, off_t end, int pos_inc, rb_encoding *encoding);
|
24
|
+
extern FrtToken *frt_tk_set_no_len(FrtToken *tk, char *text, off_t start, off_t end, int pos_inc, rb_encoding *encoding);
|
25
|
+
extern int frt_tk_eq(FrtToken *tk1, FrtToken *tk2);
|
26
|
+
extern int frt_tk_cmp(FrtToken *tk1, FrtToken *tk2);
|
36
27
|
|
28
|
+
/*****************************************************************************/
|
29
|
+
/*** FrtTokenStream **********************************************************/
|
30
|
+
/*****************************************************************************/
|
37
31
|
|
38
32
|
typedef struct FrtTokenStream FrtTokenStream;
|
39
|
-
struct FrtTokenStream
|
40
|
-
|
41
|
-
char
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
FrtTokenStream
|
46
|
-
|
33
|
+
struct FrtTokenStream {
|
34
|
+
char *t; /* ptr used to scan text */
|
35
|
+
char *text;
|
36
|
+
int length;
|
37
|
+
rb_encoding *encoding;
|
38
|
+
FrtToken *(*next)(FrtTokenStream *ts);
|
39
|
+
FrtTokenStream *(*reset)(FrtTokenStream *ts, char *text, rb_encoding *encoding);
|
40
|
+
FrtTokenStream *(*clone_i)(FrtTokenStream *ts);
|
41
|
+
void (*destroy_i)(FrtTokenStream *ts);
|
47
42
|
int ref_cnt;
|
43
|
+
VALUE rts;
|
44
|
+
FrtToken token;
|
48
45
|
};
|
49
46
|
|
50
|
-
#define frt_ts_new(type) frt_ts_new_i(sizeof(type))
|
51
47
|
extern FrtTokenStream *frt_ts_new_i(size_t size);
|
48
|
+
extern FrtTokenStream *frt_ts_init(FrtTokenStream *ts);
|
49
|
+
extern FrtTokenStream *frt_ts_reset(FrtTokenStream *ts, char *text, rb_encoding *encoding);
|
52
50
|
extern FrtTokenStream *frt_ts_clone_size(FrtTokenStream *orig_ts, size_t size);
|
53
51
|
|
54
|
-
typedef struct
|
55
|
-
{
|
56
|
-
FrtTokenStream super;
|
57
|
-
FrtToken token;
|
58
|
-
} FrtCachedTokenStream;
|
59
|
-
|
60
|
-
typedef struct FrtMultiByteTokenStream
|
61
|
-
{
|
62
|
-
FrtCachedTokenStream super;
|
63
|
-
mbstate_t state;
|
64
|
-
} FrtMultiByteTokenStream;
|
65
|
-
|
66
|
-
typedef enum
|
67
|
-
{
|
68
|
-
FRT_STT_ASCII,
|
69
|
-
FRT_STT_MB,
|
70
|
-
FRT_STT_UTF8
|
71
|
-
} FrtStandardTokenizerType;
|
72
|
-
|
73
|
-
typedef struct FrtStandardTokenizer
|
74
|
-
{
|
75
|
-
FrtCachedTokenStream super;
|
76
|
-
FrtStandardTokenizerType type;
|
77
|
-
} FrtStandardTokenizer;
|
78
|
-
|
79
|
-
typedef struct FrtLegacyStandardTokenizer
|
80
|
-
{
|
81
|
-
FrtCachedTokenStream super;
|
82
|
-
bool (*advance_to_start)(FrtTokenStream *ts);
|
83
|
-
bool (*is_tok_char)(char *c);
|
84
|
-
int (*get_alpha)(FrtTokenStream *ts, char *token);
|
85
|
-
int (*get_apostrophe)(char *input);
|
86
|
-
} FrtLegacyStandardTokenizer;
|
87
|
-
|
88
|
-
typedef struct FrtTokenFilter
|
89
|
-
{
|
52
|
+
typedef struct FrtTokenFilter {
|
90
53
|
FrtTokenStream super;
|
91
54
|
FrtTokenStream *sub_ts;
|
92
55
|
} FrtTokenFilter;
|
93
56
|
|
94
57
|
extern FrtTokenStream *frt_filter_clone_size(FrtTokenStream *ts, size_t size);
|
95
|
-
#define
|
58
|
+
#define frt_tf_new(type, sub) frt_tf_new_i(sizeof(type), sub)
|
96
59
|
extern FrtTokenStream *frt_tf_new_i(size_t size, FrtTokenStream *sub_ts);
|
97
60
|
|
98
|
-
typedef struct FrtStopFilter
|
99
|
-
{
|
100
|
-
FrtTokenFilter super;
|
101
|
-
FrtHash *words;
|
102
|
-
} FrtStopFilter;
|
103
|
-
|
104
|
-
typedef struct FrtMappingFilter
|
105
|
-
{
|
106
|
-
FrtTokenFilter super;
|
107
|
-
FrtMultiMapper *mapper;
|
108
|
-
} FrtMappingFilter;
|
109
|
-
|
110
|
-
typedef struct FrtHyphenFilter
|
111
|
-
{
|
112
|
-
FrtTokenFilter super;
|
113
|
-
char text[FRT_MAX_WORD_SIZE];
|
114
|
-
int start;
|
115
|
-
int pos;
|
116
|
-
int len;
|
117
|
-
FrtToken *tk;
|
118
|
-
} FrtHyphenFilter;
|
119
|
-
|
120
|
-
typedef struct FrtStemFilter
|
121
|
-
{
|
122
|
-
FrtTokenFilter super;
|
123
|
-
struct sb_stemmer *stemmer;
|
124
|
-
char *algorithm;
|
125
|
-
char *charenc;
|
126
|
-
} FrtStemFilter;
|
127
|
-
|
128
61
|
#define frt_ts_next(mts) mts->next(mts)
|
129
62
|
#define frt_ts_clone(mts) mts->clone_i(mts)
|
130
63
|
|
@@ -132,22 +65,58 @@ extern void frt_ts_deref(FrtTokenStream *ts);
|
|
132
65
|
|
133
66
|
extern FrtTokenStream *frt_non_tokenizer_new();
|
134
67
|
|
135
|
-
|
136
|
-
|
68
|
+
/*****************************************************************************/
|
69
|
+
/*** FrtWhiteSpaceTokenizer **************************************************/
|
70
|
+
/*****************************************************************************/
|
71
|
+
|
72
|
+
extern FrtTokenStream *frt_whitespace_tokenizer_alloc(void);
|
73
|
+
extern FrtTokenStream *frt_whitespace_tokenizer_init(FrtTokenStream *ts);
|
74
|
+
extern FrtTokenStream *frt_whitespace_tokenizer_new(void);
|
75
|
+
|
76
|
+
/*****************************************************************************/
|
77
|
+
/*** FrtLetterTokenizer ******************************************************/
|
78
|
+
/*****************************************************************************/
|
79
|
+
|
80
|
+
extern FrtTokenStream *frt_letter_tokenizer_alloc(void);
|
81
|
+
extern FrtTokenStream *frt_letter_tokenizer_init(FrtTokenStream *ts);
|
82
|
+
extern FrtTokenStream *frt_letter_tokenizer_new(void);
|
83
|
+
|
84
|
+
/*****************************************************************************/
|
85
|
+
/*** FrtStandardTokenizer ****************************************************/
|
86
|
+
/*****************************************************************************/
|
137
87
|
|
138
|
-
extern FrtTokenStream *
|
139
|
-
extern FrtTokenStream *
|
88
|
+
extern FrtTokenStream *frt_standard_tokenizer_alloc(void);
|
89
|
+
extern FrtTokenStream *frt_standard_tokenizer_init(FrtTokenStream *ts);
|
90
|
+
extern FrtTokenStream *frt_standard_tokenizer_new(void);
|
140
91
|
|
141
|
-
|
142
|
-
|
143
|
-
|
92
|
+
/*****************************************************************************/
|
93
|
+
/*** FrtHyphenFilter *********************************************************/
|
94
|
+
/*****************************************************************************/
|
144
95
|
|
145
|
-
|
146
|
-
|
96
|
+
typedef struct FrtHyphenFilter {
|
97
|
+
FrtTokenFilter super;
|
98
|
+
char text[FRT_MAX_WORD_SIZE];
|
99
|
+
int start;
|
100
|
+
int pos;
|
101
|
+
int len;
|
102
|
+
FrtToken *tk;
|
103
|
+
} FrtHyphenFilter;
|
104
|
+
|
105
|
+
extern FrtTokenStream *frt_hyphen_filter_alloc(void);
|
106
|
+
extern FrtTokenStream *frt_hyphen_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts);
|
107
|
+
extern FrtTokenStream *frt_hyphen_filter_new(FrtTokenStream *sub_ts);
|
147
108
|
|
148
|
-
|
149
|
-
|
150
|
-
|
109
|
+
/*****************************************************************************/
|
110
|
+
/*** FrtLowercaseFilter ******************************************************/
|
111
|
+
/*****************************************************************************/
|
112
|
+
|
113
|
+
extern FrtTokenStream *frt_lowercase_filter_alloc(void);
|
114
|
+
extern void frt_lowercase_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts);
|
115
|
+
extern FrtTokenStream *frt_lowercase_filter_new(FrtTokenStream *sub_ts);
|
116
|
+
|
117
|
+
/*****************************************************************************/
|
118
|
+
/*** FrtStopFilter ***********************************************************/
|
119
|
+
/*****************************************************************************/
|
151
120
|
|
152
121
|
extern const char *FRT_ENGLISH_STOP_WORDS[];
|
153
122
|
extern const char *FRT_FULL_ENGLISH_STOP_WORDS[];
|
@@ -165,83 +134,118 @@ extern const char *FRT_FULL_RUSSIAN_STOP_WORDS[];
|
|
165
134
|
extern const char *FRT_FULL_FINNISH_STOP_WORDS[];
|
166
135
|
extern const char *FRT_FULL_HUNGARIAN_STOP_WORDS[];
|
167
136
|
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
extern FrtTokenStream *
|
174
|
-
|
175
|
-
|
176
|
-
extern
|
177
|
-
extern FrtTokenStream *
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
137
|
+
typedef struct FrtStopFilter {
|
138
|
+
FrtTokenFilter super;
|
139
|
+
FrtHash *words;
|
140
|
+
} FrtStopFilter;
|
141
|
+
|
142
|
+
extern FrtTokenStream *frt_stop_filter_alloc(void);
|
143
|
+
extern FrtTokenStream *frt_stop_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts);
|
144
|
+
extern void frt_stop_filter_set_words(FrtTokenStream *ts, const char **words);
|
145
|
+
extern void frt_stop_filter_set_words_len(FrtTokenStream *ts, const char **words, int len);
|
146
|
+
extern FrtTokenStream *frt_stop_filter_new(FrtTokenStream *sub_ts);
|
147
|
+
extern FrtTokenStream *frt_stop_filter_new_with_words(FrtTokenStream *sub_ts, const char **words);
|
148
|
+
extern FrtTokenStream *frt_stop_filter_new_with_words_len(FrtTokenStream *sub_ts, const char **words, int len);
|
149
|
+
|
150
|
+
/*****************************************************************************/
|
151
|
+
/*** FrtStemFilter ***********************************************************/
|
152
|
+
/*****************************************************************************/
|
153
|
+
|
154
|
+
typedef struct FrtStemFilter {
|
155
|
+
FrtTokenFilter super;
|
156
|
+
struct sb_stemmer *stemmer;
|
157
|
+
char *algorithm;
|
158
|
+
char *charenc;
|
159
|
+
} FrtStemFilter;
|
160
|
+
|
161
|
+
extern FrtTokenStream *frt_stem_filter_alloc(void);
|
162
|
+
extern void frt_stem_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts, const char *algorithm);
|
163
|
+
extern FrtTokenStream *frt_stem_filter_new(FrtTokenStream *sub_ts, const char *algorithm);
|
164
|
+
|
165
|
+
/*****************************************************************************/
|
166
|
+
/*** FrtMappingFilter ********************************************************/
|
167
|
+
/*****************************************************************************/
|
168
|
+
|
169
|
+
typedef struct FrtMappingFilter {
|
170
|
+
FrtTokenFilter super;
|
171
|
+
FrtMultiMapper *mapper;
|
172
|
+
} FrtMappingFilter;
|
173
|
+
|
174
|
+
extern FrtTokenStream *frt_mapping_filter_alloc(void);
|
175
|
+
extern void frt_mapping_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts);
|
176
|
+
extern FrtTokenStream *frt_mapping_filter_new(FrtTokenStream *sub_ts);
|
177
|
+
extern FrtTokenStream *frt_mapping_filter_add(FrtTokenStream *ts, const char *pattern, const char *replacement);
|
178
|
+
|
179
|
+
/*****************************************************************************/
|
180
|
+
/*** FrtAnalyzer *************************************************************/
|
181
|
+
/*****************************************************************************/
|
182
|
+
|
183
|
+
typedef struct FrtAnalyzer {
|
188
184
|
FrtTokenStream *current_ts;
|
189
|
-
FrtTokenStream *(*get_ts)(struct FrtAnalyzer *a,
|
190
|
-
void
|
191
|
-
int
|
185
|
+
FrtTokenStream *(*get_ts)(struct FrtAnalyzer *a, ID field, char *text, rb_encoding *encoding);
|
186
|
+
void (*destroy_i)(struct FrtAnalyzer *a);
|
187
|
+
int ref_cnt;
|
188
|
+
VALUE ranalyzer;
|
192
189
|
} FrtAnalyzer;
|
193
190
|
|
194
191
|
extern void frt_a_deref(FrtAnalyzer *a);
|
195
192
|
|
196
|
-
#define frt_a_get_ts(ma, field, text) ma->get_ts(ma, field, text)
|
193
|
+
#define frt_a_get_ts(ma, field, text, encoding) ma->get_ts(ma, field, text, encoding)
|
194
|
+
|
195
|
+
extern FrtAnalyzer *frt_analyzer_alloc(void);
|
196
|
+
extern void frt_analyzer_init(FrtAnalyzer *a, FrtTokenStream *ts, void (*destroy)(FrtAnalyzer *a),
|
197
|
+
FrtTokenStream *(*get_ts)(FrtAnalyzer *a, ID field, char *text, rb_encoding *encoding));
|
198
|
+
extern FrtAnalyzer *frt_analyzer_new(FrtTokenStream *ts, void (*destroy)(FrtAnalyzer *a),
|
199
|
+
FrtTokenStream *(*get_ts)(FrtAnalyzer *a, ID field, char *text, rb_encoding *encoding));
|
197
200
|
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
extern FrtAnalyzer *frt_non_analyzer_new();
|
201
|
+
/*****************************************************************************/
|
202
|
+
/*** FrtNonAnalyzer **********************************************************/
|
203
|
+
/*****************************************************************************/
|
204
|
+
|
205
|
+
extern FrtAnalyzer *frt_non_analyzer_new(void);
|
204
206
|
|
205
207
|
extern void frt_a_standard_destroy(FrtAnalyzer *a);
|
206
208
|
|
209
|
+
/*****************************************************************************/
|
210
|
+
/*** FrtWhiteSpaceAnalyzer ***************************************************/
|
211
|
+
/*****************************************************************************/
|
212
|
+
|
213
|
+
extern FrtAnalyzer *frt_whitespace_analyzer_alloc(void);
|
214
|
+
extern void frt_whitespace_analyzer_init(FrtAnalyzer *a, bool lowercase);
|
207
215
|
extern FrtAnalyzer *frt_whitespace_analyzer_new(bool lowercase);
|
208
|
-
extern FrtAnalyzer *frt_mb_whitespace_analyzer_new(bool lowercase);
|
209
216
|
|
217
|
+
/*****************************************************************************/
|
218
|
+
/*** FrtLetterAnalyzer *******************************************************/
|
219
|
+
/*****************************************************************************/
|
220
|
+
|
221
|
+
extern FrtAnalyzer *frt_letter_analyzer_alloc(void);
|
222
|
+
extern void frt_letter_analyzer_init(FrtAnalyzer *a, bool lowercase);
|
210
223
|
extern FrtAnalyzer *frt_letter_analyzer_new(bool lowercase);
|
211
|
-
extern FrtAnalyzer *frt_mb_letter_analyzer_new(bool lowercase);
|
212
224
|
|
225
|
+
/*****************************************************************************/
|
226
|
+
/*** FrtStandardAnalyzer *****************************************************/
|
227
|
+
/*****************************************************************************/
|
228
|
+
|
229
|
+
extern FrtAnalyzer *frt_standard_analyzer_alloc(void);
|
230
|
+
extern void frt_standard_analyzer_init(FrtAnalyzer *a, bool lowercase, const char **words);
|
213
231
|
extern FrtAnalyzer *frt_standard_analyzer_new(bool lowercase);
|
214
|
-
extern FrtAnalyzer *
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
extern FrtAnalyzer *frt_standard_analyzer_new_with_words_len(
|
220
|
-
const char **words, int len, bool lowercase);
|
221
|
-
extern FrtAnalyzer *frt_mb_standard_analyzer_new_with_words(
|
222
|
-
const char **words, bool lowercase);
|
223
|
-
extern FrtAnalyzer *frt_utf8_standard_analyzer_new_with_words(
|
224
|
-
const char **words, bool lowercase);
|
225
|
-
|
226
|
-
extern FrtAnalyzer *frt_legacy_standard_analyzer_new(bool lowercase);
|
227
|
-
extern FrtAnalyzer *frt_mb_legacy_standard_analyzer_new(bool lowercase);
|
228
|
-
|
229
|
-
extern FrtAnalyzer *frt_legacy_standard_analyzer_new_with_words(
|
230
|
-
const char **words, bool lowercase);
|
231
|
-
extern FrtAnalyzer *frt_mb_legacy_standard_analyzer_new_with_words(
|
232
|
-
const char **words, bool lowercase);
|
232
|
+
extern FrtAnalyzer *frt_standard_analyzer_new_with_words(bool lowercase, const char **words);
|
233
|
+
|
234
|
+
/*****************************************************************************/
|
235
|
+
/*** FrtPerFieldAnalyzer *****************************************************/
|
236
|
+
/*****************************************************************************/
|
233
237
|
|
234
238
|
#define PFA(analyzer) ((FrtPerFieldAnalyzer *)(analyzer))
|
235
|
-
|
236
|
-
{
|
237
|
-
FrtAnalyzer
|
238
|
-
FrtHash
|
239
|
-
FrtAnalyzer
|
239
|
+
|
240
|
+
typedef struct FrtPerFieldAnalyzer {
|
241
|
+
FrtAnalyzer super;
|
242
|
+
FrtHash *dict;
|
243
|
+
FrtAnalyzer *default_a;
|
240
244
|
} FrtPerFieldAnalyzer;
|
241
245
|
|
242
|
-
extern FrtAnalyzer *
|
243
|
-
extern void
|
244
|
-
|
245
|
-
|
246
|
+
extern FrtAnalyzer *frt_per_field_analyzer_alloc(void);
|
247
|
+
extern void frt_per_field_analyzer_init(FrtAnalyzer *a, FrtAnalyzer *default_a);
|
248
|
+
extern FrtAnalyzer *frt_per_field_analyzer_new(FrtAnalyzer *default_a);
|
249
|
+
extern void frt_pfa_add_field(FrtAnalyzer *self, ID field, FrtAnalyzer *analyzer);
|
246
250
|
|
247
251
|
#endif
|
@@ -109,13 +109,11 @@ void *frt_ary_remove_i(void **ary, int index)
|
|
109
109
|
}
|
110
110
|
}
|
111
111
|
|
112
|
-
void frt_ary_delete_i(void **ary, int index, void (*free_elem)(void *p))
|
113
|
-
{
|
112
|
+
void frt_ary_delete_i(void **ary, int index, void (*free_elem)(void *p)) {
|
114
113
|
free_elem(frt_ary_remove(ary, index));
|
115
114
|
}
|
116
115
|
|
117
|
-
void frt_ary_destroy_i(void **ary, void (*free_elem)(void *p))
|
118
|
-
{
|
116
|
+
void frt_ary_destroy_i(void **ary, void (*free_elem)(void *p)) {
|
119
117
|
int i;
|
120
118
|
for (i = frt_ary_sz(ary) - 1; i >= 0; i--) {
|
121
119
|
free_elem(ary[i]);
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#include "frt_bitvector.h"
|
2
2
|
#include <string.h>
|
3
3
|
|
4
|
-
FrtBitVector *frt_bv_new_capa(int capa)
|
5
|
-
{
|
4
|
+
FrtBitVector *frt_bv_new_capa(int capa) {
|
6
5
|
FrtBitVector *bv = FRT_ALLOC_AND_ZERO(FrtBitVector);
|
7
6
|
|
8
7
|
/* The capacity passed by the user is number of bits allowed, however we
|
@@ -11,37 +10,33 @@ FrtBitVector *frt_bv_new_capa(int capa)
|
|
11
10
|
bv->bits = FRT_ALLOC_AND_ZERO_N(frt_u32, bv->capa);
|
12
11
|
bv->curr_bit = -1;
|
13
12
|
bv->ref_cnt = 1;
|
13
|
+
bv->rbv = Qnil;
|
14
14
|
return bv;
|
15
15
|
}
|
16
16
|
|
17
|
-
FrtBitVector *frt_bv_new()
|
18
|
-
{
|
17
|
+
FrtBitVector *frt_bv_new(void) {
|
19
18
|
return frt_bv_new_capa(FRT_BV_INIT_CAPA);
|
20
19
|
}
|
21
20
|
|
22
|
-
void frt_bv_destroy(FrtBitVector *bv)
|
23
|
-
{
|
21
|
+
void frt_bv_destroy(FrtBitVector *bv) {
|
24
22
|
if (--(bv->ref_cnt) == 0) {
|
25
23
|
free(bv->bits);
|
26
24
|
free(bv);
|
27
25
|
}
|
28
26
|
}
|
29
27
|
|
30
|
-
void frt_bv_clear(FrtBitVector *bv)
|
31
|
-
{
|
28
|
+
void frt_bv_clear(FrtBitVector *bv) {
|
32
29
|
memset(bv->bits, 0, bv->capa * sizeof(frt_u32));
|
33
30
|
bv->extends_as_ones = 0;
|
34
31
|
bv->count = 0;
|
35
32
|
bv->size = 0;
|
36
33
|
}
|
37
34
|
|
38
|
-
void frt_bv_scan_reset(FrtBitVector *bv)
|
39
|
-
{
|
35
|
+
void frt_bv_scan_reset(FrtBitVector *bv) {
|
40
36
|
bv->curr_bit = -1;
|
41
37
|
}
|
42
38
|
|
43
|
-
int frt_bv_eq(FrtBitVector *bv1, FrtBitVector *bv2)
|
44
|
-
{
|
39
|
+
int frt_bv_eq(FrtBitVector *bv1, FrtBitVector *bv2) {
|
45
40
|
frt_u32 *bits, *bits2;
|
46
41
|
int min_size, word_size, ext_word_size = 0, i;
|
47
42
|
if (bv1 == bv2) {
|
@@ -65,8 +60,7 @@ int frt_bv_eq(FrtBitVector *bv1, FrtBitVector *bv2)
|
|
65
60
|
if (bv1->size > min_size) {
|
66
61
|
bits = bv1->bits;
|
67
62
|
ext_word_size = FRT_TO_WORD(bv1->size);
|
68
|
-
}
|
69
|
-
else if (bv2->size > min_size) {
|
63
|
+
} else if (bv2->size > min_size) {
|
70
64
|
bits = bv2->bits;
|
71
65
|
ext_word_size = FRT_TO_WORD(bv2->size);
|
72
66
|
}
|
@@ -81,8 +75,7 @@ int frt_bv_eq(FrtBitVector *bv1, FrtBitVector *bv2)
|
|
81
75
|
return true;
|
82
76
|
}
|
83
77
|
|
84
|
-
unsigned long long frt_bv_hash(FrtBitVector *bv)
|
85
|
-
{
|
78
|
+
unsigned long long frt_bv_hash(FrtBitVector *bv) {
|
86
79
|
unsigned long long hash = 0;
|
87
80
|
const frt_u32 empty_word = bv->extends_as_ones ? 0xFFFFFFFF : 0;
|
88
81
|
int i;
|