isomorfeus-ferret 0.12.6 → 0.13.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +101 -19
- data/README.md +85 -16
- data/ext/isomorfeus_ferret_ext/bm_bitvector.c +22 -30
- data/ext/isomorfeus_ferret_ext/bm_hash.c +6 -12
- data/ext/isomorfeus_ferret_ext/bm_micro_string.c +3 -6
- data/ext/isomorfeus_ferret_ext/bm_store.c +11 -22
- data/ext/isomorfeus_ferret_ext/brotli_common_constants.c +15 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_constants.h +200 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_context.c +156 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_context.h +113 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +5914 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.h +64 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_platform.c +22 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_platform.h +594 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_transform.c +291 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_transform.h +85 -0
- data/ext/isomorfeus_ferret_ext/brotli_common_version.h +26 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_bit_reader.c +76 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_bit_reader.h +351 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +2608 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_huffman.c +339 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_huffman.h +121 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_prefix.h +732 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_state.c +159 -0
- data/ext/isomorfeus_ferret_ext/brotli_dec_state.h +365 -0
- data/ext/isomorfeus_ferret_ext/brotli_decode.h +344 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references.c +145 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references.h +39 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_hq.c +843 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_hq.h +95 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_backward_references_inc.h +163 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost.c +35 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost.h +63 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_bit_cost_inc.h +127 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_block_encoder_inc.h +34 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter.c +194 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter.h +51 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_block_splitter_inc.h +440 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_brotli_bit_stream.c +1314 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_brotli_bit_stream.h +84 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_cluster.c +56 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_cluster.h +48 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_cluster_inc.h +320 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_command.c +28 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_command.h +190 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment.c +790 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment.h +61 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment_two_pass.c +645 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_compress_fragment_two_pass.h +54 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_dictionary_hash.c +1846 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_dictionary_hash.h +25 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_encode.c +1927 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_encoder_dict.c +33 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_encoder_dict.h +43 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode.c +503 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode.h +122 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_entropy_encode_static.h +539 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_fast_log.c +105 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_fast_log.h +66 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_find_match_length.h +79 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash.h +488 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_composite_inc.h +125 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_forgetful_chain_inc.h +293 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match64_inc.h +267 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match_inc.h +262 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_longest_match_quickly_inc.h +266 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_rolling_inc.h +212 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_hash_to_binary_tree_inc.h +329 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_histogram.c +100 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_histogram.h +63 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_histogram_inc.h +51 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_literal_cost.c +175 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_literal_cost.h +30 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_memory.c +170 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_memory.h +114 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_metablock.c +663 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_metablock.h +105 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_metablock_inc.h +183 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_params.h +46 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_prefix.h +53 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_quality.h +165 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_ringbuffer.h +167 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict.c +486 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict.h +40 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_static_dict_lut.h +5864 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_utf8_util.c +85 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_utf8_util.h +32 -0
- data/ext/isomorfeus_ferret_ext/brotli_enc_write_bits.h +87 -0
- data/ext/isomorfeus_ferret_ext/brotli_encode.h +448 -0
- data/ext/isomorfeus_ferret_ext/brotli_port.h +288 -0
- data/ext/isomorfeus_ferret_ext/brotli_types.h +83 -0
- data/ext/isomorfeus_ferret_ext/bzlib.c +1572 -0
- data/ext/isomorfeus_ferret_ext/bzlib.h +282 -0
- data/ext/isomorfeus_ferret_ext/bzlib_blocksort.c +1094 -0
- data/ext/isomorfeus_ferret_ext/bzlib_compress.c +672 -0
- data/ext/isomorfeus_ferret_ext/bzlib_crctable.c +104 -0
- data/ext/isomorfeus_ferret_ext/bzlib_decompress.c +652 -0
- data/ext/isomorfeus_ferret_ext/bzlib_huffman.c +205 -0
- data/ext/isomorfeus_ferret_ext/bzlib_private.h +509 -0
- data/ext/isomorfeus_ferret_ext/bzlib_randtable.c +84 -0
- data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -53
- data/ext/isomorfeus_ferret_ext/frb_analysis.c +785 -1192
- data/ext/isomorfeus_ferret_ext/frb_index.c +513 -464
- data/ext/isomorfeus_ferret_ext/frb_qparser.c +48 -60
- data/ext/isomorfeus_ferret_ext/frb_search.c +1520 -1002
- data/ext/isomorfeus_ferret_ext/frb_store.c +96 -96
- data/ext/isomorfeus_ferret_ext/frb_threading.h +0 -1
- data/ext/isomorfeus_ferret_ext/frb_utils.c +147 -196
- data/ext/isomorfeus_ferret_ext/frt_analysis.c +695 -1090
- data/ext/isomorfeus_ferret_ext/frt_analysis.h +174 -170
- data/ext/isomorfeus_ferret_ext/frt_array.c +2 -4
- data/ext/isomorfeus_ferret_ext/frt_bitvector.c +9 -16
- data/ext/isomorfeus_ferret_ext/frt_bitvector.h +32 -81
- data/ext/isomorfeus_ferret_ext/frt_document.c +15 -20
- data/ext/isomorfeus_ferret_ext/frt_document.h +10 -9
- data/ext/isomorfeus_ferret_ext/frt_except.c +5 -12
- data/ext/isomorfeus_ferret_ext/frt_field_index.c +3 -3
- data/ext/isomorfeus_ferret_ext/frt_field_index.h +6 -7
- data/ext/isomorfeus_ferret_ext/frt_filter.c +35 -46
- data/ext/isomorfeus_ferret_ext/frt_fs_store.c +2 -0
- data/ext/isomorfeus_ferret_ext/frt_global.c +91 -200
- data/ext/isomorfeus_ferret_ext/frt_global.h +7 -18
- data/ext/isomorfeus_ferret_ext/frt_hash.c +1 -2
- data/ext/isomorfeus_ferret_ext/frt_ind.c +32 -35
- data/ext/isomorfeus_ferret_ext/frt_ind.h +9 -9
- data/ext/isomorfeus_ferret_ext/frt_index.c +714 -384
- data/ext/isomorfeus_ferret_ext/frt_index.h +274 -290
- data/ext/isomorfeus_ferret_ext/frt_lang.c +0 -2
- data/ext/isomorfeus_ferret_ext/frt_mempool.c +1 -2
- data/ext/isomorfeus_ferret_ext/frt_multimapper.c +4 -7
- data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +68 -91
- data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +35 -38
- data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +53 -72
- data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +25 -32
- data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +21 -23
- data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +66 -103
- data/ext/isomorfeus_ferret_ext/frt_q_parser.c +207 -195
- data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +20 -16
- data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +17 -14
- data/ext/isomorfeus_ferret_ext/frt_q_range.c +102 -131
- data/ext/isomorfeus_ferret_ext/frt_q_span.c +179 -178
- data/ext/isomorfeus_ferret_ext/frt_q_term.c +47 -60
- data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +18 -16
- data/ext/isomorfeus_ferret_ext/frt_ram_store.c +46 -84
- data/ext/isomorfeus_ferret_ext/frt_search.c +105 -146
- data/ext/isomorfeus_ferret_ext/frt_search.h +331 -320
- data/ext/isomorfeus_ferret_ext/frt_similarity.c +5 -13
- data/ext/isomorfeus_ferret_ext/frt_similarity.h +7 -12
- data/ext/isomorfeus_ferret_ext/frt_sort.c +105 -149
- data/ext/isomorfeus_ferret_ext/frt_store.c +13 -7
- data/ext/isomorfeus_ferret_ext/frt_store.h +10 -2
- data/ext/isomorfeus_ferret_ext/frt_threading.h +0 -1
- data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +22 -112
- data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +2 -32
- data/ext/isomorfeus_ferret_ext/lz4.c +2495 -0
- data/ext/isomorfeus_ferret_ext/lz4.h +774 -0
- data/ext/isomorfeus_ferret_ext/lz4frame.c +1899 -0
- data/ext/isomorfeus_ferret_ext/lz4frame.h +623 -0
- data/ext/isomorfeus_ferret_ext/lz4hc.c +1615 -0
- data/ext/isomorfeus_ferret_ext/lz4hc.h +413 -0
- data/ext/isomorfeus_ferret_ext/lz4xxhash.c +1030 -0
- data/ext/isomorfeus_ferret_ext/lz4xxhash.h +328 -0
- data/ext/isomorfeus_ferret_ext/stem_modules.h +0 -86
- data/ext/isomorfeus_ferret_ext/test.c +0 -17
- data/ext/isomorfeus_ferret_ext/test_1710.c +11 -12
- data/ext/isomorfeus_ferret_ext/test_analysis.c +590 -583
- data/ext/isomorfeus_ferret_ext/test_compound_io.c +1 -1
- data/ext/isomorfeus_ferret_ext/test_document.c +19 -15
- data/ext/isomorfeus_ferret_ext/test_except.c +1 -2
- data/ext/isomorfeus_ferret_ext/test_fields.c +111 -100
- data/ext/isomorfeus_ferret_ext/test_file_deleter.c +10 -27
- data/ext/isomorfeus_ferret_ext/test_filter.c +11 -8
- data/ext/isomorfeus_ferret_ext/test_global.c +0 -46
- data/ext/isomorfeus_ferret_ext/test_hash.c +2 -2
- data/ext/isomorfeus_ferret_ext/test_hashset.c +1 -1
- data/ext/isomorfeus_ferret_ext/test_highlighter.c +15 -11
- data/ext/isomorfeus_ferret_ext/test_index.c +373 -363
- data/ext/isomorfeus_ferret_ext/test_q_const_score.c +5 -3
- data/ext/isomorfeus_ferret_ext/test_q_filtered.c +5 -3
- data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +13 -10
- data/ext/isomorfeus_ferret_ext/test_q_parser.c +45 -7
- data/ext/isomorfeus_ferret_ext/test_q_span.c +15 -12
- data/ext/isomorfeus_ferret_ext/test_ram_store.c +3 -3
- data/ext/isomorfeus_ferret_ext/test_search.c +60 -64
- data/ext/isomorfeus_ferret_ext/test_segments.c +5 -4
- data/ext/isomorfeus_ferret_ext/test_sort.c +17 -14
- data/ext/isomorfeus_ferret_ext/test_store.c +2 -0
- data/ext/isomorfeus_ferret_ext/test_term.c +3 -1
- data/ext/isomorfeus_ferret_ext/test_term_vectors.c +9 -10
- data/ext/isomorfeus_ferret_ext/test_test.c +1 -2
- data/ext/isomorfeus_ferret_ext/test_threading.c +9 -10
- data/ext/isomorfeus_ferret_ext/testhelper.c +1 -2
- data/lib/isomorfeus/ferret/version.rb +1 -1
- metadata +113 -58
- data/ext/isomorfeus_ferret_ext/email.rl +0 -21
- data/ext/isomorfeus_ferret_ext/frt_scanner.c +0 -900
- data/ext/isomorfeus_ferret_ext/frt_scanner.h +0 -28
- data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +0 -6706
- data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +0 -4420
- data/ext/isomorfeus_ferret_ext/scanner.h +0 -28
- data/ext/isomorfeus_ferret_ext/scanner.in +0 -43
- data/ext/isomorfeus_ferret_ext/scanner.rl +0 -84
- data/ext/isomorfeus_ferret_ext/scanner_mb.rl +0 -200
- data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +0 -85
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +0 -1167
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +0 -1433
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +0 -301
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +0 -590
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +0 -1049
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +0 -705
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +0 -1239
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +0 -477
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +0 -1217
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +0 -7
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +0 -394
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +0 -457
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +0 -1009
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +0 -259
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +0 -704
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +0 -948
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +0 -1028
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +0 -275
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +0 -849
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +0 -952
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +0 -669
- data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_modules.txt +0 -63
- data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +0 -1854
- data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +0 -1999
- data/ext/isomorfeus_ferret_ext/url.rl +0 -27
@@ -1,741 +1,346 @@
|
|
1
1
|
#include <string.h>
|
2
2
|
#include <ctype.h>
|
3
|
-
#include <wctype.h>
|
4
|
-
#include <wchar.h>
|
5
3
|
#include "frt_analysis.h"
|
6
4
|
#include "frt_hash.h"
|
7
5
|
#include "libstemmer.h"
|
8
|
-
#include "frt_scanner.h"
|
9
6
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
*
|
14
|
-
****************************************************************************/
|
7
|
+
/*****************************************************************************/
|
8
|
+
/*** Helpers *****************************************************************/
|
9
|
+
/*****************************************************************************/
|
15
10
|
|
16
|
-
|
17
|
-
|
18
|
-
|
11
|
+
/* initialized in frt_global.c */
|
12
|
+
extern rb_encoding *utf8_encoding;
|
13
|
+
extern OnigCodePoint cp_apostrophe;
|
14
|
+
extern OnigCodePoint cp_dot;
|
15
|
+
extern OnigCodePoint cp_comma;
|
16
|
+
extern OnigCodePoint cp_backslash;
|
17
|
+
extern OnigCodePoint cp_slash;
|
18
|
+
extern OnigCodePoint cp_underscore;
|
19
|
+
extern OnigCodePoint cp_dash;
|
20
|
+
extern OnigCodePoint cp_hyphen;
|
21
|
+
extern OnigCodePoint cp_at;
|
22
|
+
extern OnigCodePoint cp_ampersand;
|
23
|
+
extern OnigCodePoint cp_colon;
|
24
|
+
|
25
|
+
static int cp_isnumpunc(OnigCodePoint cp) {
|
26
|
+
return (cp == cp_dot || cp == cp_comma || cp == cp_backslash || cp == cp_slash || cp == cp_underscore || cp == cp_dash);
|
27
|
+
}
|
28
|
+
|
29
|
+
static int cp_isurlpunc(OnigCodePoint cp) {
|
30
|
+
return (cp == cp_dot || cp == cp_slash || cp == cp_dash || cp == cp_underscore);
|
31
|
+
}
|
32
|
+
|
33
|
+
static int cp_enc_isurlc(OnigCodePoint cp, rb_encoding *enc) {
|
34
|
+
return (cp_isurlpunc(cp) || rb_enc_isalnum(cp, enc));
|
35
|
+
}
|
36
|
+
|
37
|
+
static int cp_isurlxatpunc(OnigCodePoint cp) {
|
38
|
+
return (cp == cp_dot || cp == cp_slash || cp == cp_dash || cp == cp_underscore || cp == cp_at);
|
39
|
+
}
|
40
|
+
|
41
|
+
static int cp_enc_isurlxatc(OnigCodePoint cp, rb_encoding *enc){
|
42
|
+
return (cp_isurlxatpunc(cp) || rb_enc_isalnum(cp, enc));
|
43
|
+
}
|
44
|
+
|
45
|
+
static bool cp_enc_istok(OnigCodePoint cp, rb_encoding *enc) {
|
46
|
+
if (rb_enc_isspace(cp, enc)) /* most common so check first. */
|
47
|
+
return false;
|
48
|
+
if (rb_enc_isalnum(cp, enc) || cp_isnumpunc(cp) ||
|
49
|
+
cp == cp_ampersand || cp == cp_at || cp == cp_apostrophe || cp == cp_colon) {
|
50
|
+
return true;
|
51
|
+
}
|
52
|
+
return false;
|
53
|
+
}
|
54
|
+
|
55
|
+
static inline int get_cp(char *start, char *end, int *cp_len, rb_encoding *enc) {
|
56
|
+
if (start >= end) {
|
57
|
+
*cp_len = 0;
|
58
|
+
return 0;
|
59
|
+
}
|
60
|
+
return rb_enc_codepoint_len(start, end, cp_len, enc);
|
61
|
+
}
|
62
|
+
|
63
|
+
/*****************************************************************************/
|
64
|
+
/*** FrtToken ****************************************************************/
|
65
|
+
/*****************************************************************************/
|
66
|
+
|
67
|
+
FrtToken *frt_tk_set(FrtToken *tk, char *text, int tlen, off_t start, off_t end, int pos_inc, rb_encoding *encoding) {
|
19
68
|
if (tlen >= FRT_MAX_WORD_SIZE) {
|
20
|
-
tlen = FRT_MAX_WORD_SIZE - 1;
|
69
|
+
tlen = FRT_MAX_WORD_SIZE - 1; // TODO: this may invalidate mbc's
|
70
|
+
}
|
71
|
+
|
72
|
+
if (encoding == utf8_encoding) {
|
73
|
+
memcpy(tk->text, text, sizeof(char) * tlen);
|
74
|
+
} else {
|
75
|
+
const unsigned char *sp = (unsigned char *)text;
|
76
|
+
unsigned char *dp = (unsigned char *)tk->text;
|
77
|
+
rb_econv_t *ec = rb_econv_open(rb_enc_name(encoding), "UTF-8", RUBY_ECONV_INVALID_REPLACE);
|
78
|
+
assert(ec != NULL);
|
79
|
+
rb_econv_convert(ec, &sp, (unsigned char *)text + tlen, &dp, (unsigned char *)tk->text + FRT_MAX_WORD_SIZE - 1, 0);
|
80
|
+
rb_econv_close(ec);
|
81
|
+
tlen = dp - (unsigned char *)tk->text;
|
21
82
|
}
|
22
|
-
memcpy(tk->text, text, sizeof(char) * tlen);
|
23
83
|
tk->text[tlen] = '\0';
|
24
|
-
tk->len = tlen;
|
25
|
-
tk->start = start;
|
26
|
-
tk->end = end;
|
84
|
+
tk->len = tlen; // in bytes in utf8_encoding
|
85
|
+
tk->start = start; // in original encoding
|
86
|
+
tk->end = end; // in original encoding
|
27
87
|
tk->pos_inc = pos_inc;
|
28
88
|
return tk;
|
29
89
|
}
|
30
90
|
|
31
|
-
static FrtToken *frt_tk_set_ts(FrtToken *tk, char *start, char *end,
|
32
|
-
|
33
|
-
{
|
34
|
-
return frt_tk_set(tk, start, (int)(end - start),
|
35
|
-
(off_t)(start - text), (off_t)(end - text), pos_inc);
|
91
|
+
static FrtToken *frt_tk_set_ts(FrtToken *tk, char *start, char *end, char *text, int pos_inc, rb_encoding *encoding) {
|
92
|
+
return frt_tk_set(tk, start, (int)(end - start), (off_t)(start - text), (off_t)(end - text), pos_inc, encoding);
|
36
93
|
}
|
37
94
|
|
38
|
-
FrtToken *frt_tk_set_no_len(FrtToken *tk,
|
39
|
-
|
40
|
-
{
|
41
|
-
return frt_tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
|
95
|
+
FrtToken *frt_tk_set_no_len(FrtToken *tk, char *text, off_t start, off_t end, int pos_inc, rb_encoding *encoding) {
|
96
|
+
return frt_tk_set(tk, text, (int)strlen(text), start, end, pos_inc, encoding);
|
42
97
|
}
|
43
98
|
|
44
|
-
|
45
|
-
off_t end, int pos_inc)
|
46
|
-
{
|
47
|
-
int len = wcstombs(tk->text, text, FRT_MAX_WORD_SIZE - 1);
|
48
|
-
tk->text[len] = '\0';
|
49
|
-
tk->len = len;
|
50
|
-
tk->start = start;
|
51
|
-
tk->end = end;
|
52
|
-
tk->pos_inc = pos_inc;
|
53
|
-
return tk;
|
54
|
-
}
|
55
|
-
|
56
|
-
int frt_tk_eq(FrtToken *tk1, FrtToken *tk2)
|
57
|
-
{
|
99
|
+
int frt_tk_eq(FrtToken *tk1, FrtToken *tk2) {
|
58
100
|
return (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
|
59
101
|
tk1->start == tk2->start && tk1->end == tk2->end &&
|
60
102
|
tk1->pos_inc == tk2->pos_inc);
|
61
103
|
}
|
62
104
|
|
63
|
-
int frt_tk_cmp(FrtToken *tk1, FrtToken *tk2)
|
64
|
-
{
|
105
|
+
int frt_tk_cmp(FrtToken *tk1, FrtToken *tk2) {
|
65
106
|
int cmp;
|
66
107
|
if (tk1->start > tk2->start) {
|
67
108
|
cmp = 1;
|
68
|
-
}
|
69
|
-
else if (tk1->start < tk2->start) {
|
109
|
+
} else if (tk1->start < tk2->start) {
|
70
110
|
cmp = -1;
|
71
|
-
}
|
72
|
-
else {
|
111
|
+
} else {
|
73
112
|
if (tk1->end > tk2->end) {
|
74
113
|
cmp = 1;
|
75
|
-
}
|
76
|
-
else if (tk1->end < tk2->end) {
|
114
|
+
} else if (tk1->end < tk2->end) {
|
77
115
|
cmp = -1;
|
78
|
-
}
|
79
|
-
else {
|
116
|
+
} else {
|
80
117
|
cmp = strcmp((char *)tk1->text, (char *)tk2->text);
|
81
118
|
}
|
82
119
|
}
|
83
120
|
return cmp;
|
84
121
|
}
|
85
122
|
|
86
|
-
void frt_tk_destroy(void *p)
|
87
|
-
{
|
123
|
+
void frt_tk_destroy(void *p) {
|
88
124
|
free(p);
|
89
125
|
}
|
90
126
|
|
91
|
-
FrtToken *frt_tk_new()
|
92
|
-
{
|
127
|
+
FrtToken *frt_tk_new(void) {
|
93
128
|
return FRT_ALLOC(FrtToken);
|
94
129
|
}
|
95
|
-
/****************************************************************************
|
96
|
-
*
|
97
|
-
* TokenStream
|
98
|
-
*
|
99
|
-
****************************************************************************/
|
100
130
|
|
101
|
-
|
102
|
-
|
103
|
-
|
131
|
+
/*****************************************************************************/
|
132
|
+
/*** FrtTokenStream **********************************************************/
|
133
|
+
/*****************************************************************************/
|
134
|
+
|
135
|
+
void frt_ts_deref(FrtTokenStream *ts) {
|
136
|
+
if (--ts->ref_cnt <= 0)
|
104
137
|
ts->destroy_i(ts);
|
105
|
-
}
|
106
138
|
}
|
107
139
|
|
108
|
-
|
109
|
-
{
|
140
|
+
FrtTokenStream *frt_ts_reset(FrtTokenStream *ts, char *text, rb_encoding *encoding) {
|
110
141
|
ts->t = ts->text = text;
|
142
|
+
ts->length = strlen(text);
|
143
|
+
ts->encoding = encoding;
|
111
144
|
return ts;
|
112
145
|
}
|
113
146
|
|
114
|
-
FrtTokenStream *frt_ts_clone_size(FrtTokenStream *orig_ts, size_t size)
|
115
|
-
{
|
147
|
+
FrtTokenStream *frt_ts_clone_size(FrtTokenStream *orig_ts, size_t size) {
|
116
148
|
FrtTokenStream *ts = (FrtTokenStream *)frt_ecalloc(size);
|
117
149
|
memcpy(ts, orig_ts, size);
|
118
150
|
ts->ref_cnt = 1;
|
151
|
+
ts->rts = 0;
|
152
|
+
ts->rts = Qnil;
|
119
153
|
return ts;
|
120
154
|
}
|
121
155
|
|
122
|
-
FrtTokenStream *
|
123
|
-
|
124
|
-
|
156
|
+
FrtTokenStream *frt_ts_alloc_i(size_t size) {
|
157
|
+
return (FrtTokenStream *)frt_ecalloc(size);
|
158
|
+
}
|
125
159
|
|
160
|
+
FrtTokenStream *frt_ts_init(FrtTokenStream *ts) {
|
126
161
|
ts->destroy_i = (void (*)(FrtTokenStream *))&free;
|
127
|
-
ts->reset = &
|
162
|
+
ts->reset = &frt_ts_reset;
|
128
163
|
ts->ref_cnt = 1;
|
129
|
-
|
164
|
+
ts->rts = Qnil;
|
130
165
|
return ts;
|
131
166
|
}
|
132
167
|
|
133
|
-
|
134
|
-
*
|
135
|
-
|
136
|
-
|
137
|
-
#define CTS(token_stream) ((FrtCachedTokenStream *)(token_stream))
|
138
|
-
|
139
|
-
static FrtTokenStream *cts_clone_i(FrtTokenStream *orig_ts)
|
140
|
-
{
|
141
|
-
return frt_ts_clone_size(orig_ts, sizeof(FrtCachedTokenStream));
|
168
|
+
FrtTokenStream *frt_ts_new_i(size_t size) {
|
169
|
+
FrtTokenStream *ts = frt_ts_alloc_i(size);
|
170
|
+
return frt_ts_init(ts);
|
142
171
|
}
|
143
172
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
ts->clone_i = &cts_clone_i;
|
148
|
-
return ts;
|
149
|
-
}
|
150
|
-
|
151
|
-
/* * Multi-byte TokenStream * */
|
152
|
-
|
153
|
-
#define MBTS(token_stream) ((FrtMultiByteTokenStream *)(token_stream))
|
154
|
-
|
155
|
-
static int mb_next_char(wchar_t *wchr, const char *s, mbstate_t *state)
|
156
|
-
{
|
157
|
-
int num_bytes;
|
158
|
-
if ((num_bytes = (int)mbrtowc(wchr, s, MB_CUR_MAX, state)) < 0) {
|
159
|
-
const char *t = s;
|
160
|
-
do {
|
161
|
-
t++;
|
162
|
-
FRT_ZEROSET(state, mbstate_t);
|
163
|
-
num_bytes = (int)mbrtowc(wchr, t, MB_CUR_MAX, state);
|
164
|
-
} while ((num_bytes < 0) && (*t != 0));
|
165
|
-
num_bytes = t - s;
|
166
|
-
if (*t == 0) *wchr = 0;
|
167
|
-
}
|
168
|
-
return num_bytes;
|
169
|
-
}
|
173
|
+
/*****************************************************************************/
|
174
|
+
/*** FrtCachedTokenStream ****************************************************/
|
175
|
+
/*****************************************************************************/
|
170
176
|
|
171
|
-
static FrtTokenStream *
|
172
|
-
|
173
|
-
FRT_ZEROSET(&(MBTS(ts)->state), mbstate_t);
|
174
|
-
ts_reset(ts, text);
|
175
|
-
return ts;
|
177
|
+
static FrtTokenStream *cts_clone_i(FrtTokenStream *orig_ts) {
|
178
|
+
return frt_ts_clone_size(orig_ts, sizeof(FrtTokenStream));
|
176
179
|
}
|
177
180
|
|
178
|
-
static FrtTokenStream *
|
179
|
-
|
180
|
-
return frt_ts_clone_size(orig_ts, sizeof(FrtMultiByteTokenStream));
|
181
|
+
static FrtTokenStream *frt_cts_alloc(void) {
|
182
|
+
return (FrtTokenStream *)frt_ecalloc(sizeof(FrtTokenStream));
|
181
183
|
}
|
182
184
|
|
183
|
-
static FrtTokenStream *
|
184
|
-
|
185
|
-
|
186
|
-
ts->
|
187
|
-
ts->clone_i = &mb_ts_clone_i;
|
185
|
+
static FrtTokenStream *frt_cts_init(FrtTokenStream *ts) {
|
186
|
+
frt_ts_init(ts);
|
187
|
+
ts->reset = &frt_ts_reset;
|
188
|
+
ts->clone_i = &cts_clone_i;
|
188
189
|
ts->ref_cnt = 1;
|
189
190
|
return ts;
|
190
191
|
}
|
191
192
|
|
192
|
-
|
193
|
-
*
|
194
|
-
|
195
|
-
*
|
196
|
-
****************************************************************************/
|
197
|
-
|
198
|
-
void frt_a_deref(FrtAnalyzer *a)
|
199
|
-
{
|
200
|
-
if (--a->ref_cnt <= 0) {
|
201
|
-
a->destroy_i(a);
|
202
|
-
}
|
203
|
-
}
|
204
|
-
|
205
|
-
static void frt_a_standard_destroy_i(FrtAnalyzer *a)
|
206
|
-
{
|
207
|
-
if (a->current_ts) {
|
208
|
-
frt_ts_deref(a->current_ts);
|
209
|
-
}
|
210
|
-
free(a);
|
211
|
-
}
|
212
|
-
|
213
|
-
static FrtTokenStream *a_standard_get_ts(FrtAnalyzer *a,
|
214
|
-
FrtSymbol field,
|
215
|
-
char *text)
|
216
|
-
{
|
217
|
-
FrtTokenStream *ts;
|
218
|
-
(void)field;
|
219
|
-
ts = frt_ts_clone(a->current_ts);
|
220
|
-
return ts->reset(ts, text);
|
193
|
+
static FrtTokenStream *frt_cts_new(void) {
|
194
|
+
FrtTokenStream *ts = frt_cts_alloc();
|
195
|
+
return frt_cts_init(ts);
|
221
196
|
}
|
222
197
|
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
FrtSymbol field,
|
227
|
-
char *text))
|
228
|
-
{
|
229
|
-
FrtAnalyzer *a = FRT_ALLOC(FrtAnalyzer);
|
230
|
-
a->current_ts = ts;
|
231
|
-
a->destroy_i = (destroy_i ? destroy_i : &frt_a_standard_destroy_i);
|
232
|
-
a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
|
233
|
-
a->ref_cnt = 1;
|
234
|
-
return a;
|
235
|
-
}
|
198
|
+
/*****************************************************************************/
|
199
|
+
/*** Tokenizer ***************************************************************/
|
200
|
+
/*****************************************************************************/
|
236
201
|
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
*
|
241
|
-
****************************************************************************/
|
202
|
+
/*****************************************************************************/
|
203
|
+
/*** FrtNonTokenizer *********************************************************/
|
204
|
+
/*****************************************************************************/
|
242
205
|
|
243
|
-
|
244
|
-
* NonTokenizer
|
245
|
-
*/
|
246
|
-
static FrtToken *nt_next(FrtTokenStream *ts)
|
247
|
-
{
|
206
|
+
static FrtToken *nt_next(FrtTokenStream *ts) {
|
248
207
|
if (ts->t) {
|
249
208
|
size_t len = strlen(ts->t);
|
250
209
|
ts->t = NULL;
|
251
|
-
|
252
|
-
|
253
|
-
}
|
254
|
-
else {
|
210
|
+
return frt_tk_set(&(ts->token), ts->text, len, 0, len, 1, ts->encoding);
|
211
|
+
} else {
|
255
212
|
return NULL;
|
256
213
|
}
|
257
214
|
}
|
258
215
|
|
259
|
-
FrtTokenStream *frt_non_tokenizer_new()
|
260
|
-
|
261
|
-
FrtTokenStream *ts = cts_new();
|
216
|
+
FrtTokenStream *frt_non_tokenizer_new(void) {
|
217
|
+
FrtTokenStream *ts = frt_cts_new();
|
262
218
|
ts->next = &nt_next;
|
263
219
|
return ts;
|
264
220
|
}
|
265
221
|
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
FrtAnalyzer *frt_non_analyzer_new()
|
270
|
-
{
|
271
|
-
return frt_analyzer_new(frt_non_tokenizer_new(), NULL, NULL);
|
272
|
-
}
|
273
|
-
|
274
|
-
/****************************************************************************
|
275
|
-
*
|
276
|
-
* Whitespace
|
277
|
-
*
|
278
|
-
****************************************************************************/
|
222
|
+
/*****************************************************************************/
|
223
|
+
/*** FrtWhiteSpaceTokenizer **************************************************/
|
224
|
+
/*****************************************************************************/
|
279
225
|
|
280
|
-
/*
|
281
|
-
* WhitespaceTokenizer
|
282
|
-
*/
|
283
226
|
static FrtToken *wst_next(FrtTokenStream *ts)
|
284
227
|
{
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
t++;
|
290
|
-
}
|
291
|
-
|
292
|
-
if (*t == '\0') {
|
293
|
-
return NULL;
|
294
|
-
}
|
295
|
-
|
296
|
-
start = t;
|
297
|
-
while (*t != '\0' && !isspace(*t)) {
|
298
|
-
t++;
|
299
|
-
}
|
300
|
-
|
301
|
-
ts->t = t;
|
302
|
-
return frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
303
|
-
}
|
304
|
-
|
305
|
-
FrtTokenStream *frt_whitespace_tokenizer_new()
|
306
|
-
{
|
307
|
-
FrtTokenStream *ts = cts_new();
|
308
|
-
ts->next = &wst_next;
|
309
|
-
return ts;
|
310
|
-
}
|
311
|
-
|
312
|
-
/*
|
313
|
-
* Multi-byte WhitespaceTokenizer
|
314
|
-
*/
|
315
|
-
static FrtToken *mb_wst_next(FrtTokenStream *ts)
|
316
|
-
{
|
317
|
-
int i;
|
228
|
+
int cp_len = 0;
|
229
|
+
OnigCodePoint cp;
|
230
|
+
rb_encoding *enc = ts->encoding;
|
231
|
+
char *end = ts->text + ts->length;
|
318
232
|
char *start;
|
319
233
|
char *t = ts->t;
|
320
|
-
wchar_t wchr;
|
321
|
-
mbstate_t *state = &(MBTS(ts)->state);
|
322
234
|
|
323
|
-
|
324
|
-
|
325
|
-
t += i;
|
326
|
-
i = mb_next_char(&wchr, t, state);
|
327
|
-
}
|
328
|
-
if (wchr == 0) {
|
235
|
+
cp = get_cp(t, end, &cp_len, enc);
|
236
|
+
if (cp < 1)
|
329
237
|
return NULL;
|
330
|
-
}
|
331
238
|
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
while (wchr != 0 && !iswspace(wchr)) {
|
336
|
-
t += i;
|
337
|
-
i = mb_next_char(&wchr, t, state);
|
239
|
+
while (cp_len > 0 && rb_enc_isspace(cp, enc)) {
|
240
|
+
t += cp_len;
|
241
|
+
cp = get_cp(t, end, &cp_len, enc);
|
338
242
|
}
|
339
|
-
ts->t = t;
|
340
|
-
return frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
341
|
-
}
|
342
|
-
|
343
|
-
/*
|
344
|
-
* Lowercasing Multi-byte WhitespaceTokenizer
|
345
|
-
*/
|
346
|
-
static FrtToken *mb_wst_next_lc(FrtTokenStream *ts)
|
347
|
-
{
|
348
|
-
int i;
|
349
|
-
char *start;
|
350
|
-
char *t = ts->t;
|
351
|
-
wchar_t wchr;
|
352
|
-
wchar_t wbuf[FRT_MAX_WORD_SIZE + 1], *w, *w_end;
|
353
|
-
mbstate_t *state = &(MBTS(ts)->state);
|
354
|
-
|
355
|
-
w = wbuf;
|
356
|
-
w_end = &wbuf[FRT_MAX_WORD_SIZE];
|
357
243
|
|
358
|
-
|
359
|
-
|
360
|
-
t += i;
|
361
|
-
i = mb_next_char(&wchr, t, state);
|
362
|
-
}
|
363
|
-
if (wchr == 0) {
|
244
|
+
start = t;
|
245
|
+
if (start >= end)
|
364
246
|
return NULL;
|
365
|
-
}
|
366
247
|
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
if (w < w_end) {
|
373
|
-
*w++ = towlower(wchr);
|
374
|
-
}
|
375
|
-
t += i;
|
376
|
-
i = mb_next_char(&wchr, t, state);
|
377
|
-
}
|
378
|
-
*w = 0;
|
248
|
+
do {
|
249
|
+
t += cp_len;
|
250
|
+
cp = get_cp(t, end, &cp_len, enc);
|
251
|
+
} while (cp_len > 0 && !rb_enc_isspace(cp, enc));
|
252
|
+
|
379
253
|
ts->t = t;
|
380
|
-
return
|
381
|
-
(off_t)(t - ts->text), 1);
|
254
|
+
return frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
|
382
255
|
}
|
383
256
|
|
384
|
-
FrtTokenStream *
|
385
|
-
|
386
|
-
FrtTokenStream *ts = mb_ts_new();
|
387
|
-
ts->next = lowercase ? &mb_wst_next_lc : &mb_wst_next;
|
388
|
-
return ts;
|
257
|
+
FrtTokenStream *frt_whitespace_tokenizer_alloc(void) {
|
258
|
+
return frt_cts_alloc();
|
389
259
|
}
|
390
260
|
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
{
|
396
|
-
FrtTokenStream *ts;
|
397
|
-
if (lowercase) {
|
398
|
-
ts = frt_lowercase_filter_new(frt_whitespace_tokenizer_new());
|
399
|
-
}
|
400
|
-
else {
|
401
|
-
ts = frt_whitespace_tokenizer_new();
|
402
|
-
}
|
403
|
-
return frt_analyzer_new(ts, NULL, NULL);
|
261
|
+
FrtTokenStream *frt_whitespace_tokenizer_init(FrtTokenStream *ts) {
|
262
|
+
ts = frt_cts_init(ts);
|
263
|
+
ts->next = &wst_next;
|
264
|
+
return ts;
|
404
265
|
}
|
405
266
|
|
406
|
-
|
407
|
-
|
408
|
-
return
|
267
|
+
FrtTokenStream *frt_whitespace_tokenizer_new(void) {
|
268
|
+
FrtTokenStream *ts = frt_whitespace_tokenizer_alloc();
|
269
|
+
return frt_whitespace_tokenizer_init(ts);
|
409
270
|
}
|
410
271
|
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
*
|
415
|
-
****************************************************************************/
|
272
|
+
/*****************************************************************************/
|
273
|
+
/*** FrtLetterTokenizer ******************************************************/
|
274
|
+
/*****************************************************************************/
|
416
275
|
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
276
|
+
static FrtToken *lt_next(FrtTokenStream *ts) {
|
277
|
+
int cp_len = 0;
|
278
|
+
OnigCodePoint cp;
|
279
|
+
rb_encoding *enc = ts->encoding;
|
280
|
+
char *end = ts->text + ts->length;
|
422
281
|
char *start;
|
423
282
|
char *t = ts->t;
|
424
283
|
|
425
|
-
|
426
|
-
|
427
|
-
}
|
428
|
-
|
429
|
-
if (*t == '\0') {
|
284
|
+
cp = get_cp(t, end, &cp_len, enc);
|
285
|
+
if (cp < 1)
|
430
286
|
return NULL;
|
431
|
-
}
|
432
287
|
|
433
|
-
|
434
|
-
|
435
|
-
t
|
436
|
-
}
|
437
|
-
|
438
|
-
ts->t = t;
|
439
|
-
return frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
440
|
-
}
|
441
|
-
|
442
|
-
FrtTokenStream *frt_letter_tokenizer_new()
|
443
|
-
{
|
444
|
-
FrtTokenStream *ts = cts_new();
|
445
|
-
ts->next = &lt_next;
|
446
|
-
return ts;
|
447
|
-
}
|
448
|
-
|
449
|
-
/*
|
450
|
-
* Multi-byte LetterTokenizer
|
451
|
-
*/
|
452
|
-
static FrtToken *mb_lt_next(FrtTokenStream *ts)
|
453
|
-
{
|
454
|
-
int i;
|
455
|
-
char *start;
|
456
|
-
char *t = ts->t;
|
457
|
-
wchar_t wchr;
|
458
|
-
mbstate_t *state = &(MBTS(ts)->state);
|
459
|
-
|
460
|
-
i = mb_next_char(&wchr, t, state);
|
461
|
-
while (wchr != 0 && !iswalpha(wchr)) {
|
462
|
-
t += i;
|
463
|
-
i = mb_next_char(&wchr, t, state);
|
464
|
-
}
|
465
|
-
|
466
|
-
if (wchr == 0) {
|
467
|
-
return NULL;
|
288
|
+
while (cp_len > 0 && !rb_enc_isalpha(cp, enc)) {
|
289
|
+
t += cp_len;
|
290
|
+
cp = get_cp(t, end, &cp_len, enc);
|
468
291
|
}
|
469
292
|
|
470
293
|
start = t;
|
471
|
-
|
472
|
-
i = mb_next_char(&wchr, t, state);
|
473
|
-
while (wchr != 0 && iswalpha(wchr)) {
|
474
|
-
t += i;
|
475
|
-
i = mb_next_char(&wchr, t, state);
|
476
|
-
}
|
477
|
-
ts->t = t;
|
478
|
-
return frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
479
|
-
}
|
480
|
-
|
481
|
-
/*
|
482
|
-
* Lowercasing Multi-byte LetterTokenizer
|
483
|
-
*/
|
484
|
-
static FrtToken *mb_lt_next_lc(FrtTokenStream *ts)
|
485
|
-
{
|
486
|
-
int i;
|
487
|
-
char *start;
|
488
|
-
char *t = ts->t;
|
489
|
-
wchar_t wchr;
|
490
|
-
wchar_t wbuf[FRT_MAX_WORD_SIZE + 1], *w, *w_end;
|
491
|
-
mbstate_t *state = &(MBTS(ts)->state);
|
492
|
-
|
493
|
-
w = wbuf;
|
494
|
-
w_end = &wbuf[FRT_MAX_WORD_SIZE];
|
495
|
-
|
496
|
-
i = mb_next_char(&wchr, t, state);
|
497
|
-
while (wchr != 0 && !iswalpha(wchr)) {
|
498
|
-
t += i;
|
499
|
-
i = mb_next_char(&wchr, t, state);
|
500
|
-
}
|
501
|
-
if (wchr == 0) {
|
502
|
-
return NULL;
|
503
|
-
}
|
504
|
-
|
505
|
-
start = t;
|
506
|
-
t += i;
|
507
|
-
*w++ = towlower(wchr);
|
508
|
-
i = mb_next_char(&wchr, t, state);
|
509
|
-
while (wchr != 0 && iswalpha(wchr)) {
|
510
|
-
if (w < w_end) {
|
511
|
-
*w++ = towlower(wchr);
|
512
|
-
}
|
513
|
-
t += i;
|
514
|
-
i = mb_next_char(&wchr, t, state);
|
515
|
-
}
|
516
|
-
*w = 0;
|
517
|
-
ts->t = t;
|
518
|
-
return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
|
519
|
-
(off_t)(t - ts->text), 1);
|
520
|
-
}
|
521
|
-
|
522
|
-
FrtTokenStream *frt_mb_letter_tokenizer_new(bool lowercase)
|
523
|
-
{
|
524
|
-
FrtTokenStream *ts = mb_ts_new();
|
525
|
-
ts->next = lowercase ? &mb_lt_next_lc : &mb_lt_next;
|
526
|
-
return ts;
|
527
|
-
}
|
528
|
-
|
529
|
-
/*
|
530
|
-
* LetterAnalyzers
|
531
|
-
*/
|
532
|
-
FrtAnalyzer *frt_letter_analyzer_new(bool lowercase)
|
533
|
-
{
|
534
|
-
FrtTokenStream *ts;
|
535
|
-
if (lowercase) {
|
536
|
-
ts = frt_lowercase_filter_new(frt_letter_tokenizer_new());
|
537
|
-
}
|
538
|
-
else {
|
539
|
-
ts = frt_letter_tokenizer_new();
|
540
|
-
}
|
541
|
-
return frt_analyzer_new(ts, NULL, NULL);
|
542
|
-
}
|
543
|
-
|
544
|
-
FrtAnalyzer *frt_mb_letter_analyzer_new(bool lowercase)
|
545
|
-
{
|
546
|
-
return frt_analyzer_new(frt_mb_letter_tokenizer_new(lowercase), NULL, NULL);
|
547
|
-
}
|
548
|
-
|
549
|
-
/****************************************************************************
|
550
|
-
*
|
551
|
-
* Standard
|
552
|
-
*
|
553
|
-
****************************************************************************/
|
554
|
-
|
555
|
-
#define STDTS(token_stream) ((FrtStandardTokenizer *)(token_stream))
|
556
|
-
|
557
|
-
/*
|
558
|
-
* FrtStandardTokenizer
|
559
|
-
*/
|
560
|
-
static FrtToken *std_next(FrtTokenStream *ts)
|
561
|
-
{
|
562
|
-
FrtStandardTokenizer *std_tz = STDTS(ts);
|
563
|
-
const char *start = NULL;
|
564
|
-
const char *end = NULL;
|
565
|
-
int len;
|
566
|
-
FrtToken *tk = &(CTS(ts)->token);
|
567
|
-
|
568
|
-
switch (std_tz->type) {
|
569
|
-
case FRT_STT_ASCII:
|
570
|
-
frt_std_scan(ts->t, tk->text, sizeof(tk->text) - 1,
|
571
|
-
&start, &end, &len);
|
572
|
-
break;
|
573
|
-
case FRT_STT_MB:
|
574
|
-
frt_std_scan_mb(ts->t, tk->text, sizeof(tk->text) - 1,
|
575
|
-
&start, &end, &len);
|
576
|
-
break;
|
577
|
-
case FRT_STT_UTF8:
|
578
|
-
frt_std_scan_utf8(ts->t, tk->text, sizeof(tk->text) - 1,
|
579
|
-
&start, &end, &len);
|
580
|
-
break;
|
581
|
-
}
|
582
|
-
|
583
|
-
if (len == 0)
|
294
|
+
if (start >= end)
|
584
295
|
return NULL;
|
585
296
|
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
tk->pos_inc = 1;
|
591
|
-
return &(CTS(ts)->token);
|
592
|
-
}
|
297
|
+
do {
|
298
|
+
t += cp_len;
|
299
|
+
cp = get_cp(t, end, &cp_len, enc);
|
300
|
+
} while (cp_len > 0 && rb_enc_isalpha(cp, enc));
|
593
301
|
|
594
|
-
|
595
|
-
|
596
|
-
return frt_ts_clone_size(orig_ts, sizeof(FrtStandardTokenizer));
|
302
|
+
ts->t = t;
|
303
|
+
return frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
|
597
304
|
}
|
598
305
|
|
599
|
-
|
600
|
-
|
601
|
-
FrtTokenStream *ts = frt_ts_new(FrtStandardTokenizer);
|
602
|
-
|
603
|
-
ts->clone_i = &std_ts_clone_i;
|
604
|
-
ts->next = &std_next;
|
605
|
-
|
606
|
-
return ts;
|
306
|
+
FrtTokenStream *frt_letter_tokenizer_alloc(void) {
|
307
|
+
return frt_cts_alloc();
|
607
308
|
}
|
608
309
|
|
609
|
-
FrtTokenStream *
|
610
|
-
|
611
|
-
|
612
|
-
STDTS(ts)->type = FRT_STT_ASCII;
|
613
|
-
return ts;
|
614
|
-
}
|
615
|
-
|
616
|
-
FrtTokenStream *frt_mb_standard_tokenizer_new()
|
617
|
-
{
|
618
|
-
FrtTokenStream *ts = std_ts_new();
|
619
|
-
STDTS(ts)->type = FRT_STT_MB;
|
310
|
+
FrtTokenStream *frt_letter_tokenizer_init(FrtTokenStream *ts) {
|
311
|
+
ts = frt_cts_init(ts);
|
312
|
+
ts->next = &lt_next;
|
620
313
|
return ts;
|
621
314
|
}
|
622
315
|
|
623
|
-
FrtTokenStream *
|
624
|
-
|
625
|
-
|
626
|
-
STDTS(ts)->type = FRT_STT_UTF8;
|
627
|
-
return ts;
|
316
|
+
FrtTokenStream *frt_letter_tokenizer_new(void) {
|
317
|
+
FrtTokenStream *ts = frt_letter_tokenizer_alloc();
|
318
|
+
return frt_letter_tokenizer_init(ts);
|
628
319
|
}
|
629
320
|
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
*
|
634
|
-
****************************************************************************/
|
635
|
-
|
636
|
-
#define LSTDTS(token_stream) ((FrtLegacyStandardTokenizer *)(token_stream))
|
321
|
+
/*****************************************************************************/
|
322
|
+
/*** FrtStandardTokenizer ****************************************************/
|
323
|
+
/*****************************************************************************/
|
637
324
|
|
638
|
-
|
639
|
-
*
|
640
|
-
*/
|
641
|
-
static int legacy_std_get_alpha(FrtTokenStream *ts, char *token)
|
642
|
-
{
|
643
|
-
int i = 0;
|
325
|
+
static int std_get_alnum(FrtTokenStream *ts, char *token, OnigCodePoint cp, int *cp_len_p, OnigCodePoint *cp_out_p, rb_encoding *enc) {
|
326
|
+
char *end = ts->text + ts->length;
|
644
327
|
char *t = ts->t;
|
645
|
-
|
646
|
-
|
647
|
-
token[i] = t[i];
|
648
|
-
}
|
649
|
-
i++;
|
650
|
-
}
|
651
|
-
return i;
|
652
|
-
}
|
653
|
-
|
654
|
-
static int mb_legacy_std_get_alpha(FrtTokenStream *ts, char *token)
|
655
|
-
{
|
656
|
-
char *t = ts->t;
|
657
|
-
wchar_t wchr;
|
658
|
-
int i;
|
659
|
-
mbstate_t state; FRT_ZEROSET(&state, mbstate_t);
|
328
|
+
char *tt = ts->t;
|
329
|
+
int cp_len = *cp_len_p;
|
660
330
|
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
t +=
|
665
|
-
|
666
|
-
}
|
667
|
-
|
668
|
-
i = (int)(t - ts->t);
|
669
|
-
if (i > FRT_MAX_WORD_SIZE) {
|
670
|
-
i = FRT_MAX_WORD_SIZE - 1;
|
331
|
+
while (cp > 0 && rb_enc_isalnum(cp, enc)) {
|
332
|
+
if ((t - ts->t + cp_len) < FRT_MAX_WORD_SIZE)
|
333
|
+
tt += cp_len;
|
334
|
+
t += cp_len;
|
335
|
+
cp = get_cp(t, end, &cp_len, enc);
|
671
336
|
}
|
672
|
-
memcpy(token, ts->t, i);
|
673
|
-
return i;
|
674
|
-
}
|
675
|
-
|
676
|
-
static int isnumpunc(char c)
|
677
|
-
{
|
678
|
-
return (c == '.' || c == ',' || c == '\\' || c == '/' || c == '_'
|
679
|
-
|| c == '-');
|
680
|
-
}
|
681
|
-
|
682
|
-
static int w_isnumpunc(wchar_t c)
|
683
|
-
{
|
684
|
-
return (c == L'.' || c == L',' || c == L'\\' || c == L'/' || c == L'_'
|
685
|
-
|| c == L'-');
|
686
|
-
}
|
687
|
-
|
688
|
-
static int isurlpunc(char c)
|
689
|
-
{
|
690
|
-
return (c == '.' || c == '/' || c == '-' || c == '_');
|
691
|
-
}
|
692
|
-
|
693
|
-
static int isurlc(char c)
|
694
|
-
{
|
695
|
-
return (c == '.' || c == '/' || c == '-' || c == '_' || isalnum(c));
|
696
|
-
}
|
697
337
|
|
698
|
-
|
699
|
-
|
700
|
-
return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@');
|
701
|
-
}
|
702
|
-
|
703
|
-
static int isurlxatc(char c)
|
704
|
-
{
|
705
|
-
return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@'
|
706
|
-
|| isalnum(c));
|
707
|
-
}
|
338
|
+
memcpy(token, ts->t, tt - ts->t);
|
339
|
+
token[tt - ts->t] = '\0';
|
708
340
|
|
709
|
-
|
710
|
-
|
711
|
-
|
712
|
-
return false; /* most common so check first. */
|
713
|
-
}
|
714
|
-
if (isalnum(*c) || isnumpunc(*c) || *c == '&' ||
|
715
|
-
*c == '@' || *c == '\'' || *c == ':') {
|
716
|
-
return true;
|
717
|
-
}
|
718
|
-
return false;
|
719
|
-
}
|
720
|
-
|
721
|
-
static bool mb_legacy_std_is_tok_char(char *t)
|
722
|
-
{
|
723
|
-
wchar_t c;
|
724
|
-
mbstate_t state; FRT_ZEROSET(&state, mbstate_t);
|
725
|
-
|
726
|
-
if (((int)mbrtowc(&c, t, MB_CUR_MAX, &state)) < 0) {
|
727
|
-
/* error which we can handle next time round. For now just return
|
728
|
-
* false so that we can return a token */
|
729
|
-
return false;
|
730
|
-
}
|
731
|
-
if (iswspace(c)) {
|
732
|
-
return false; /* most common so check first. */
|
733
|
-
}
|
734
|
-
if (iswalnum(c) || w_isnumpunc(c) || c == L'&' || c == L'@' || c == L'\''
|
735
|
-
|| c == L':') {
|
736
|
-
return true;
|
737
|
-
}
|
738
|
-
return false;
|
341
|
+
*cp_out_p = cp;
|
342
|
+
*cp_len_p = cp_len;
|
343
|
+
return t - ts->t;
|
739
344
|
}
|
740
345
|
|
741
346
|
/* (alnum)((punc)(alnum))+ where every second sequence of alnum must contain at
|
@@ -743,242 +348,261 @@ static bool mb_legacy_std_is_tok_char(char *t)
|
|
743
348
|
* (alnum) = [a-zA-Z0-9]
|
744
349
|
* (punc) = [_\/.,-]
|
745
350
|
*/
|
746
|
-
static int
|
747
|
-
|
748
|
-
|
749
|
-
|
351
|
+
static int std_get_number(FrtTokenStream *ts, char *start, char *end, OnigCodePoint cp, int cp_len_a, rb_encoding *enc) {
|
352
|
+
|
353
|
+
OnigCodePoint cp_1 = 0;
|
354
|
+
char *t = start;
|
355
|
+
int cp_len = cp_len_a;
|
356
|
+
int cp_1_len = 0;
|
750
357
|
int last_seen_digit = 2;
|
751
358
|
int seen_digit = false;
|
752
359
|
|
753
|
-
while (last_seen_digit >= 0) {
|
754
|
-
while ((
|
755
|
-
if ((last_seen_digit < 2) &&
|
360
|
+
while (cp > 0 && last_seen_digit >= 0) {
|
361
|
+
while ((cp > 0) && rb_enc_isalnum(cp, enc)) {
|
362
|
+
if ((last_seen_digit < 2) && rb_enc_isdigit(cp, enc)) {
|
756
363
|
last_seen_digit = 2;
|
757
364
|
}
|
758
|
-
if ((seen_digit == false) &&
|
365
|
+
if ((seen_digit == false) && rb_enc_isdigit(cp, enc)) {
|
759
366
|
seen_digit = true;
|
760
367
|
}
|
761
|
-
|
368
|
+
t += cp_len;
|
369
|
+
cp = get_cp(t, end, &cp_len, enc);
|
762
370
|
}
|
763
371
|
last_seen_digit--;
|
764
|
-
|
765
|
-
|
766
|
-
if (last_seen_digit >= 0) {
|
767
|
-
count = i;
|
768
|
-
}
|
372
|
+
cp_1 = get_cp(t + cp_len, end, &cp_1_len, enc);
|
373
|
+
if (!cp_isnumpunc(cp) || !rb_enc_isalnum(cp_1, enc)) {
|
769
374
|
break;
|
770
375
|
}
|
771
|
-
|
772
|
-
|
376
|
+
t += cp_len;
|
377
|
+
cp = cp_1;
|
378
|
+
cp_len = cp_1_len;
|
773
379
|
}
|
774
380
|
if (seen_digit) {
|
775
|
-
return
|
776
|
-
}
|
777
|
-
else {
|
381
|
+
return t - start;
|
382
|
+
} else {
|
778
383
|
return 0;
|
779
384
|
}
|
780
385
|
}
|
781
386
|
|
782
|
-
static int
|
783
|
-
|
387
|
+
static int std_get_apostrophe(FrtTokenStream *ts, char *input, OnigCodePoint cp, int *cp_len_p, rb_encoding *enc) {
|
388
|
+
int cp_len = *cp_len_p;
|
389
|
+
char *end = ts->text + ts->length;
|
784
390
|
char *t = input;
|
785
391
|
|
786
|
-
while (
|
787
|
-
t
|
392
|
+
while (cp_len > 0 && (rb_enc_isalpha(cp, enc) || cp == cp_apostrophe)) {
|
393
|
+
t += cp_len;
|
394
|
+
cp = get_cp(t, end, &cp_len, enc);
|
788
395
|
}
|
789
|
-
|
790
396
|
return (int)(t - input);
|
791
397
|
}
|
792
398
|
|
793
|
-
static
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
int
|
798
|
-
|
799
|
-
|
800
|
-
|
399
|
+
static char *std_get_url(FrtTokenStream *ts, char *start, char *end, char *token, int *len, int bufred) {
|
400
|
+
rb_encoding *enc = ts->encoding;
|
401
|
+
OnigCodePoint cp;
|
402
|
+
OnigCodePoint prev_cp = 0;
|
403
|
+
int cp_len = 0;
|
404
|
+
int prev_cp_len = 0;
|
405
|
+
char *t = start;
|
406
|
+
char *tt = start;
|
801
407
|
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
}
|
806
|
-
return (int)(t - input);
|
807
|
-
}
|
808
|
-
|
809
|
-
static char *std_get_url(char *input, char *token, int i, int *len)
|
810
|
-
{
|
811
|
-
char *next = NULL;
|
812
|
-
while (isurlc(input[i])) {
|
813
|
-
if (isurlpunc(input[i]) && isurlpunc(input[i - 1])) {
|
408
|
+
cp = get_cp(t, end, &cp_len, enc);
|
409
|
+
while (cp > 0 && cp_enc_isurlc(cp, enc)) {
|
410
|
+
if (cp_isurlpunc(cp) && cp_isurlpunc(prev_cp)) {
|
814
411
|
break; /* can't have two puncs in a row */
|
815
412
|
}
|
816
|
-
|
817
|
-
|
818
|
-
|
819
|
-
|
413
|
+
prev_cp = cp;
|
414
|
+
prev_cp_len = cp_len;
|
415
|
+
t += cp_len;
|
416
|
+
if (((t + cp_len) - start) <= (FRT_MAX_WORD_SIZE - bufred))
|
417
|
+
tt += cp_len;
|
418
|
+
cp = get_cp(t, end, &cp_len, enc);
|
820
419
|
}
|
821
|
-
next = input + i;
|
822
420
|
|
823
|
-
/*
|
824
|
-
if (
|
825
|
-
|
421
|
+
/* strip trailing punc */
|
422
|
+
if (t == tt && cp_isurlpunc(prev_cp)) {
|
423
|
+
tt -= prev_cp_len;
|
826
424
|
}
|
827
425
|
|
828
|
-
|
829
|
-
|
830
|
-
|
831
|
-
}
|
832
|
-
*len = i;
|
833
|
-
token[i] = '\0';
|
426
|
+
*len = (tt - start) + bufred;
|
427
|
+
memcpy(token, start, tt - start);
|
428
|
+
token[tt - start] = '\0';
|
834
429
|
|
835
|
-
return
|
430
|
+
return t;
|
836
431
|
}
|
837
432
|
|
838
|
-
/* Company names can contain '@' and '&' like AT&T and Excite@Home.
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
i++;
|
845
|
-
}
|
433
|
+
/* Company names can contain '@' and '&' like AT&T and Excite@Home. */
|
434
|
+
static int std_get_company_name(FrtTokenStream *ts, char *start, char* end) {
|
435
|
+
rb_encoding *enc = ts->encoding;
|
436
|
+
char * t = start;
|
437
|
+
OnigCodePoint cp;
|
438
|
+
int cp_len = 0;
|
846
439
|
|
847
|
-
|
848
|
-
|
849
|
-
|
850
|
-
|
851
|
-
{
|
852
|
-
char *t = ts->t;
|
853
|
-
while (*t != '\0' && !isalnum(*t)) {
|
854
|
-
if (isnumpunc(*t) && isdigit(t[1])) break;
|
855
|
-
t++;
|
440
|
+
cp = get_cp(t, end, &cp_len, enc);
|
441
|
+
while (cp > 0 && (rb_enc_isalpha(cp, enc) || cp == cp_at || cp == cp_ampersand)) {
|
442
|
+
t += cp_len;
|
443
|
+
cp = get_cp(t, end, &cp_len, enc);
|
856
444
|
}
|
857
445
|
|
858
|
-
|
859
|
-
|
860
|
-
return (*t != '\0');
|
446
|
+
return t - start;
|
861
447
|
}
|
862
448
|
|
863
|
-
static
|
864
|
-
|
865
|
-
int
|
866
|
-
|
867
|
-
|
868
|
-
|
869
|
-
|
449
|
+
static int std_advance_to_start(FrtTokenStream *ts, int *cp_len_p, OnigCodePoint *cp_out_p, rb_encoding *enc) {
|
450
|
+
int cp_len = 0;
|
451
|
+
int cp_next = 0;
|
452
|
+
int cp_len_next = 0;
|
453
|
+
OnigCodePoint cp;
|
454
|
+
char *end = ts->text + ts->length;
|
455
|
+
char *t = ts->t;
|
870
456
|
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
|
457
|
+
cp = get_cp(t, end, &cp_len, enc);
|
458
|
+
while (cp > 0 && !rb_enc_isalnum(cp, enc)) {
|
459
|
+
if (cp_isnumpunc(cp)) {
|
460
|
+
cp_next = get_cp(t + cp_len, end, &cp_len_next, enc);
|
461
|
+
if (cp_next > 0 && rb_enc_isdigit(cp_next, enc))
|
462
|
+
break;
|
463
|
+
}
|
464
|
+
t += cp_len;
|
465
|
+
cp = get_cp(t, end, &cp_len, enc);
|
875
466
|
}
|
876
|
-
|
877
|
-
|
467
|
+
ts->t = t;
|
468
|
+
*cp_out_p = cp;
|
469
|
+
*cp_len_p = cp_len;
|
470
|
+
return (t < end);
|
878
471
|
}
|
879
472
|
|
880
|
-
static FrtToken *
|
881
|
-
{
|
882
|
-
FrtLegacyStandardTokenizer *std_tz = LSTDTS(ts);
|
473
|
+
static FrtToken *std_next(FrtTokenStream *ts) {
|
883
474
|
char *s;
|
884
475
|
char *t;
|
885
476
|
char *start = NULL;
|
477
|
+
char *end;
|
886
478
|
char *num_end = NULL;
|
887
479
|
char token[FRT_MAX_WORD_SIZE + 1];
|
480
|
+
OnigCodePoint cp = 0;
|
481
|
+
OnigCodePoint cp_1 = 0;
|
482
|
+
OnigCodePoint cp_2 = 0;
|
483
|
+
OnigCodePoint prev_cp = 0;
|
484
|
+
int cp_len = 0;
|
485
|
+
int cp_1_len = 0;
|
486
|
+
int cp_2_len = 0;
|
888
487
|
int token_i = 0;
|
889
488
|
int len;
|
890
489
|
bool is_acronym;
|
891
490
|
bool seen_at_symbol;
|
491
|
+
rb_encoding *enc = ts->encoding;
|
892
492
|
|
893
|
-
|
894
|
-
if (!
|
493
|
+
/* advance to start and return first cp and len */
|
494
|
+
if (!std_advance_to_start(ts, &cp_len, &cp, enc))
|
895
495
|
return NULL;
|
896
|
-
}
|
897
496
|
|
497
|
+
end = ts->text + ts->length;
|
898
498
|
start = t = ts->t;
|
899
|
-
|
499
|
+
|
500
|
+
/* get all alnums */
|
501
|
+
token_i = std_get_alnum(ts, token, cp, &cp_len, &cp, enc);
|
900
502
|
t += token_i;
|
901
503
|
|
902
|
-
if (
|
504
|
+
if (t >= end && token_i > 0) {
|
505
|
+
ts->t += token_i;
|
506
|
+
return frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
|
507
|
+
}
|
508
|
+
|
509
|
+
// already got cp and cp_len from get_alnum above
|
510
|
+
// cp = get_cp(t, end, &cp_len, enc);
|
511
|
+
if (cp < 1)
|
512
|
+
return NULL;
|
513
|
+
|
514
|
+
if (!cp_enc_istok(cp, enc)) {
|
903
515
|
/* very common case, ie a plain word, so check and return */
|
904
|
-
ts->t = t;
|
905
|
-
return frt_tk_set_ts(&(
|
516
|
+
ts->t = t + cp_len;
|
517
|
+
return frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
|
906
518
|
}
|
907
519
|
|
908
|
-
if (
|
909
|
-
t +=
|
520
|
+
if (cp == cp_apostrophe) { /* apostrophe case. */
|
521
|
+
t += std_get_apostrophe(ts, t, cp, &cp_len, enc);
|
910
522
|
ts->t = t;
|
911
523
|
len = (int)(t - start);
|
912
524
|
/* strip possessive */
|
525
|
+
/* TODO: won't work with multibyte */
|
913
526
|
if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') {
|
914
527
|
t -= 2;
|
915
|
-
frt_tk_set_ts(&(
|
916
|
-
|
528
|
+
frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
|
529
|
+
ts->token.end += 2;
|
917
530
|
}
|
918
531
|
else if (t[-1] == '\'') {
|
919
532
|
t -= 1;
|
920
|
-
frt_tk_set_ts(&(
|
921
|
-
|
533
|
+
frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
|
534
|
+
ts->token.end += 1;
|
922
535
|
}
|
923
536
|
else {
|
924
|
-
frt_tk_set_ts(&(
|
537
|
+
frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
|
925
538
|
}
|
926
|
-
|
927
|
-
return &(CTS(ts)->token);
|
539
|
+
return &(ts->token);
|
928
540
|
}
|
929
541
|
|
930
|
-
|
931
|
-
|
542
|
+
// already got cp and cp_len from get_alnum above
|
543
|
+
// cp = get_cp(t, end, &cp_len, enc);
|
544
|
+
if (cp == cp_ampersand) { /* ampersand case. */
|
545
|
+
t += std_get_company_name(ts, t, end);
|
932
546
|
ts->t = t;
|
933
|
-
return frt_tk_set_ts(&(
|
547
|
+
return frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
|
934
548
|
}
|
935
549
|
|
936
|
-
|
937
|
-
|
550
|
+
// already got cp and cp_len from get_alnum above
|
551
|
+
// cp = get_cp(start, end, &cp_len, enc);
|
552
|
+
if ((rb_enc_isdigit(cp, enc) || cp_isnumpunc(cp))
|
553
|
+
&& ((len = std_get_number(ts, start, end, cp, cp_len, enc)) > 0)) { /* possibly a number */
|
938
554
|
num_end = start + len;
|
939
|
-
|
555
|
+
cp = get_cp(num_end, end, &cp_len, enc);
|
556
|
+
if (cp > 0 && !cp_enc_istok(cp, enc)) { /* won't find a longer token */
|
940
557
|
ts->t = num_end;
|
941
|
-
return frt_tk_set_ts(&(
|
558
|
+
return frt_tk_set_ts(&(ts->token), start, num_end, ts->text, 1, enc);
|
942
559
|
}
|
943
560
|
/* else there may be a longer token so check */
|
944
561
|
}
|
945
562
|
|
946
|
-
|
563
|
+
// already got cp and cp_len from get_alnum or the last block above
|
564
|
+
// cp = get_cp(t, end, &cp_len, enc);
|
565
|
+
cp_1 = get_cp(t + cp_len, end, &cp_1_len, enc);
|
566
|
+
cp_2 = get_cp(t + cp_len + cp_1_len, end, &cp_2_len, enc);
|
567
|
+
if (cp == cp_colon && cp_1 == cp_slash && cp_2 == cp_slash) {
|
947
568
|
/* check for a known url start */
|
948
569
|
token[token_i] = '\0';
|
949
|
-
t +=
|
950
|
-
token_i +=
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
|
955
|
-
(memcmp(token, "ftp", 3) == 0 ||
|
956
|
-
memcmp(token, "http", 4) == 0 ||
|
957
|
-
memcmp(token, "https", 5) == 0 ||
|
958
|
-
memcmp(token, "file", 4) == 0)) {
|
959
|
-
ts->t = std_get_url(t, token, 0, &len); /* dispose of first part of the URL */
|
570
|
+
t += cp_len + cp_1_len + cp_2_len;
|
571
|
+
token_i += cp_len + cp_1_len + cp_2_len;
|
572
|
+
cp = get_cp(t, end, &cp_len, enc);
|
573
|
+
while (cp > 0 && cp == cp_slash) {
|
574
|
+
t += cp_len;
|
575
|
+
cp = get_cp(t, end, &cp_len, enc);
|
960
576
|
}
|
961
|
-
|
577
|
+
if (rb_enc_isalpha(cp, enc) &&
|
578
|
+
(memcmp(token, "ftp", 3) == 0 ||
|
579
|
+
memcmp(token, "http", 4) == 0 ||
|
580
|
+
memcmp(token, "https", 5) == 0 ||
|
581
|
+
memcmp(token, "file", 4) == 0)) {
|
582
|
+
ts->t = std_get_url(ts, t, end, token, &len, 0); /* dispose of first part of the URL */
|
583
|
+
} else { /* still treat as url but keep the first part */
|
962
584
|
token_i = (int)(t - start);
|
963
585
|
memcpy(token, start, token_i * sizeof(char));
|
964
|
-
ts->t = std_get_url(
|
586
|
+
ts->t = std_get_url(ts, t, end, token + token_i, &len, token_i); /* keep start */
|
965
587
|
}
|
966
|
-
return frt_tk_set(&(
|
588
|
+
return frt_tk_set(&(ts->token), token, len,
|
967
589
|
(off_t)(start - ts->text),
|
968
|
-
(off_t)(ts->t - ts->text), 1);
|
590
|
+
(off_t)(ts->t - ts->text), 1, enc);
|
969
591
|
}
|
970
592
|
|
971
593
|
/* now see how long a url we can find. */
|
972
594
|
is_acronym = true;
|
973
595
|
seen_at_symbol = false;
|
974
|
-
|
975
|
-
|
596
|
+
|
597
|
+
cp = get_cp(t, end, &cp_len, enc);
|
598
|
+
while (cp_enc_isurlxatc(cp, enc)) {
|
599
|
+
if (is_acronym && !rb_enc_isalpha(cp, enc) && (cp != cp_dot)) {
|
976
600
|
is_acronym = false;
|
977
601
|
}
|
978
|
-
if (
|
602
|
+
if (cp_isurlxatpunc(cp) && cp_isurlxatpunc(prev_cp)) {
|
979
603
|
break; /* can't have two punctuation characters in a row */
|
980
604
|
}
|
981
|
-
if (
|
605
|
+
if (cp == cp_at) {
|
982
606
|
if (seen_at_symbol) {
|
983
607
|
break; /* we can only have one @ symbol */
|
984
608
|
}
|
@@ -986,10 +610,12 @@ static FrtToken *legacy_std_next(FrtTokenStream *ts)
|
|
986
610
|
seen_at_symbol = true;
|
987
611
|
}
|
988
612
|
}
|
989
|
-
|
613
|
+
prev_cp = cp;
|
614
|
+
t += cp_len;
|
615
|
+
cp = get_cp(t, end, &cp_len, enc);
|
990
616
|
}
|
991
|
-
|
992
|
-
t
|
617
|
+
if (cp_isurlxatpunc(prev_cp) && t > ts->t) {
|
618
|
+
t -= cp_len; /* strip trailing punctuation */
|
993
619
|
}
|
994
620
|
|
995
621
|
if (t < ts->t || (num_end != NULL && num_end < ts->t)) {
|
@@ -999,140 +625,119 @@ static FrtToken *legacy_std_next(FrtTokenStream *ts)
|
|
999
625
|
ts->t = t;
|
1000
626
|
|
1001
627
|
if (is_acronym) { /* check it is one letter followed by one '.' */
|
1002
|
-
|
1003
|
-
|
628
|
+
cp_len = 0;
|
629
|
+
for (s = start; s < t - 1; s += cp_len) {
|
630
|
+
cp = get_cp(s, end, &cp_len, enc);
|
631
|
+
cp_1 = get_cp(s + cp_len, end, &cp_1_len, enc);
|
632
|
+
if (rb_enc_isalpha(cp, enc) && (cp_1 != cp_dot))
|
1004
633
|
is_acronym = false;
|
1005
634
|
}
|
1006
635
|
}
|
1007
636
|
if (is_acronym) { /* strip '.'s */
|
1008
|
-
|
1009
|
-
|
1010
|
-
|
1011
|
-
|
637
|
+
cp_len = 0;
|
638
|
+
for (s = start + token_i; s < t; s += cp_len) {
|
639
|
+
cp = get_cp(s, end, &cp_len, enc);
|
640
|
+
if (cp > 0 && cp != cp_dot) {
|
641
|
+
memcpy(token + token_i, s, cp_len);
|
642
|
+
token_i += cp_len;
|
1012
643
|
}
|
1013
644
|
}
|
1014
|
-
|
645
|
+
token[token_i] = '\0';
|
646
|
+
frt_tk_set(&(ts->token), token, token_i,
|
1015
647
|
(off_t)(start - ts->text),
|
1016
|
-
(off_t)(t - ts->text), 1);
|
1017
|
-
}
|
1018
|
-
|
1019
|
-
frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
648
|
+
(off_t)(t - ts->text), 1, enc);
|
649
|
+
} else { /* just return the url as is */
|
650
|
+
frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
|
1020
651
|
}
|
1021
|
-
}
|
1022
|
-
else { /* return the number */
|
652
|
+
} else { /* return the number */
|
1023
653
|
ts->t = num_end;
|
1024
|
-
frt_tk_set_ts(&(
|
654
|
+
frt_tk_set_ts(&(ts->token), start, num_end, ts->text, 1, enc);
|
1025
655
|
}
|
1026
|
-
|
1027
|
-
return &(CTS(ts)->token);
|
656
|
+
return &(ts->token);
|
1028
657
|
}
|
1029
658
|
|
1030
|
-
static FrtTokenStream *
|
1031
|
-
|
1032
|
-
return frt_ts_clone_size(orig_ts, sizeof(FrtLegacyStandardTokenizer));
|
659
|
+
static FrtTokenStream *std_ts_clone_i(FrtTokenStream *orig_ts) {
|
660
|
+
return frt_ts_clone_size(orig_ts, sizeof(FrtTokenStream));
|
1033
661
|
}
|
1034
662
|
|
1035
|
-
|
1036
|
-
|
1037
|
-
FrtTokenStream *ts = frt_ts_new(FrtLegacyStandardTokenizer);
|
1038
|
-
|
1039
|
-
ts->clone_i = &legacy_std_ts_clone_i;
|
1040
|
-
ts->next = &legacy_std_next;
|
1041
|
-
|
1042
|
-
return ts;
|
663
|
+
FrtTokenStream *frt_standard_tokenizer_alloc(void) {
|
664
|
+
return (FrtTokenStream *)frt_ecalloc(sizeof(FrtTokenStream));
|
1043
665
|
}
|
1044
666
|
|
1045
|
-
FrtTokenStream *
|
1046
|
-
|
1047
|
-
|
1048
|
-
|
1049
|
-
LSTDTS(ts)->advance_to_start = &legacy_std_advance_to_start;
|
1050
|
-
LSTDTS(ts)->get_alpha = &legacy_std_get_alpha;
|
1051
|
-
LSTDTS(ts)->is_tok_char = &legacy_std_is_tok_char;
|
1052
|
-
LSTDTS(ts)->get_apostrophe = &legacy_std_get_apostrophe;
|
1053
|
-
|
667
|
+
FrtTokenStream *frt_standard_tokenizer_init(FrtTokenStream *ts) {
|
668
|
+
ts = frt_ts_init(ts);
|
669
|
+
ts->clone_i = &std_ts_clone_i;
|
670
|
+
ts->next = &std_next;
|
1054
671
|
return ts;
|
1055
672
|
}
|
1056
673
|
|
1057
|
-
FrtTokenStream *
|
1058
|
-
|
1059
|
-
|
1060
|
-
|
1061
|
-
LSTDTS(ts)->advance_to_start = &mb_legacy_std_advance_to_start;
|
1062
|
-
LSTDTS(ts)->get_alpha = &mb_legacy_std_get_alpha;
|
1063
|
-
LSTDTS(ts)->is_tok_char = &mb_legacy_std_is_tok_char;
|
1064
|
-
LSTDTS(ts)->get_apostrophe = &mb_legacy_std_get_apostrophe;
|
1065
|
-
|
1066
|
-
return ts;
|
674
|
+
FrtTokenStream *frt_standard_tokenizer_new(void) {
|
675
|
+
FrtTokenStream *ts = frt_standard_tokenizer_alloc();
|
676
|
+
return frt_standard_tokenizer_init(ts);
|
1067
677
|
}
|
1068
678
|
|
1069
|
-
|
1070
|
-
|
1071
|
-
|
1072
|
-
*
|
1073
|
-
****************************************************************************/
|
679
|
+
/*****************************************************************************/
|
680
|
+
/*** FrtFilters **************************************************************/
|
681
|
+
/*****************************************************************************/
|
1074
682
|
|
1075
683
|
#define TkFilt(filter) ((FrtTokenFilter *)(filter))
|
1076
684
|
|
1077
|
-
FrtTokenStream *frt_filter_clone_size(FrtTokenStream *ts, size_t size)
|
1078
|
-
{
|
685
|
+
FrtTokenStream *frt_filter_clone_size(FrtTokenStream *ts, size_t size) {
|
1079
686
|
FrtTokenStream *ts_new = frt_ts_clone_size(ts, size);
|
1080
687
|
TkFilt(ts_new)->sub_ts = TkFilt(ts)->sub_ts->clone_i(TkFilt(ts)->sub_ts);
|
1081
688
|
return ts_new;
|
1082
689
|
}
|
1083
690
|
|
1084
|
-
static FrtTokenStream *filter_clone_i(FrtTokenStream *ts)
|
1085
|
-
{
|
691
|
+
static FrtTokenStream *filter_clone_i(FrtTokenStream *ts) {
|
1086
692
|
return frt_filter_clone_size(ts, sizeof(FrtTokenFilter));
|
1087
693
|
}
|
1088
694
|
|
1089
|
-
static FrtTokenStream *filter_reset(FrtTokenStream *ts, char *text)
|
1090
|
-
|
1091
|
-
TkFilt(ts)->sub_ts->reset(TkFilt(ts)->sub_ts, text);
|
695
|
+
static FrtTokenStream *filter_reset(FrtTokenStream *ts, char *text, rb_encoding *encoding) {
|
696
|
+
TkFilt(ts)->sub_ts->reset(TkFilt(ts)->sub_ts, text, encoding);
|
1092
697
|
return ts;
|
1093
698
|
}
|
1094
699
|
|
1095
|
-
static void filter_destroy_i(FrtTokenStream *ts)
|
1096
|
-
{
|
700
|
+
static void filter_destroy_i(FrtTokenStream *ts) {
|
1097
701
|
frt_ts_deref(TkFilt(ts)->sub_ts);
|
1098
702
|
free(ts);
|
1099
703
|
}
|
1100
704
|
|
1101
|
-
FrtTokenStream *
|
1102
|
-
|
1103
|
-
|
1104
|
-
|
1105
|
-
TkFilt(ts)->sub_ts = sub_ts;
|
1106
|
-
|
1107
|
-
ts->clone_i = &filter_clone_i;
|
1108
|
-
ts->destroy_i = &filter_destroy_i;
|
1109
|
-
ts->reset = &filter_reset;
|
1110
|
-
ts->ref_cnt = 1;
|
705
|
+
FrtTokenStream *frt_tf_alloc_i(size_t size) {
|
706
|
+
return (FrtTokenStream *)frt_ecalloc(size);
|
707
|
+
}
|
1111
708
|
|
709
|
+
FrtTokenStream *frt_tf_init(FrtTokenStream *ts, FrtTokenStream *sub_ts) {
|
710
|
+
ts->clone_i = &filter_clone_i;
|
711
|
+
ts->destroy_i = &filter_destroy_i;
|
712
|
+
ts->reset = &filter_reset;
|
713
|
+
ts->ref_cnt = 1;
|
714
|
+
TkFilt(ts)->sub_ts = sub_ts;
|
1112
715
|
return ts;
|
1113
716
|
}
|
1114
717
|
|
1115
|
-
|
1116
|
-
*
|
1117
|
-
|
718
|
+
FrtTokenStream *frt_tf_new_i(size_t size, FrtTokenStream *sub_ts) {
|
719
|
+
FrtTokenStream *ts = frt_tf_alloc_i(size);
|
720
|
+
return frt_tf_init(ts, sub_ts);
|
721
|
+
}
|
722
|
+
|
723
|
+
/*****************************************************************************/
|
724
|
+
/**** FrtStopFilter **********************************************************/
|
725
|
+
/*****************************************************************************/
|
1118
726
|
|
1119
727
|
#define StopFilt(filter) ((FrtStopFilter *)(filter))
|
1120
728
|
|
1121
|
-
static void sf_destroy_i(FrtTokenStream *ts)
|
1122
|
-
{
|
729
|
+
static void sf_destroy_i(FrtTokenStream *ts) {
|
1123
730
|
frt_h_destroy(StopFilt(ts)->words);
|
1124
731
|
filter_destroy_i(ts);
|
1125
732
|
}
|
1126
733
|
|
1127
|
-
static FrtTokenStream *sf_clone_i(FrtTokenStream *orig_ts)
|
1128
|
-
{
|
734
|
+
static FrtTokenStream *sf_clone_i(FrtTokenStream *orig_ts) {
|
1129
735
|
FrtTokenStream *new_ts = frt_filter_clone_size(orig_ts, sizeof(FrtMappingFilter));
|
1130
736
|
FRT_REF(StopFilt(new_ts)->words);
|
1131
737
|
return new_ts;
|
1132
738
|
}
|
1133
739
|
|
1134
|
-
static FrtToken *sf_next(FrtTokenStream *ts)
|
1135
|
-
{
|
740
|
+
static FrtToken *sf_next(FrtTokenStream *ts) {
|
1136
741
|
int pos_inc = 0;
|
1137
742
|
FrtHash *words = StopFilt(ts)->words;
|
1138
743
|
FrtTokenFilter *tf = TkFilt(ts);
|
@@ -1150,71 +755,76 @@ static FrtToken *sf_next(FrtTokenStream *ts)
|
|
1150
755
|
return tk;
|
1151
756
|
}
|
1152
757
|
|
1153
|
-
FrtTokenStream *
|
1154
|
-
|
1155
|
-
|
758
|
+
FrtTokenStream *frt_stop_filter_alloc(void) {
|
759
|
+
return (FrtTokenStream *)frt_ecalloc(sizeof(FrtStopFilter));
|
760
|
+
}
|
761
|
+
|
762
|
+
FrtTokenStream *frt_stop_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts) {
|
763
|
+
frt_tf_init(ts, sub_ts);
|
764
|
+
ts->next = &sf_next;
|
765
|
+
ts->destroy_i = &sf_destroy_i;
|
766
|
+
ts->clone_i = &sf_clone_i;
|
767
|
+
return ts;
|
768
|
+
}
|
769
|
+
|
770
|
+
void frt_stop_filter_set_words_len(FrtTokenStream *ts, const char **words, int len) {
|
1156
771
|
int i;
|
1157
772
|
char *word;
|
1158
773
|
FrtHash *word_table = frt_h_new_str(&free, (frt_free_ft) NULL);
|
1159
|
-
FrtTokenStream *ts = tf_new(FrtStopFilter, sub_ts);
|
1160
|
-
|
1161
774
|
for (i = 0; i < len; i++) {
|
1162
775
|
word = frt_estrdup(words[i]);
|
1163
776
|
frt_h_set(word_table, word, word);
|
1164
777
|
}
|
1165
778
|
StopFilt(ts)->words = word_table;
|
1166
|
-
|
1167
|
-
|
1168
|
-
|
779
|
+
}
|
780
|
+
|
781
|
+
FrtTokenStream *frt_stop_filter_new_with_words_len(FrtTokenStream *sub_ts, const char **words, int len) {
|
782
|
+
FrtTokenStream *ts = frt_stop_filter_alloc();
|
783
|
+
ts = frt_stop_filter_init(ts, sub_ts);
|
784
|
+
frt_stop_filter_set_words_len(ts, words, len);
|
1169
785
|
return ts;
|
1170
786
|
}
|
1171
787
|
|
1172
|
-
|
1173
|
-
const char **words)
|
1174
|
-
{
|
788
|
+
void frt_stop_filter_set_words(FrtTokenStream *ts, const char **words) {
|
1175
789
|
char *word;
|
1176
790
|
FrtHash *word_table = frt_h_new_str(&free, (frt_free_ft) NULL);
|
1177
|
-
FrtTokenStream *ts = tf_new(FrtStopFilter, sub_ts);
|
1178
|
-
|
1179
791
|
while (*words) {
|
1180
792
|
word = frt_estrdup(*words);
|
1181
793
|
frt_h_set(word_table, word, word);
|
1182
794
|
words++;
|
1183
795
|
}
|
1184
|
-
|
1185
796
|
StopFilt(ts)->words = word_table;
|
1186
|
-
|
1187
|
-
|
1188
|
-
|
797
|
+
}
|
798
|
+
|
799
|
+
FrtTokenStream *frt_stop_filter_new_with_words(FrtTokenStream *sub_ts, const char **words) {
|
800
|
+
FrtTokenStream *ts = frt_stop_filter_alloc();
|
801
|
+
frt_stop_filter_init(ts, sub_ts);
|
802
|
+
frt_stop_filter_set_words(ts, words);
|
1189
803
|
return ts;
|
1190
804
|
}
|
1191
805
|
|
1192
|
-
FrtTokenStream *frt_stop_filter_new(FrtTokenStream *
|
1193
|
-
|
1194
|
-
return frt_stop_filter_new_with_words(ts, FRT_FULL_ENGLISH_STOP_WORDS);
|
806
|
+
FrtTokenStream *frt_stop_filter_new(FrtTokenStream *sub_ts) {
|
807
|
+
return frt_stop_filter_new_with_words(sub_ts, FRT_FULL_ENGLISH_STOP_WORDS);
|
1195
808
|
}
|
1196
809
|
|
1197
|
-
|
1198
|
-
|
1199
|
-
|
810
|
+
/*****************************************************************************/
|
811
|
+
/*** MappingFilter ***********************************************************/
|
812
|
+
/*****************************************************************************/
|
1200
813
|
|
1201
814
|
#define MFilt(filter) ((FrtMappingFilter *)(filter))
|
1202
815
|
|
1203
|
-
static void mf_destroy_i(FrtTokenStream *ts)
|
1204
|
-
{
|
816
|
+
static void mf_destroy_i(FrtTokenStream *ts) {
|
1205
817
|
frt_mulmap_destroy(MFilt(ts)->mapper);
|
1206
818
|
filter_destroy_i(ts);
|
1207
819
|
}
|
1208
820
|
|
1209
|
-
static FrtTokenStream *mf_clone_i(FrtTokenStream *orig_ts)
|
1210
|
-
{
|
821
|
+
static FrtTokenStream *mf_clone_i(FrtTokenStream *orig_ts) {
|
1211
822
|
FrtTokenStream *new_ts = frt_filter_clone_size(orig_ts, sizeof(FrtMappingFilter));
|
1212
823
|
FRT_REF(MFilt(new_ts)->mapper);
|
1213
824
|
return new_ts;
|
1214
825
|
}
|
1215
826
|
|
1216
|
-
static FrtToken *mf_next(FrtTokenStream *ts)
|
1217
|
-
{
|
827
|
+
static FrtToken *mf_next(FrtTokenStream *ts) {
|
1218
828
|
char buf[FRT_MAX_WORD_SIZE + 1];
|
1219
829
|
FrtMultiMapper *mapper = MFilt(ts)->mapper;
|
1220
830
|
FrtTokenFilter *tf = TkFilt(ts);
|
@@ -1226,48 +836,53 @@ static FrtToken *mf_next(FrtTokenStream *ts)
|
|
1226
836
|
return tk;
|
1227
837
|
}
|
1228
838
|
|
1229
|
-
static FrtTokenStream *mf_reset(FrtTokenStream *ts, char *text)
|
1230
|
-
{
|
839
|
+
static FrtTokenStream *mf_reset(FrtTokenStream *ts, char *text, rb_encoding *encoding) {
|
1231
840
|
FrtMultiMapper *mm = MFilt(ts)->mapper;
|
1232
|
-
if (mm->d_size == 0)
|
841
|
+
if (mm->d_size == 0)
|
1233
842
|
frt_mulmap_compile(MFilt(ts)->mapper);
|
1234
|
-
|
1235
|
-
filter_reset(ts, text);
|
843
|
+
filter_reset(ts, text, encoding);
|
1236
844
|
return ts;
|
1237
845
|
}
|
1238
846
|
|
1239
|
-
FrtTokenStream *
|
1240
|
-
|
1241
|
-
|
1242
|
-
|
1243
|
-
|
1244
|
-
ts
|
1245
|
-
ts->
|
1246
|
-
ts->
|
847
|
+
FrtTokenStream *frt_mapping_filter_alloc(void) {
|
848
|
+
return (FrtTokenStream *)frt_ecalloc(sizeof(FrtMappingFilter));
|
849
|
+
}
|
850
|
+
|
851
|
+
void frt_mapping_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts) {
|
852
|
+
frt_tf_init(ts, sub_ts);
|
853
|
+
ts->next = &mf_next;
|
854
|
+
ts->destroy_i = &mf_destroy_i;
|
855
|
+
ts->clone_i = &mf_clone_i;
|
856
|
+
ts->reset = &mf_reset;
|
857
|
+
MFilt(ts)->mapper = frt_mulmap_new();
|
858
|
+
}
|
859
|
+
|
860
|
+
FrtTokenStream *frt_mapping_filter_new(FrtTokenStream *sub_ts) {
|
861
|
+
FrtTokenStream *ts = frt_mapping_filter_alloc();
|
862
|
+
frt_mapping_filter_init(ts, sub_ts);
|
1247
863
|
return ts;
|
1248
864
|
}
|
1249
865
|
|
1250
|
-
FrtTokenStream *frt_mapping_filter_add(FrtTokenStream *ts, const char *pattern,
|
1251
|
-
const char *replacement)
|
1252
|
-
{
|
866
|
+
FrtTokenStream *frt_mapping_filter_add(FrtTokenStream *ts, const char *pattern, const char *replacement) {
|
1253
867
|
frt_mulmap_add_mapping(MFilt(ts)->mapper, pattern, replacement);
|
1254
868
|
return ts;
|
1255
869
|
}
|
1256
870
|
|
1257
|
-
|
1258
|
-
|
1259
|
-
|
871
|
+
/*****************************************************************************/
|
872
|
+
/*** FrtHyphenFilter *********************************************************/
|
873
|
+
/*****************************************************************************/
|
1260
874
|
|
1261
875
|
#define HyphenFilt(filter) ((FrtHyphenFilter *)(filter))
|
1262
876
|
|
1263
|
-
static FrtTokenStream *hf_clone_i(FrtTokenStream *orig_ts)
|
1264
|
-
{
|
877
|
+
static FrtTokenStream *hf_clone_i(FrtTokenStream *orig_ts) {
|
1265
878
|
FrtTokenStream *new_ts = frt_filter_clone_size(orig_ts, sizeof(FrtHyphenFilter));
|
1266
879
|
return new_ts;
|
1267
880
|
}
|
1268
881
|
|
1269
|
-
static FrtToken *hf_next(FrtTokenStream *ts)
|
1270
|
-
|
882
|
+
static FrtToken *hf_next(FrtTokenStream *ts) {
|
883
|
+
int cp_len = 0;
|
884
|
+
OnigCodePoint cp;
|
885
|
+
rb_encoding *enc = utf8_encoding;
|
1271
886
|
FrtHyphenFilter *hf = HyphenFilt(ts);
|
1272
887
|
FrtTokenFilter *tf = TkFilt(ts);
|
1273
888
|
FrtToken *tk = hf->tk;
|
@@ -1282,38 +897,48 @@ static FrtToken *hf_next(FrtTokenStream *ts)
|
|
1282
897
|
hf->pos += text_len + 1;
|
1283
898
|
tk->len = text_len;
|
1284
899
|
return tk;
|
1285
|
-
}
|
1286
|
-
|
1287
|
-
char *
|
900
|
+
} else {
|
901
|
+
char *t;
|
902
|
+
char *end;
|
903
|
+
|
1288
904
|
bool seen_hyphen = false;
|
1289
905
|
bool seen_other_punc = false;
|
1290
906
|
hf->tk = tk = tf->sub_ts->next(tf->sub_ts);
|
1291
907
|
if (NULL == tk) return NULL;
|
1292
|
-
|
1293
|
-
|
1294
|
-
|
908
|
+
t = tk->text;
|
909
|
+
end = tk->text + tk->len;
|
910
|
+
get_cp(t, end, &cp_len, enc);
|
911
|
+
t += cp_len; // skip first
|
912
|
+
cp = get_cp(t, end, &cp_len, enc);
|
913
|
+
while (cp > 0) {
|
914
|
+
if (cp == cp_dash || cp == cp_hyphen) {
|
1295
915
|
seen_hyphen = true;
|
1296
|
-
}
|
1297
|
-
else if (!isalpha(*p)) {
|
916
|
+
} else if (!rb_enc_isalpha(cp, enc)) {
|
1298
917
|
seen_other_punc = true;
|
1299
918
|
break;
|
1300
919
|
}
|
1301
|
-
|
920
|
+
t += cp_len;
|
921
|
+
cp = get_cp(t, end, &cp_len, enc);
|
1302
922
|
}
|
1303
923
|
if (seen_hyphen && !seen_other_punc) {
|
1304
924
|
char *q = hf->text;
|
1305
925
|
char *r = tk->text;
|
1306
|
-
|
1307
|
-
|
1308
|
-
|
926
|
+
t = tk->text;
|
927
|
+
end = tk->text + tk->len;
|
928
|
+
cp = 0;
|
929
|
+
cp = get_cp(t, end, &cp_len, enc);
|
930
|
+
while (cp > 0) {
|
931
|
+
if (cp == cp_dash || cp == cp_hyphen) {
|
1309
932
|
*q = '\0';
|
933
|
+
q++;
|
934
|
+
} else {
|
935
|
+
memcpy(q, t, cp_len);
|
936
|
+
if (r!=t) memcpy(r, t, cp_len);
|
937
|
+
r += cp_len;
|
938
|
+
q += cp_len;
|
1310
939
|
}
|
1311
|
-
|
1312
|
-
|
1313
|
-
r++;
|
1314
|
-
}
|
1315
|
-
q++;
|
1316
|
-
p++;
|
940
|
+
t += cp_len;
|
941
|
+
cp = get_cp(t, end, &cp_len, enc);
|
1317
942
|
}
|
1318
943
|
*r = *q = '\0';
|
1319
944
|
hf->start = tk->start;
|
@@ -1325,89 +950,76 @@ static FrtToken *hf_next(FrtTokenStream *ts)
|
|
1325
950
|
return tk;
|
1326
951
|
}
|
1327
952
|
|
1328
|
-
FrtTokenStream *
|
1329
|
-
|
1330
|
-
|
1331
|
-
|
1332
|
-
|
953
|
+
FrtTokenStream *frt_hyphen_filter_alloc(void) {
|
954
|
+
return (FrtTokenStream *)frt_ecalloc(sizeof(FrtHyphenFilter));
|
955
|
+
}
|
956
|
+
|
957
|
+
FrtTokenStream *frt_hyphen_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts) {
|
958
|
+
frt_tf_init(ts, sub_ts);
|
959
|
+
ts->next = &hf_next;
|
960
|
+
ts->clone_i = &hf_clone_i;
|
1333
961
|
return ts;
|
1334
962
|
}
|
1335
963
|
|
1336
|
-
|
1337
|
-
*
|
1338
|
-
|
964
|
+
FrtTokenStream *frt_hyphen_filter_new(FrtTokenStream *sub_ts) {
|
965
|
+
FrtTokenStream *ts = frt_hyphen_filter_alloc();
|
966
|
+
return frt_hyphen_filter_init(ts, sub_ts);
|
967
|
+
}
|
1339
968
|
|
969
|
+
/*****************************************************************************/
|
970
|
+
/*** FrtLowercaseFilter ******************************************************/
|
971
|
+
/*****************************************************************************/
|
972
|
+
|
973
|
+
static FrtToken *lcf_next(FrtTokenStream *ts) {
|
974
|
+
int len = 0;
|
975
|
+
OnigCaseFoldType fold_type = ONIGENC_CASE_DOWNCASE;
|
976
|
+
rb_encoding *enc = utf8_encoding; // Token encoding is always UTF-8
|
977
|
+
char buf[FRT_MAX_WORD_SIZE + 20]; // CASE_MAPPING_ADDITIONAL_LENGTH
|
978
|
+
char *buf_end = buf + FRT_MAX_WORD_SIZE + 19;
|
1340
979
|
|
1341
|
-
static FrtToken *mb_lcf_next(FrtTokenStream *ts)
|
1342
|
-
{
|
1343
|
-
wchar_t wbuf[FRT_MAX_WORD_SIZE + 1], *wchr;
|
1344
980
|
FrtToken *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
|
1345
|
-
|
1346
|
-
|
981
|
+
if (tk == NULL) { return tk; }
|
982
|
+
if (tk->len < 1) { return tk; }
|
1347
983
|
|
1348
|
-
|
1349
|
-
|
1350
|
-
|
984
|
+
const OnigUChar *t = (const OnigUChar *)tk->text;
|
985
|
+
|
986
|
+
len = enc->case_map(&fold_type, &t, (const OnigUChar *)(tk->text + tk->len), (OnigUChar *)buf, (OnigUChar *)buf_end, enc);
|
987
|
+
tk->len = len;
|
988
|
+
memcpy(tk->text, buf, len);
|
989
|
+
tk->text[len] = '\0';
|
1351
990
|
|
1352
|
-
if ((x=mbstowcs(wbuf, tk->text, FRT_MAX_WORD_SIZE)) <= 0) return tk;
|
1353
|
-
wchr = wbuf;
|
1354
|
-
while (*wchr != 0) {
|
1355
|
-
*wchr = towlower(*wchr);
|
1356
|
-
wchr++;
|
1357
|
-
}
|
1358
|
-
tk->len = wcstombs(tk->text, wbuf, FRT_MAX_WORD_SIZE);
|
1359
|
-
if (tk->len <= 0) {
|
1360
|
-
strcpy(tk->text, "BAD_DATA");
|
1361
|
-
tk->len = 8;
|
1362
|
-
}
|
1363
|
-
tk->text[tk->len] = '\0';
|
1364
991
|
return tk;
|
1365
992
|
}
|
1366
993
|
|
1367
|
-
FrtTokenStream *
|
1368
|
-
|
1369
|
-
FrtTokenStream *ts = tf_new(FrtTokenFilter, sub_ts);
|
1370
|
-
ts->next = &mb_lcf_next;
|
1371
|
-
return ts;
|
994
|
+
FrtTokenStream *frt_lowercase_filter_alloc(void) {
|
995
|
+
return (FrtTokenStream *)frt_ecalloc(sizeof(FrtTokenFilter));
|
1372
996
|
}
|
1373
997
|
|
1374
|
-
|
1375
|
-
|
1376
|
-
|
1377
|
-
FrtToken *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
|
1378
|
-
if (tk == NULL) {
|
1379
|
-
return tk;
|
1380
|
-
}
|
1381
|
-
while (tk->text[i] != '\0') {
|
1382
|
-
tk->text[i] = tolower(tk->text[i]);
|
1383
|
-
i++;
|
1384
|
-
}
|
1385
|
-
return tk;
|
998
|
+
void frt_lowercase_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts) {
|
999
|
+
frt_tf_init(ts, sub_ts);
|
1000
|
+
ts->next = &lcf_next;
|
1386
1001
|
}
|
1387
1002
|
|
1388
|
-
FrtTokenStream *frt_lowercase_filter_new(FrtTokenStream *sub_ts)
|
1389
|
-
|
1390
|
-
|
1391
|
-
ts->next = &lcf_next;
|
1003
|
+
FrtTokenStream *frt_lowercase_filter_new(FrtTokenStream *sub_ts) {
|
1004
|
+
FrtTokenStream *ts = frt_lowercase_filter_alloc();
|
1005
|
+
frt_lowercase_filter_init(ts, sub_ts);
|
1392
1006
|
return ts;
|
1393
1007
|
}
|
1394
1008
|
|
1395
|
-
|
1396
|
-
|
1397
|
-
|
1009
|
+
/*****************************************************************************/
|
1010
|
+
/*** FrtStemFilter ***********************************************************/
|
1011
|
+
/*****************************************************************************/
|
1398
1012
|
|
1399
1013
|
#define StemFilt(filter) ((FrtStemFilter *)(filter))
|
1400
1014
|
|
1401
|
-
static void stemf_destroy_i(FrtTokenStream *ts)
|
1402
|
-
{
|
1015
|
+
static void stemf_destroy_i(FrtTokenStream *ts) {
|
1403
1016
|
sb_stemmer_delete(StemFilt(ts)->stemmer);
|
1404
1017
|
free(StemFilt(ts)->algorithm);
|
1405
1018
|
free(StemFilt(ts)->charenc);
|
1406
1019
|
filter_destroy_i(ts);
|
1407
1020
|
}
|
1408
1021
|
|
1409
|
-
static FrtToken *stemf_next(FrtTokenStream *ts)
|
1410
|
-
{
|
1022
|
+
static FrtToken *stemf_next(FrtTokenStream *ts) {
|
1411
1023
|
int len;
|
1412
1024
|
const sb_symbol *stemmed;
|
1413
1025
|
struct sb_stemmer *stemmer = StemFilt(ts)->stemmer;
|
@@ -1428,26 +1040,27 @@ static FrtToken *stemf_next(FrtTokenStream *ts)
|
|
1428
1040
|
return tk;
|
1429
1041
|
}
|
1430
1042
|
|
1431
|
-
static FrtTokenStream *stemf_clone_i(FrtTokenStream *orig_ts)
|
1432
|
-
|
1433
|
-
|
1434
|
-
FrtStemFilter *
|
1435
|
-
|
1436
|
-
stemf->
|
1437
|
-
|
1438
|
-
stemf->algorithm =
|
1439
|
-
orig_stemf->algorithm ? frt_estrdup(orig_stemf->algorithm) : NULL;
|
1440
|
-
stemf->charenc =
|
1441
|
-
orig_stemf->charenc ? frt_estrdup(orig_stemf->charenc) : NULL;
|
1043
|
+
static FrtTokenStream *stemf_clone_i(FrtTokenStream *orig_ts) {
|
1044
|
+
FrtTokenStream *new_ts = frt_filter_clone_size(orig_ts, sizeof(FrtStemFilter));
|
1045
|
+
FrtStemFilter *stemf = StemFilt(new_ts);
|
1046
|
+
FrtStemFilter *orig_stemf = StemFilt(orig_ts);
|
1047
|
+
stemf->stemmer = sb_stemmer_new(orig_stemf->algorithm, orig_stemf->charenc);
|
1048
|
+
stemf->algorithm = orig_stemf->algorithm ? frt_estrdup(orig_stemf->algorithm) : NULL;
|
1049
|
+
stemf->charenc = orig_stemf->charenc ? frt_estrdup(orig_stemf->charenc) : NULL;
|
1442
1050
|
return new_ts;
|
1443
1051
|
}
|
1444
1052
|
|
1445
|
-
FrtTokenStream *
|
1446
|
-
|
1447
|
-
|
1448
|
-
|
1053
|
+
FrtTokenStream *frt_stem_filter_alloc(void) {
|
1054
|
+
return (FrtTokenStream *)frt_ecalloc(sizeof(FrtStemFilter));
|
1055
|
+
}
|
1056
|
+
|
1057
|
+
void frt_stem_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts, const char *algorithm) {
|
1058
|
+
frt_tf_init(ts, sub_ts);
|
1059
|
+
ts->next = &stemf_next;
|
1060
|
+
ts->destroy_i = &stemf_destroy_i;
|
1061
|
+
ts->clone_i = &stemf_clone_i;
|
1062
|
+
|
1449
1063
|
char *my_algorithm = NULL;
|
1450
|
-
char *my_charenc = NULL;
|
1451
1064
|
char *s = NULL;
|
1452
1065
|
|
1453
1066
|
if (algorithm) {
|
@@ -1459,186 +1072,178 @@ FrtTokenStream *frt_stem_filter_new(FrtTokenStream *ts, const char *algorithm,
|
|
1459
1072
|
*s = tolower(*s);
|
1460
1073
|
s++;
|
1461
1074
|
}
|
1462
|
-
StemFilt(
|
1075
|
+
StemFilt(ts)->algorithm = my_algorithm;
|
1463
1076
|
}
|
1464
1077
|
|
1465
|
-
|
1466
|
-
|
1078
|
+
StemFilt(ts)->stemmer = sb_stemmer_new(my_algorithm, "UTF_8");
|
1079
|
+
}
|
1467
1080
|
|
1468
|
-
|
1469
|
-
|
1470
|
-
|
1471
|
-
|
1472
|
-
|
1473
|
-
}
|
1474
|
-
StemFilt(tf)->charenc = my_charenc;
|
1475
|
-
}
|
1081
|
+
FrtTokenStream *frt_stem_filter_new(FrtTokenStream *sub_ts, const char *algorithm) {
|
1082
|
+
FrtTokenStream *ts = frt_stem_filter_alloc();
|
1083
|
+
frt_stem_filter_init(ts, sub_ts, algorithm);
|
1084
|
+
return ts;
|
1085
|
+
}
|
1476
1086
|
|
1477
|
-
|
1087
|
+
/*****************************************************************************/
|
1088
|
+
/*** FrtAnalyzer *************************************************************/
|
1089
|
+
/*****************************************************************************/
|
1478
1090
|
|
1479
|
-
|
1480
|
-
|
1481
|
-
|
1482
|
-
return tf;
|
1091
|
+
void frt_a_deref(FrtAnalyzer *a) {
|
1092
|
+
if (--a->ref_cnt <= 0)
|
1093
|
+
a->destroy_i(a);
|
1483
1094
|
}
|
1484
1095
|
|
1485
|
-
|
1486
|
-
|
1487
|
-
|
1488
|
-
|
1489
|
-
|
1096
|
+
static void frt_a_standard_destroy_i(FrtAnalyzer *a) {
|
1097
|
+
if (a->current_ts)
|
1098
|
+
frt_ts_deref(a->current_ts);
|
1099
|
+
free(a);
|
1100
|
+
}
|
1490
1101
|
|
1491
|
-
|
1492
|
-
*
|
1493
|
-
|
1102
|
+
static FrtTokenStream *a_standard_get_ts(FrtAnalyzer *a, ID field, char *text, rb_encoding *encoding) {
|
1103
|
+
FrtTokenStream *ts;
|
1104
|
+
(void)field;
|
1105
|
+
ts = frt_ts_clone(a->current_ts);
|
1106
|
+
return ts->reset(ts, text, encoding);
|
1107
|
+
}
|
1494
1108
|
|
1495
|
-
FrtAnalyzer *
|
1496
|
-
|
1497
|
-
{
|
1498
|
-
FrtTokenStream *ts = frt_standard_tokenizer_new();
|
1499
|
-
if (lowercase) {
|
1500
|
-
ts = frt_lowercase_filter_new(ts);
|
1501
|
-
}
|
1502
|
-
ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words_len(ts, words, len));
|
1503
|
-
return frt_analyzer_new(ts, NULL, NULL);
|
1109
|
+
FrtAnalyzer *frt_analyzer_alloc(void) {
|
1110
|
+
return (FrtAnalyzer *) FRT_ALLOC(FrtAnalyzer);
|
1504
1111
|
}
|
1505
1112
|
|
1506
|
-
FrtAnalyzer *
|
1507
|
-
|
1508
|
-
|
1509
|
-
|
1510
|
-
|
1511
|
-
|
1512
|
-
|
1513
|
-
ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words(ts, words));
|
1514
|
-
return frt_analyzer_new(ts, NULL, NULL);
|
1113
|
+
void frt_analyzer_init(FrtAnalyzer *a, FrtTokenStream *ts, void (*destroy_i)(FrtAnalyzer *a),
|
1114
|
+
FrtTokenStream *(*get_ts)(FrtAnalyzer *a, ID field, char *text, rb_encoding *encoding)) {
|
1115
|
+
a->current_ts = ts;
|
1116
|
+
a->destroy_i = (destroy_i ? destroy_i : &frt_a_standard_destroy_i);
|
1117
|
+
a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
|
1118
|
+
a->ref_cnt = 1;
|
1119
|
+
a->ranalyzer = Qnil;
|
1515
1120
|
}
|
1516
1121
|
|
1517
|
-
FrtAnalyzer *
|
1518
|
-
|
1519
|
-
|
1520
|
-
|
1521
|
-
|
1522
|
-
ts = frt_mb_lowercase_filter_new(ts);
|
1523
|
-
}
|
1524
|
-
ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words(ts, words));
|
1525
|
-
return frt_analyzer_new(ts, NULL, NULL);
|
1122
|
+
FrtAnalyzer *frt_analyzer_new(FrtTokenStream *ts, void (*destroy_i)(FrtAnalyzer *a),
|
1123
|
+
FrtTokenStream *(*get_ts)(FrtAnalyzer *a, ID field, char *text, rb_encoding *encoding)) {
|
1124
|
+
FrtAnalyzer *a = frt_analyzer_alloc();
|
1125
|
+
frt_analyzer_init(a, ts, destroy_i, get_ts);
|
1126
|
+
return a;
|
1526
1127
|
}
|
1527
1128
|
|
1528
|
-
|
1529
|
-
|
1530
|
-
|
1531
|
-
|
1532
|
-
|
1533
|
-
|
1534
|
-
}
|
1535
|
-
ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words(ts, words));
|
1536
|
-
return frt_analyzer_new(ts, NULL, NULL);
|
1129
|
+
/*****************************************************************************/
|
1130
|
+
/*** FrtNonAnalyzer **********************************************************/
|
1131
|
+
/*****************************************************************************/
|
1132
|
+
|
1133
|
+
FrtAnalyzer *frt_non_analyzer_new(void) {
|
1134
|
+
return frt_analyzer_new(frt_non_tokenizer_new(), NULL, NULL);
|
1537
1135
|
}
|
1538
1136
|
|
1539
|
-
|
1540
|
-
|
1541
|
-
|
1542
|
-
|
1137
|
+
/*****************************************************************************/
|
1138
|
+
/*** FrtWhiteSpaceAnalyzer ***************************************************/
|
1139
|
+
/*****************************************************************************/
|
1140
|
+
|
1141
|
+
FrtAnalyzer *frt_whitespace_analyzer_alloc(void) {
|
1142
|
+
return frt_analyzer_alloc();
|
1543
1143
|
}
|
1544
1144
|
|
1545
|
-
FrtAnalyzer *
|
1546
|
-
|
1547
|
-
|
1548
|
-
|
1145
|
+
void frt_whitespace_analyzer_init(FrtAnalyzer *a, bool lowercase) {
|
1146
|
+
FrtTokenStream *ts = frt_whitespace_tokenizer_new();
|
1147
|
+
if (lowercase)
|
1148
|
+
ts = frt_lowercase_filter_new(ts);
|
1149
|
+
frt_analyzer_init(a, ts, NULL, NULL);
|
1549
1150
|
}
|
1550
1151
|
|
1551
|
-
FrtAnalyzer *
|
1552
|
-
|
1553
|
-
|
1554
|
-
|
1152
|
+
FrtAnalyzer *frt_whitespace_analyzer_new(bool lowercase) {
|
1153
|
+
FrtAnalyzer *a = frt_whitespace_analyzer_alloc();
|
1154
|
+
frt_whitespace_analyzer_init(a, lowercase);
|
1155
|
+
return a;
|
1555
1156
|
}
|
1556
1157
|
|
1557
|
-
|
1558
|
-
|
1559
|
-
|
1158
|
+
/*****************************************************************************/
|
1159
|
+
/*** FrtLetterAnalyzer *******************************************************/
|
1160
|
+
/*****************************************************************************/
|
1560
1161
|
|
1561
|
-
FrtAnalyzer *
|
1562
|
-
|
1563
|
-
|
1564
|
-
|
1565
|
-
|
1162
|
+
FrtAnalyzer *frt_letter_analyzer_alloc(void) {
|
1163
|
+
return frt_analyzer_alloc();
|
1164
|
+
}
|
1165
|
+
|
1166
|
+
void frt_letter_analyzer_init(FrtAnalyzer *a, bool lowercase) {
|
1167
|
+
FrtTokenStream *ts = frt_letter_tokenizer_new();
|
1168
|
+
if (lowercase)
|
1566
1169
|
ts = frt_lowercase_filter_new(ts);
|
1567
|
-
|
1568
|
-
ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words(ts, words));
|
1569
|
-
return frt_analyzer_new(ts, NULL, NULL);
|
1170
|
+
frt_analyzer_init(a, ts, NULL, NULL);
|
1570
1171
|
}
|
1571
1172
|
|
1572
|
-
FrtAnalyzer *
|
1573
|
-
|
1574
|
-
|
1575
|
-
|
1576
|
-
|
1577
|
-
|
1578
|
-
|
1173
|
+
FrtAnalyzer *frt_letter_analyzer_new(bool lowercase) {
|
1174
|
+
FrtAnalyzer *a = frt_letter_analyzer_alloc();
|
1175
|
+
frt_letter_analyzer_init(a, lowercase);
|
1176
|
+
return a;
|
1177
|
+
}
|
1178
|
+
|
1179
|
+
/*****************************************************************************/
|
1180
|
+
/*** FrtStandardAnalyzer *****************************************************/
|
1181
|
+
/*****************************************************************************/
|
1182
|
+
|
1183
|
+
FrtAnalyzer *frt_standard_analyzer_alloc(void) {
|
1184
|
+
return frt_analyzer_alloc();
|
1185
|
+
}
|
1186
|
+
|
1187
|
+
void frt_standard_analyzer_init(FrtAnalyzer *a, bool lowercase, const char **words) {
|
1188
|
+
FrtTokenStream *ts = frt_standard_tokenizer_new();
|
1189
|
+
if (lowercase)
|
1190
|
+
ts = frt_lowercase_filter_new(ts);
|
1579
1191
|
ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words(ts, words));
|
1580
|
-
|
1192
|
+
frt_analyzer_init(a, ts, NULL, NULL);
|
1581
1193
|
}
|
1582
1194
|
|
1583
|
-
FrtAnalyzer *
|
1584
|
-
|
1585
|
-
|
1586
|
-
|
1195
|
+
FrtAnalyzer *frt_standard_analyzer_new_with_words(bool lowercase, const char **words) {
|
1196
|
+
FrtAnalyzer *a = frt_standard_analyzer_alloc();
|
1197
|
+
frt_standard_analyzer_init(a, lowercase, words);
|
1198
|
+
return a;
|
1587
1199
|
}
|
1588
1200
|
|
1589
|
-
FrtAnalyzer *
|
1590
|
-
|
1591
|
-
return frt_mb_legacy_standard_analyzer_new_with_words(FRT_FULL_ENGLISH_STOP_WORDS,
|
1592
|
-
lowercase);
|
1201
|
+
FrtAnalyzer *frt_standard_analyzer_new(bool lowercase) {
|
1202
|
+
return frt_standard_analyzer_new_with_words(lowercase, FRT_FULL_ENGLISH_STOP_WORDS);
|
1593
1203
|
}
|
1594
1204
|
|
1595
|
-
|
1596
|
-
|
1597
|
-
|
1598
|
-
*
|
1599
|
-
****************************************************************************/
|
1205
|
+
/*****************************************************************************/
|
1206
|
+
/*** FrtPerFieldAnalyzer *****************************************************/
|
1207
|
+
/*****************************************************************************/
|
1600
1208
|
|
1601
|
-
static void pfa_destroy_i(FrtAnalyzer *self)
|
1602
|
-
{
|
1209
|
+
static void pfa_destroy_i(FrtAnalyzer *self) {
|
1603
1210
|
frt_h_destroy(PFA(self)->dict);
|
1604
1211
|
|
1605
1212
|
frt_a_deref(PFA(self)->default_a);
|
1606
1213
|
free(self);
|
1607
1214
|
}
|
1608
1215
|
|
1609
|
-
static FrtTokenStream *pfa_get_ts(FrtAnalyzer *self,
|
1610
|
-
FrtSymbol field, char *text)
|
1611
|
-
{
|
1216
|
+
static FrtTokenStream *pfa_get_ts(FrtAnalyzer *self, ID field, char *text, rb_encoding *encoding) {
|
1612
1217
|
FrtAnalyzer *a = (FrtAnalyzer *)frt_h_get(PFA(self)->dict, (void *)field);
|
1613
|
-
if (a == NULL)
|
1218
|
+
if (a == NULL)
|
1614
1219
|
a = PFA(self)->default_a;
|
1615
|
-
|
1616
|
-
return frt_a_get_ts(a, field, text);
|
1220
|
+
return frt_a_get_ts(a, field, text, encoding);
|
1617
1221
|
}
|
1618
1222
|
|
1619
|
-
static void pfa_sub_a_destroy_i(void *p)
|
1620
|
-
{
|
1223
|
+
static void pfa_sub_a_destroy_i(void *p) {
|
1621
1224
|
FrtAnalyzer *a = (FrtAnalyzer *) p;
|
1622
1225
|
frt_a_deref(a);
|
1623
1226
|
}
|
1624
1227
|
|
1625
|
-
void frt_pfa_add_field(FrtAnalyzer *self,
|
1626
|
-
FrtSymbol field,
|
1627
|
-
FrtAnalyzer *analyzer)
|
1628
|
-
{
|
1228
|
+
void frt_pfa_add_field(FrtAnalyzer *self, ID field, FrtAnalyzer *analyzer) {
|
1629
1229
|
frt_h_set(PFA(self)->dict, (void *)field, analyzer);
|
1630
1230
|
}
|
1631
1231
|
|
1632
|
-
FrtAnalyzer *
|
1633
|
-
|
1634
|
-
|
1635
|
-
|
1636
|
-
PFA(a)->default_a = default_a;
|
1637
|
-
PFA(a)->dict = frt_h_new_ptr(&pfa_sub_a_destroy_i);
|
1232
|
+
FrtAnalyzer *frt_per_field_analyzer_alloc(void) {
|
1233
|
+
return (FrtAnalyzer *)frt_ecalloc(sizeof(FrtPerFieldAnalyzer));
|
1234
|
+
}
|
1638
1235
|
|
1236
|
+
void frt_per_field_analyzer_init(FrtAnalyzer *a, FrtAnalyzer *default_a) {
|
1639
1237
|
a->destroy_i = &pfa_destroy_i;
|
1640
|
-
a->get_ts = pfa_get_ts;
|
1238
|
+
a->get_ts = &pfa_get_ts;
|
1641
1239
|
a->ref_cnt = 1;
|
1642
1240
|
|
1241
|
+
PFA(a)->default_a = default_a;
|
1242
|
+
PFA(a)->dict = frt_h_new_ptr(&pfa_sub_a_destroy_i);
|
1243
|
+
}
|
1244
|
+
|
1245
|
+
FrtAnalyzer *frt_per_field_analyzer_new(FrtAnalyzer *default_a) {
|
1246
|
+
FrtAnalyzer *a = frt_per_field_analyzer_alloc();
|
1247
|
+
frt_per_field_analyzer_init(a, default_a);
|
1643
1248
|
return a;
|
1644
1249
|
}
|