isomorfeus-ferret 0.12.7 → 0.13.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +101 -19
  3. data/README.md +85 -13
  4. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +22 -30
  5. data/ext/isomorfeus_ferret_ext/bm_hash.c +6 -12
  6. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +3 -6
  7. data/ext/isomorfeus_ferret_ext/bm_store.c +11 -22
  8. data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +1 -1
  9. data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +1 -1
  10. data/ext/isomorfeus_ferret_ext/bzlib.c +1572 -0
  11. data/ext/isomorfeus_ferret_ext/bzlib.h +282 -0
  12. data/ext/isomorfeus_ferret_ext/bzlib_blocksort.c +1094 -0
  13. data/ext/isomorfeus_ferret_ext/bzlib_compress.c +672 -0
  14. data/ext/isomorfeus_ferret_ext/bzlib_crctable.c +104 -0
  15. data/ext/isomorfeus_ferret_ext/bzlib_decompress.c +652 -0
  16. data/ext/isomorfeus_ferret_ext/bzlib_huffman.c +205 -0
  17. data/ext/isomorfeus_ferret_ext/bzlib_private.h +509 -0
  18. data/ext/isomorfeus_ferret_ext/bzlib_randtable.c +84 -0
  19. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -53
  20. data/ext/isomorfeus_ferret_ext/frb_analysis.c +785 -1192
  21. data/ext/isomorfeus_ferret_ext/frb_index.c +497 -495
  22. data/ext/isomorfeus_ferret_ext/frb_qparser.c +48 -60
  23. data/ext/isomorfeus_ferret_ext/frb_search.c +1520 -1002
  24. data/ext/isomorfeus_ferret_ext/frb_store.c +96 -96
  25. data/ext/isomorfeus_ferret_ext/frb_threading.h +0 -1
  26. data/ext/isomorfeus_ferret_ext/frb_utils.c +147 -196
  27. data/ext/isomorfeus_ferret_ext/frt_analysis.c +695 -1090
  28. data/ext/isomorfeus_ferret_ext/frt_analysis.h +174 -170
  29. data/ext/isomorfeus_ferret_ext/frt_array.c +2 -4
  30. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +9 -16
  31. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +32 -81
  32. data/ext/isomorfeus_ferret_ext/frt_document.c +15 -20
  33. data/ext/isomorfeus_ferret_ext/frt_document.h +10 -10
  34. data/ext/isomorfeus_ferret_ext/frt_except.c +5 -12
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +3 -3
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +6 -7
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +35 -46
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +1 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +91 -200
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +7 -18
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +1 -2
  42. data/ext/isomorfeus_ferret_ext/frt_ind.c +32 -35
  43. data/ext/isomorfeus_ferret_ext/frt_ind.h +9 -9
  44. data/ext/isomorfeus_ferret_ext/frt_index.c +603 -410
  45. data/ext/isomorfeus_ferret_ext/frt_index.h +272 -291
  46. data/ext/isomorfeus_ferret_ext/frt_lang.c +0 -2
  47. data/ext/isomorfeus_ferret_ext/frt_mempool.c +1 -2
  48. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +4 -7
  49. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +68 -91
  50. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +35 -38
  51. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +53 -72
  52. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +25 -32
  53. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +21 -23
  54. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +66 -103
  55. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +207 -195
  56. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +20 -16
  57. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +17 -14
  58. data/ext/isomorfeus_ferret_ext/frt_q_range.c +102 -131
  59. data/ext/isomorfeus_ferret_ext/frt_q_span.c +179 -178
  60. data/ext/isomorfeus_ferret_ext/frt_q_term.c +47 -60
  61. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +18 -16
  62. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +45 -84
  63. data/ext/isomorfeus_ferret_ext/frt_search.c +105 -146
  64. data/ext/isomorfeus_ferret_ext/frt_search.h +331 -320
  65. data/ext/isomorfeus_ferret_ext/frt_similarity.c +5 -13
  66. data/ext/isomorfeus_ferret_ext/frt_similarity.h +7 -12
  67. data/ext/isomorfeus_ferret_ext/frt_sort.c +105 -149
  68. data/ext/isomorfeus_ferret_ext/frt_store.c +13 -7
  69. data/ext/isomorfeus_ferret_ext/frt_store.h +10 -2
  70. data/ext/isomorfeus_ferret_ext/frt_threading.h +0 -1
  71. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +22 -112
  72. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +2 -32
  73. data/ext/isomorfeus_ferret_ext/lz4.c +2495 -0
  74. data/ext/isomorfeus_ferret_ext/lz4.h +774 -0
  75. data/ext/isomorfeus_ferret_ext/lz4frame.c +1899 -0
  76. data/ext/isomorfeus_ferret_ext/lz4frame.h +623 -0
  77. data/ext/isomorfeus_ferret_ext/lz4hc.c +1615 -0
  78. data/ext/isomorfeus_ferret_ext/lz4hc.h +413 -0
  79. data/ext/isomorfeus_ferret_ext/lz4xxhash.c +1030 -0
  80. data/ext/isomorfeus_ferret_ext/lz4xxhash.h +328 -0
  81. data/ext/isomorfeus_ferret_ext/stem_modules.h +0 -86
  82. data/ext/isomorfeus_ferret_ext/test.c +0 -17
  83. data/ext/isomorfeus_ferret_ext/test_1710.c +11 -12
  84. data/ext/isomorfeus_ferret_ext/test_analysis.c +590 -583
  85. data/ext/isomorfeus_ferret_ext/test_compound_io.c +1 -1
  86. data/ext/isomorfeus_ferret_ext/test_document.c +19 -15
  87. data/ext/isomorfeus_ferret_ext/test_except.c +1 -2
  88. data/ext/isomorfeus_ferret_ext/test_fields.c +59 -60
  89. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +10 -27
  90. data/ext/isomorfeus_ferret_ext/test_filter.c +11 -8
  91. data/ext/isomorfeus_ferret_ext/test_global.c +0 -46
  92. data/ext/isomorfeus_ferret_ext/test_hash.c +2 -2
  93. data/ext/isomorfeus_ferret_ext/test_hashset.c +1 -1
  94. data/ext/isomorfeus_ferret_ext/test_highlighter.c +15 -11
  95. data/ext/isomorfeus_ferret_ext/test_index.c +372 -365
  96. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +5 -3
  97. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +5 -3
  98. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +13 -10
  99. data/ext/isomorfeus_ferret_ext/test_q_parser.c +45 -7
  100. data/ext/isomorfeus_ferret_ext/test_q_span.c +15 -12
  101. data/ext/isomorfeus_ferret_ext/test_ram_store.c +3 -3
  102. data/ext/isomorfeus_ferret_ext/test_search.c +60 -64
  103. data/ext/isomorfeus_ferret_ext/test_segments.c +5 -4
  104. data/ext/isomorfeus_ferret_ext/test_sort.c +17 -14
  105. data/ext/isomorfeus_ferret_ext/test_store.c +2 -0
  106. data/ext/isomorfeus_ferret_ext/test_term.c +3 -1
  107. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +9 -10
  108. data/ext/isomorfeus_ferret_ext/test_test.c +1 -2
  109. data/ext/isomorfeus_ferret_ext/test_threading.c +9 -10
  110. data/ext/isomorfeus_ferret_ext/testhelper.c +1 -2
  111. data/lib/isomorfeus/ferret/version.rb +1 -1
  112. metadata +27 -57
  113. data/ext/isomorfeus_ferret_ext/email.rl +0 -21
  114. data/ext/isomorfeus_ferret_ext/frt_scanner.c +0 -900
  115. data/ext/isomorfeus_ferret_ext/frt_scanner.h +0 -28
  116. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +0 -6706
  117. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +0 -4420
  118. data/ext/isomorfeus_ferret_ext/scanner.h +0 -28
  119. data/ext/isomorfeus_ferret_ext/scanner.in +0 -43
  120. data/ext/isomorfeus_ferret_ext/scanner.rl +0 -84
  121. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +0 -200
  122. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +0 -85
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +0 -1167
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +0 -6
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +0 -1433
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +0 -6
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +0 -301
  128. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +0 -6
  129. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +0 -590
  130. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +0 -6
  131. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +0 -1049
  132. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +0 -6
  133. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +0 -705
  134. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +0 -6
  135. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +0 -1239
  136. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +0 -6
  137. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +0 -477
  138. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +0 -6
  139. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +0 -1217
  140. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +0 -7
  141. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +0 -394
  142. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +0 -6
  143. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +0 -457
  144. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +0 -6
  145. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +0 -1009
  146. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +0 -6
  147. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +0 -259
  148. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +0 -6
  149. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +0 -704
  150. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +0 -6
  151. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +0 -948
  152. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +0 -6
  153. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +0 -1028
  154. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +0 -6
  155. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +0 -275
  156. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +0 -6
  157. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +0 -849
  158. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +0 -6
  159. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +0 -952
  160. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +0 -6
  161. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +0 -669
  162. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +0 -6
  163. data/ext/isomorfeus_ferret_ext/stem_modules.txt +0 -63
  164. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +0 -1854
  165. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +0 -1999
  166. data/ext/isomorfeus_ferret_ext/url.rl +0 -27
@@ -1,130 +1,63 @@
1
1
  #ifndef FRT_ANALYSIS_H
2
2
  #define FRT_ANALYSIS_H
3
3
 
4
- #include <wchar.h>
5
4
  #include "frt_global.h"
6
5
  #include "frt_hash.h"
7
6
  #include "frt_multimapper.h"
7
+ #include <ruby/encoding.h>
8
8
 
9
- /****************************************************************************
10
- *
11
- * FrtToken
12
- *
13
- ****************************************************************************/
9
+ /*****************************************************************************/
10
+ /*** FrtToken ****************************************************************/
11
+ /*****************************************************************************/
14
12
 
15
- typedef struct FrtToken
16
- {
17
- char text[FRT_MAX_WORD_SIZE];
18
- int len;
13
+ typedef struct FrtToken {
14
+ char text[FRT_MAX_WORD_SIZE];
15
+ int len;
19
16
  off_t start;
20
17
  off_t end;
21
- int pos_inc;
18
+ int pos_inc;
22
19
  } FrtToken;
23
20
 
24
21
  extern FrtToken *frt_tk_new();
25
- extern void frt_tk_destroy(void *p);
26
- extern FrtToken *frt_tk_set(FrtToken *tk, char *text, int tlen, off_t start, off_t end, int pos_inc);
27
- extern FrtToken *frt_tk_set_no_len(FrtToken *tk, char *text, off_t start, off_t end, int pos_inc);
28
- extern int frt_tk_eq(FrtToken *tk1, FrtToken *tk2);
29
- extern int frt_tk_cmp(FrtToken *tk1, FrtToken *tk2);
30
-
31
- /****************************************************************************
32
- *
33
- * FrtTokenStream
34
- *
35
- ****************************************************************************/
22
+ extern void frt_tk_destroy(void *p);
23
+ extern FrtToken *frt_tk_set(FrtToken *tk, char *text, int tlen, off_t start, off_t end, int pos_inc, rb_encoding *encoding);
24
+ extern FrtToken *frt_tk_set_no_len(FrtToken *tk, char *text, off_t start, off_t end, int pos_inc, rb_encoding *encoding);
25
+ extern int frt_tk_eq(FrtToken *tk1, FrtToken *tk2);
26
+ extern int frt_tk_cmp(FrtToken *tk1, FrtToken *tk2);
36
27
 
28
+ /*****************************************************************************/
29
+ /*** FrtTokenStream **********************************************************/
30
+ /*****************************************************************************/
37
31
 
38
32
  typedef struct FrtTokenStream FrtTokenStream;
39
- struct FrtTokenStream
40
- {
41
- char *t; /* ptr used to scan text */
42
- char *text;
43
- FrtToken *(*next)(FrtTokenStream *ts);
44
- FrtTokenStream *(*reset)(FrtTokenStream *ts, char *text);
45
- FrtTokenStream *(*clone_i)(FrtTokenStream *ts);
46
- void (*destroy_i)(FrtTokenStream *ts);
33
+ struct FrtTokenStream {
34
+ char *t; /* ptr used to scan text */
35
+ char *text;
36
+ int length;
37
+ rb_encoding *encoding;
38
+ FrtToken *(*next)(FrtTokenStream *ts);
39
+ FrtTokenStream *(*reset)(FrtTokenStream *ts, char *text, rb_encoding *encoding);
40
+ FrtTokenStream *(*clone_i)(FrtTokenStream *ts);
41
+ void (*destroy_i)(FrtTokenStream *ts);
47
42
  int ref_cnt;
43
+ VALUE rts;
44
+ FrtToken token;
48
45
  };
49
46
 
50
- #define frt_ts_new(type) frt_ts_new_i(sizeof(type))
51
47
  extern FrtTokenStream *frt_ts_new_i(size_t size);
48
+ extern FrtTokenStream *frt_ts_init(FrtTokenStream *ts);
49
+ extern FrtTokenStream *frt_ts_reset(FrtTokenStream *ts, char *text, rb_encoding *encoding);
52
50
  extern FrtTokenStream *frt_ts_clone_size(FrtTokenStream *orig_ts, size_t size);
53
51
 
54
- typedef struct FrtCachedTokenStream
55
- {
56
- FrtTokenStream super;
57
- FrtToken token;
58
- } FrtCachedTokenStream;
59
-
60
- typedef struct FrtMultiByteTokenStream
61
- {
62
- FrtCachedTokenStream super;
63
- mbstate_t state;
64
- } FrtMultiByteTokenStream;
65
-
66
- typedef enum
67
- {
68
- FRT_STT_ASCII,
69
- FRT_STT_MB,
70
- FRT_STT_UTF8
71
- } FrtStandardTokenizerType;
72
-
73
- typedef struct FrtStandardTokenizer
74
- {
75
- FrtCachedTokenStream super;
76
- FrtStandardTokenizerType type;
77
- } FrtStandardTokenizer;
78
-
79
- typedef struct FrtLegacyStandardTokenizer
80
- {
81
- FrtCachedTokenStream super;
82
- bool (*advance_to_start)(FrtTokenStream *ts);
83
- bool (*is_tok_char)(char *c);
84
- int (*get_alpha)(FrtTokenStream *ts, char *token);
85
- int (*get_apostrophe)(char *input);
86
- } FrtLegacyStandardTokenizer;
87
-
88
- typedef struct FrtTokenFilter
89
- {
52
+ typedef struct FrtTokenFilter {
90
53
  FrtTokenStream super;
91
54
  FrtTokenStream *sub_ts;
92
55
  } FrtTokenFilter;
93
56
 
94
57
  extern FrtTokenStream *frt_filter_clone_size(FrtTokenStream *ts, size_t size);
95
- #define tf_new(type, sub) frt_tf_new_i(sizeof(type), sub)
58
+ #define frt_tf_new(type, sub) frt_tf_new_i(sizeof(type), sub)
96
59
  extern FrtTokenStream *frt_tf_new_i(size_t size, FrtTokenStream *sub_ts);
97
60
 
98
- typedef struct FrtStopFilter
99
- {
100
- FrtTokenFilter super;
101
- FrtHash *words;
102
- } FrtStopFilter;
103
-
104
- typedef struct FrtMappingFilter
105
- {
106
- FrtTokenFilter super;
107
- FrtMultiMapper *mapper;
108
- } FrtMappingFilter;
109
-
110
- typedef struct FrtHyphenFilter
111
- {
112
- FrtTokenFilter super;
113
- char text[FRT_MAX_WORD_SIZE];
114
- int start;
115
- int pos;
116
- int len;
117
- FrtToken *tk;
118
- } FrtHyphenFilter;
119
-
120
- typedef struct FrtStemFilter
121
- {
122
- FrtTokenFilter super;
123
- struct sb_stemmer *stemmer;
124
- char *algorithm;
125
- char *charenc;
126
- } FrtStemFilter;
127
-
128
61
  #define frt_ts_next(mts) mts->next(mts)
129
62
  #define frt_ts_clone(mts) mts->clone_i(mts)
130
63
 
@@ -132,22 +65,58 @@ extern void frt_ts_deref(FrtTokenStream *ts);
132
65
 
133
66
  extern FrtTokenStream *frt_non_tokenizer_new();
134
67
 
135
- extern FrtTokenStream *frt_whitespace_tokenizer_new();
136
- extern FrtTokenStream *frt_mb_whitespace_tokenizer_new(bool lowercase);
68
+ /*****************************************************************************/
69
+ /*** FrtWhiteSpaceTokenizer **************************************************/
70
+ /*****************************************************************************/
71
+
72
+ extern FrtTokenStream *frt_whitespace_tokenizer_alloc(void);
73
+ extern FrtTokenStream *frt_whitespace_tokenizer_init(FrtTokenStream *ts);
74
+ extern FrtTokenStream *frt_whitespace_tokenizer_new(void);
75
+
76
+ /*****************************************************************************/
77
+ /*** FrtLetterTokenizer ******************************************************/
78
+ /*****************************************************************************/
79
+
80
+ extern FrtTokenStream *frt_letter_tokenizer_alloc(void);
81
+ extern FrtTokenStream *frt_letter_tokenizer_init(FrtTokenStream *ts);
82
+ extern FrtTokenStream *frt_letter_tokenizer_new(void);
83
+
84
+ /*****************************************************************************/
85
+ /*** FrtStandardTokenizer ****************************************************/
86
+ /*****************************************************************************/
137
87
 
138
- extern FrtTokenStream *frt_letter_tokenizer_new();
139
- extern FrtTokenStream *frt_mb_letter_tokenizer_new(bool lowercase);
88
+ extern FrtTokenStream *frt_standard_tokenizer_alloc(void);
89
+ extern FrtTokenStream *frt_standard_tokenizer_init(FrtTokenStream *ts);
90
+ extern FrtTokenStream *frt_standard_tokenizer_new(void);
140
91
 
141
- extern FrtTokenStream *frt_standard_tokenizer_new();
142
- extern FrtTokenStream *frt_mb_standard_tokenizer_new();
143
- extern FrtTokenStream *frt_utf8_standard_tokenizer_new();
92
+ /*****************************************************************************/
93
+ /*** FrtHyphenFilter *********************************************************/
94
+ /*****************************************************************************/
144
95
 
145
- extern FrtTokenStream *frt_legacy_standard_tokenizer_new();
146
- extern FrtTokenStream *frt_mb_legacy_standard_tokenizer_new();
96
+ typedef struct FrtHyphenFilter {
97
+ FrtTokenFilter super;
98
+ char text[FRT_MAX_WORD_SIZE];
99
+ int start;
100
+ int pos;
101
+ int len;
102
+ FrtToken *tk;
103
+ } FrtHyphenFilter;
104
+
105
+ extern FrtTokenStream *frt_hyphen_filter_alloc(void);
106
+ extern FrtTokenStream *frt_hyphen_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts);
107
+ extern FrtTokenStream *frt_hyphen_filter_new(FrtTokenStream *sub_ts);
147
108
 
148
- extern FrtTokenStream *frt_hyphen_filter_new(FrtTokenStream *ts);
149
- extern FrtTokenStream *frt_lowercase_filter_new(FrtTokenStream *ts);
150
- extern FrtTokenStream *frt_mb_lowercase_filter_new(FrtTokenStream *ts);
109
+ /*****************************************************************************/
110
+ /*** FrtLowercaseFilter ******************************************************/
111
+ /*****************************************************************************/
112
+
113
+ extern FrtTokenStream *frt_lowercase_filter_alloc(void);
114
+ extern void frt_lowercase_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts);
115
+ extern FrtTokenStream *frt_lowercase_filter_new(FrtTokenStream *sub_ts);
116
+
117
+ /*****************************************************************************/
118
+ /*** FrtStopFilter ***********************************************************/
119
+ /*****************************************************************************/
151
120
 
152
121
  extern const char *FRT_ENGLISH_STOP_WORDS[];
153
122
  extern const char *FRT_FULL_ENGLISH_STOP_WORDS[];
@@ -165,83 +134,118 @@ extern const char *FRT_FULL_RUSSIAN_STOP_WORDS[];
165
134
  extern const char *FRT_FULL_FINNISH_STOP_WORDS[];
166
135
  extern const char *FRT_FULL_HUNGARIAN_STOP_WORDS[];
167
136
 
168
- extern FrtTokenStream *frt_stop_filter_new_with_words_len(FrtTokenStream *ts,
169
- const char **words, int len);
170
- extern FrtTokenStream *frt_stop_filter_new_with_words(FrtTokenStream *ts,
171
- const char **words);
172
- extern FrtTokenStream *frt_stop_filter_new(FrtTokenStream *ts);
173
- extern FrtTokenStream *frt_stem_filter_new(FrtTokenStream *ts, const char *algorithm,
174
- const char *charenc);
175
-
176
- extern FrtTokenStream *frt_mapping_filter_new(FrtTokenStream *ts);
177
- extern FrtTokenStream *frt_mapping_filter_add(FrtTokenStream *ts, const char *pattern,
178
- const char *replacement);
179
-
180
- /****************************************************************************
181
- *
182
- * FrtAnalyzer
183
- *
184
- ****************************************************************************/
185
-
186
- typedef struct FrtAnalyzer
187
- {
137
+ typedef struct FrtStopFilter {
138
+ FrtTokenFilter super;
139
+ FrtHash *words;
140
+ } FrtStopFilter;
141
+
142
+ extern FrtTokenStream *frt_stop_filter_alloc(void);
143
+ extern FrtTokenStream *frt_stop_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts);
144
+ extern void frt_stop_filter_set_words(FrtTokenStream *ts, const char **words);
145
+ extern void frt_stop_filter_set_words_len(FrtTokenStream *ts, const char **words, int len);
146
+ extern FrtTokenStream *frt_stop_filter_new(FrtTokenStream *sub_ts);
147
+ extern FrtTokenStream *frt_stop_filter_new_with_words(FrtTokenStream *sub_ts, const char **words);
148
+ extern FrtTokenStream *frt_stop_filter_new_with_words_len(FrtTokenStream *sub_ts, const char **words, int len);
149
+
150
+ /*****************************************************************************/
151
+ /*** FrtStemFilter ***********************************************************/
152
+ /*****************************************************************************/
153
+
154
+ typedef struct FrtStemFilter {
155
+ FrtTokenFilter super;
156
+ struct sb_stemmer *stemmer;
157
+ char *algorithm;
158
+ char *charenc;
159
+ } FrtStemFilter;
160
+
161
+ extern FrtTokenStream *frt_stem_filter_alloc(void);
162
+ extern void frt_stem_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts, const char *algorithm);
163
+ extern FrtTokenStream *frt_stem_filter_new(FrtTokenStream *sub_ts, const char *algorithm);
164
+
165
+ /*****************************************************************************/
166
+ /*** FrtMappingFilter ********************************************************/
167
+ /*****************************************************************************/
168
+
169
+ typedef struct FrtMappingFilter {
170
+ FrtTokenFilter super;
171
+ FrtMultiMapper *mapper;
172
+ } FrtMappingFilter;
173
+
174
+ extern FrtTokenStream *frt_mapping_filter_alloc(void);
175
+ extern void frt_mapping_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts);
176
+ extern FrtTokenStream *frt_mapping_filter_new(FrtTokenStream *sub_ts);
177
+ extern FrtTokenStream *frt_mapping_filter_add(FrtTokenStream *ts, const char *pattern, const char *replacement);
178
+
179
+ /*****************************************************************************/
180
+ /*** FrtAnalyzer *************************************************************/
181
+ /*****************************************************************************/
182
+
183
+ typedef struct FrtAnalyzer {
188
184
  FrtTokenStream *current_ts;
189
- FrtTokenStream *(*get_ts)(struct FrtAnalyzer *a, FrtSymbol field, char *text);
190
- void (*destroy_i)(struct FrtAnalyzer *a);
191
- int ref_cnt;
185
+ FrtTokenStream *(*get_ts)(struct FrtAnalyzer *a, ID field, char *text, rb_encoding *encoding);
186
+ void (*destroy_i)(struct FrtAnalyzer *a);
187
+ int ref_cnt;
188
+ VALUE ranalyzer;
192
189
  } FrtAnalyzer;
193
190
 
194
191
  extern void frt_a_deref(FrtAnalyzer *a);
195
192
 
196
- #define frt_a_get_ts(ma, field, text) ma->get_ts(ma, field, text)
193
+ #define frt_a_get_ts(ma, field, text, encoding) ma->get_ts(ma, field, text, encoding)
194
+
195
+ extern FrtAnalyzer *frt_analyzer_alloc(void);
196
+ extern void frt_analyzer_init(FrtAnalyzer *a, FrtTokenStream *ts, void (*destroy)(FrtAnalyzer *a),
197
+ FrtTokenStream *(*get_ts)(FrtAnalyzer *a, ID field, char *text, rb_encoding *encoding));
198
+ extern FrtAnalyzer *frt_analyzer_new(FrtTokenStream *ts, void (*destroy)(FrtAnalyzer *a),
199
+ FrtTokenStream *(*get_ts)(FrtAnalyzer *a, ID field, char *text, rb_encoding *encoding));
197
200
 
198
- extern FrtAnalyzer *frt_analyzer_new(FrtTokenStream *ts,
199
- void (*destroy)(FrtAnalyzer *a),
200
- FrtTokenStream *(*get_ts)(FrtAnalyzer *a,
201
- FrtSymbol field,
202
- char *text));
203
- extern FrtAnalyzer *frt_non_analyzer_new();
201
+ /*****************************************************************************/
202
+ /*** FrtNonAnalyzer **********************************************************/
203
+ /*****************************************************************************/
204
+
205
+ extern FrtAnalyzer *frt_non_analyzer_new(void);
204
206
 
205
207
  extern void frt_a_standard_destroy(FrtAnalyzer *a);
206
208
 
209
+ /*****************************************************************************/
210
+ /*** FrtWhiteSpaceAnalyzer ***************************************************/
211
+ /*****************************************************************************/
212
+
213
+ extern FrtAnalyzer *frt_whitespace_analyzer_alloc(void);
214
+ extern void frt_whitespace_analyzer_init(FrtAnalyzer *a, bool lowercase);
207
215
  extern FrtAnalyzer *frt_whitespace_analyzer_new(bool lowercase);
208
- extern FrtAnalyzer *frt_mb_whitespace_analyzer_new(bool lowercase);
209
216
 
217
+ /*****************************************************************************/
218
+ /*** FrtLetterAnalyzer *******************************************************/
219
+ /*****************************************************************************/
220
+
221
+ extern FrtAnalyzer *frt_letter_analyzer_alloc(void);
222
+ extern void frt_letter_analyzer_init(FrtAnalyzer *a, bool lowercase);
210
223
  extern FrtAnalyzer *frt_letter_analyzer_new(bool lowercase);
211
- extern FrtAnalyzer *frt_mb_letter_analyzer_new(bool lowercase);
212
224
 
225
+ /*****************************************************************************/
226
+ /*** FrtStandardAnalyzer *****************************************************/
227
+ /*****************************************************************************/
228
+
229
+ extern FrtAnalyzer *frt_standard_analyzer_alloc(void);
230
+ extern void frt_standard_analyzer_init(FrtAnalyzer *a, bool lowercase, const char **words);
213
231
  extern FrtAnalyzer *frt_standard_analyzer_new(bool lowercase);
214
- extern FrtAnalyzer *frt_mb_standard_analyzer_new(bool lowercase);
215
- extern FrtAnalyzer *frt_utf8_standard_analyzer_new(bool lowercase);
216
-
217
- extern FrtAnalyzer *frt_standard_analyzer_new_with_words(
218
- const char **words, bool lowercase);
219
- extern FrtAnalyzer *frt_standard_analyzer_new_with_words_len(
220
- const char **words, int len, bool lowercase);
221
- extern FrtAnalyzer *frt_mb_standard_analyzer_new_with_words(
222
- const char **words, bool lowercase);
223
- extern FrtAnalyzer *frt_utf8_standard_analyzer_new_with_words(
224
- const char **words, bool lowercase);
225
-
226
- extern FrtAnalyzer *frt_legacy_standard_analyzer_new(bool lowercase);
227
- extern FrtAnalyzer *frt_mb_legacy_standard_analyzer_new(bool lowercase);
228
-
229
- extern FrtAnalyzer *frt_legacy_standard_analyzer_new_with_words(
230
- const char **words, bool lowercase);
231
- extern FrtAnalyzer *frt_mb_legacy_standard_analyzer_new_with_words(
232
- const char **words, bool lowercase);
232
+ extern FrtAnalyzer *frt_standard_analyzer_new_with_words(bool lowercase, const char **words);
233
+
234
+ /*****************************************************************************/
235
+ /*** FrtPerFieldAnalyzer *****************************************************/
236
+ /*****************************************************************************/
233
237
 
234
238
  #define PFA(analyzer) ((FrtPerFieldAnalyzer *)(analyzer))
235
- typedef struct FrtPerFieldAnalyzer
236
- {
237
- FrtAnalyzer super;
238
- FrtHash *dict;
239
- FrtAnalyzer *default_a;
239
+
240
+ typedef struct FrtPerFieldAnalyzer {
241
+ FrtAnalyzer super;
242
+ FrtHash *dict;
243
+ FrtAnalyzer *default_a;
240
244
  } FrtPerFieldAnalyzer;
241
245
 
242
- extern FrtAnalyzer *frt_per_field_analyzer_new(FrtAnalyzer *a);
243
- extern void frt_pfa_add_field(FrtAnalyzer *self,
244
- FrtSymbol field,
245
- FrtAnalyzer *analyzer);
246
+ extern FrtAnalyzer *frt_per_field_analyzer_alloc(void);
247
+ extern void frt_per_field_analyzer_init(FrtAnalyzer *a, FrtAnalyzer *default_a);
248
+ extern FrtAnalyzer *frt_per_field_analyzer_new(FrtAnalyzer *default_a);
249
+ extern void frt_pfa_add_field(FrtAnalyzer *self, ID field, FrtAnalyzer *analyzer);
246
250
 
247
251
  #endif
@@ -109,13 +109,11 @@ void *frt_ary_remove_i(void **ary, int index)
109
109
  }
110
110
  }
111
111
 
112
- void frt_ary_delete_i(void **ary, int index, void (*free_elem)(void *p))
113
- {
112
+ void frt_ary_delete_i(void **ary, int index, void (*free_elem)(void *p)) {
114
113
  free_elem(frt_ary_remove(ary, index));
115
114
  }
116
115
 
117
- void frt_ary_destroy_i(void **ary, void (*free_elem)(void *p))
118
- {
116
+ void frt_ary_destroy_i(void **ary, void (*free_elem)(void *p)) {
119
117
  int i;
120
118
  for (i = frt_ary_sz(ary) - 1; i >= 0; i--) {
121
119
  free_elem(ary[i]);
@@ -1,8 +1,7 @@
1
1
  #include "frt_bitvector.h"
2
2
  #include <string.h>
3
3
 
4
- FrtBitVector *frt_bv_new_capa(int capa)
5
- {
4
+ FrtBitVector *frt_bv_new_capa(int capa) {
6
5
  FrtBitVector *bv = FRT_ALLOC_AND_ZERO(FrtBitVector);
7
6
 
8
7
  /* The capacity passed by the user is number of bits allowed, however we
@@ -11,37 +10,33 @@ FrtBitVector *frt_bv_new_capa(int capa)
11
10
  bv->bits = FRT_ALLOC_AND_ZERO_N(frt_u32, bv->capa);
12
11
  bv->curr_bit = -1;
13
12
  bv->ref_cnt = 1;
13
+ bv->rbv = Qnil;
14
14
  return bv;
15
15
  }
16
16
 
17
- FrtBitVector *frt_bv_new()
18
- {
17
+ FrtBitVector *frt_bv_new(void) {
19
18
  return frt_bv_new_capa(FRT_BV_INIT_CAPA);
20
19
  }
21
20
 
22
- void frt_bv_destroy(FrtBitVector *bv)
23
- {
21
+ void frt_bv_destroy(FrtBitVector *bv) {
24
22
  if (--(bv->ref_cnt) == 0) {
25
23
  free(bv->bits);
26
24
  free(bv);
27
25
  }
28
26
  }
29
27
 
30
- void frt_bv_clear(FrtBitVector *bv)
31
- {
28
+ void frt_bv_clear(FrtBitVector *bv) {
32
29
  memset(bv->bits, 0, bv->capa * sizeof(frt_u32));
33
30
  bv->extends_as_ones = 0;
34
31
  bv->count = 0;
35
32
  bv->size = 0;
36
33
  }
37
34
 
38
- void frt_bv_scan_reset(FrtBitVector *bv)
39
- {
35
+ void frt_bv_scan_reset(FrtBitVector *bv) {
40
36
  bv->curr_bit = -1;
41
37
  }
42
38
 
43
- int frt_bv_eq(FrtBitVector *bv1, FrtBitVector *bv2)
44
- {
39
+ int frt_bv_eq(FrtBitVector *bv1, FrtBitVector *bv2) {
45
40
  frt_u32 *bits, *bits2;
46
41
  int min_size, word_size, ext_word_size = 0, i;
47
42
  if (bv1 == bv2) {
@@ -65,8 +60,7 @@ int frt_bv_eq(FrtBitVector *bv1, FrtBitVector *bv2)
65
60
  if (bv1->size > min_size) {
66
61
  bits = bv1->bits;
67
62
  ext_word_size = FRT_TO_WORD(bv1->size);
68
- }
69
- else if (bv2->size > min_size) {
63
+ } else if (bv2->size > min_size) {
70
64
  bits = bv2->bits;
71
65
  ext_word_size = FRT_TO_WORD(bv2->size);
72
66
  }
@@ -81,8 +75,7 @@ int frt_bv_eq(FrtBitVector *bv1, FrtBitVector *bv2)
81
75
  return true;
82
76
  }
83
77
 
84
- unsigned long long frt_bv_hash(FrtBitVector *bv)
85
- {
78
+ unsigned long long frt_bv_hash(FrtBitVector *bv) {
86
79
  unsigned long long hash = 0;
87
80
  const frt_u32 empty_word = bv->extends_as_ones ? 0xFFFFFFFF : 0;
88
81
  int i;