isomorfeus-ferret 0.12.7 → 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (164) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +101 -19
  3. data/README.md +54 -1
  4. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +22 -30
  5. data/ext/isomorfeus_ferret_ext/bm_hash.c +6 -12
  6. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +3 -6
  7. data/ext/isomorfeus_ferret_ext/bm_store.c +11 -22
  8. data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +1 -1
  9. data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +1 -1
  10. data/ext/isomorfeus_ferret_ext/bzip_blocksort.c +1094 -0
  11. data/ext/isomorfeus_ferret_ext/bzip_huffman.c +205 -0
  12. data/ext/isomorfeus_ferret_ext/bzlib.c +1572 -0
  13. data/ext/isomorfeus_ferret_ext/bzlib.h +282 -0
  14. data/ext/isomorfeus_ferret_ext/bzlib_compress.c +672 -0
  15. data/ext/isomorfeus_ferret_ext/bzlib_crctable.c +104 -0
  16. data/ext/isomorfeus_ferret_ext/bzlib_decompress.c +652 -0
  17. data/ext/isomorfeus_ferret_ext/bzlib_private.h +509 -0
  18. data/ext/isomorfeus_ferret_ext/bzlib_randtable.c +84 -0
  19. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -53
  20. data/ext/isomorfeus_ferret_ext/frb_analysis.c +785 -1192
  21. data/ext/isomorfeus_ferret_ext/frb_index.c +492 -474
  22. data/ext/isomorfeus_ferret_ext/frb_qparser.c +48 -60
  23. data/ext/isomorfeus_ferret_ext/frb_search.c +1520 -1002
  24. data/ext/isomorfeus_ferret_ext/frb_store.c +96 -96
  25. data/ext/isomorfeus_ferret_ext/frb_threading.h +0 -1
  26. data/ext/isomorfeus_ferret_ext/frb_utils.c +147 -196
  27. data/ext/isomorfeus_ferret_ext/frt_analysis.c +695 -1090
  28. data/ext/isomorfeus_ferret_ext/frt_analysis.h +174 -170
  29. data/ext/isomorfeus_ferret_ext/frt_array.c +2 -4
  30. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +9 -16
  31. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +32 -81
  32. data/ext/isomorfeus_ferret_ext/frt_document.c +15 -20
  33. data/ext/isomorfeus_ferret_ext/frt_document.h +10 -10
  34. data/ext/isomorfeus_ferret_ext/frt_except.c +5 -12
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +3 -3
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +6 -7
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +35 -46
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +1 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +105 -63
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +7 -3
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +1 -2
  42. data/ext/isomorfeus_ferret_ext/frt_ind.c +32 -35
  43. data/ext/isomorfeus_ferret_ext/frt_ind.h +9 -9
  44. data/ext/isomorfeus_ferret_ext/frt_index.c +580 -399
  45. data/ext/isomorfeus_ferret_ext/frt_index.h +272 -291
  46. data/ext/isomorfeus_ferret_ext/frt_mempool.c +1 -2
  47. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +4 -7
  48. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +67 -91
  49. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +35 -38
  50. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +53 -72
  51. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +25 -32
  52. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +21 -23
  53. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +66 -103
  54. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +207 -195
  55. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +20 -16
  56. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +17 -14
  57. data/ext/isomorfeus_ferret_ext/frt_q_range.c +102 -131
  58. data/ext/isomorfeus_ferret_ext/frt_q_span.c +179 -178
  59. data/ext/isomorfeus_ferret_ext/frt_q_term.c +47 -60
  60. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +18 -16
  61. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +45 -84
  62. data/ext/isomorfeus_ferret_ext/frt_search.c +105 -146
  63. data/ext/isomorfeus_ferret_ext/frt_search.h +331 -320
  64. data/ext/isomorfeus_ferret_ext/frt_similarity.c +5 -13
  65. data/ext/isomorfeus_ferret_ext/frt_similarity.h +7 -12
  66. data/ext/isomorfeus_ferret_ext/frt_sort.c +105 -149
  67. data/ext/isomorfeus_ferret_ext/frt_store.c +13 -7
  68. data/ext/isomorfeus_ferret_ext/frt_store.h +10 -2
  69. data/ext/isomorfeus_ferret_ext/frt_threading.h +0 -1
  70. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +21 -109
  71. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +2 -32
  72. data/ext/isomorfeus_ferret_ext/lz4.c +2495 -0
  73. data/ext/isomorfeus_ferret_ext/lz4.h +774 -0
  74. data/ext/isomorfeus_ferret_ext/lz4frame.c +1899 -0
  75. data/ext/isomorfeus_ferret_ext/lz4frame.h +623 -0
  76. data/ext/isomorfeus_ferret_ext/lz4hc.c +1615 -0
  77. data/ext/isomorfeus_ferret_ext/lz4hc.h +413 -0
  78. data/ext/isomorfeus_ferret_ext/lz4xxhash.c +1030 -0
  79. data/ext/isomorfeus_ferret_ext/lz4xxhash.h +328 -0
  80. data/ext/isomorfeus_ferret_ext/stem_modules.h +0 -86
  81. data/ext/isomorfeus_ferret_ext/test.c +1 -2
  82. data/ext/isomorfeus_ferret_ext/test_1710.c +11 -12
  83. data/ext/isomorfeus_ferret_ext/test_analysis.c +590 -583
  84. data/ext/isomorfeus_ferret_ext/test_compound_io.c +1 -1
  85. data/ext/isomorfeus_ferret_ext/test_document.c +19 -15
  86. data/ext/isomorfeus_ferret_ext/test_except.c +1 -2
  87. data/ext/isomorfeus_ferret_ext/test_fields.c +59 -60
  88. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +10 -27
  89. data/ext/isomorfeus_ferret_ext/test_filter.c +11 -8
  90. data/ext/isomorfeus_ferret_ext/test_hash.c +2 -2
  91. data/ext/isomorfeus_ferret_ext/test_hashset.c +1 -1
  92. data/ext/isomorfeus_ferret_ext/test_highlighter.c +15 -11
  93. data/ext/isomorfeus_ferret_ext/test_index.c +372 -365
  94. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +5 -3
  95. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +5 -3
  96. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +13 -10
  97. data/ext/isomorfeus_ferret_ext/test_q_parser.c +45 -7
  98. data/ext/isomorfeus_ferret_ext/test_q_span.c +15 -12
  99. data/ext/isomorfeus_ferret_ext/test_ram_store.c +3 -3
  100. data/ext/isomorfeus_ferret_ext/test_search.c +60 -62
  101. data/ext/isomorfeus_ferret_ext/test_segments.c +5 -4
  102. data/ext/isomorfeus_ferret_ext/test_sort.c +17 -14
  103. data/ext/isomorfeus_ferret_ext/test_store.c +2 -0
  104. data/ext/isomorfeus_ferret_ext/test_term.c +3 -1
  105. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +9 -10
  106. data/ext/isomorfeus_ferret_ext/test_test.c +1 -2
  107. data/ext/isomorfeus_ferret_ext/test_threading.c +9 -10
  108. data/ext/isomorfeus_ferret_ext/testhelper.c +1 -2
  109. data/lib/isomorfeus/ferret/version.rb +1 -1
  110. metadata +27 -57
  111. data/ext/isomorfeus_ferret_ext/email.rl +0 -21
  112. data/ext/isomorfeus_ferret_ext/frt_scanner.c +0 -900
  113. data/ext/isomorfeus_ferret_ext/frt_scanner.h +0 -28
  114. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +0 -6706
  115. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +0 -4420
  116. data/ext/isomorfeus_ferret_ext/scanner.h +0 -28
  117. data/ext/isomorfeus_ferret_ext/scanner.in +0 -43
  118. data/ext/isomorfeus_ferret_ext/scanner.rl +0 -84
  119. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +0 -200
  120. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +0 -85
  121. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +0 -1167
  122. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +0 -6
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +0 -1433
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +0 -6
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +0 -301
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +0 -6
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +0 -590
  128. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +0 -6
  129. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +0 -1049
  130. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +0 -6
  131. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +0 -705
  132. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +0 -6
  133. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +0 -1239
  134. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +0 -6
  135. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +0 -477
  136. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +0 -6
  137. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +0 -1217
  138. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +0 -7
  139. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +0 -394
  140. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +0 -6
  141. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +0 -457
  142. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +0 -6
  143. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +0 -1009
  144. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +0 -6
  145. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +0 -259
  146. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +0 -6
  147. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +0 -704
  148. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +0 -6
  149. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +0 -948
  150. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +0 -6
  151. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +0 -1028
  152. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +0 -6
  153. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +0 -275
  154. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +0 -6
  155. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +0 -849
  156. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +0 -6
  157. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +0 -952
  158. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +0 -6
  159. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +0 -669
  160. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +0 -6
  161. data/ext/isomorfeus_ferret_ext/stem_modules.txt +0 -63
  162. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +0 -1854
  163. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +0 -1999
  164. data/ext/isomorfeus_ferret_ext/url.rl +0 -27
@@ -1,23 +1,16 @@
1
- #include <locale.h>
2
1
  #include "frt_analysis.h"
3
2
  #include "isomorfeus_ferret.h"
3
+ #include <ruby.h>
4
4
  #include <ruby/re.h>
5
- #include <ruby/st.h>
6
-
7
- static char *frb_locale = NULL;
8
5
 
9
6
  static VALUE mAnalysis;
10
7
 
11
8
  static VALUE cToken;
12
- static VALUE cAsciiLetterTokenizer;
13
9
  static VALUE cLetterTokenizer;
14
- static VALUE cAsciiWhiteSpaceTokenizer;
15
10
  static VALUE cWhiteSpaceTokenizer;
16
- static VALUE cAsciiStandardTokenizer;
17
11
  static VALUE cStandardTokenizer;
18
12
  static VALUE cRegExpTokenizer;
19
13
 
20
- static VALUE cAsciiLowerCaseFilter;
21
14
  static VALUE cLowerCaseFilter;
22
15
  static VALUE cStopFilter;
23
16
  static VALUE cMappingFilter;
@@ -25,11 +18,8 @@ static VALUE cHyphenFilter;
25
18
  static VALUE cStemFilter;
26
19
 
27
20
  static VALUE cAnalyzer;
28
- static VALUE cAsciiLetterAnalyzer;
29
21
  static VALUE cLetterAnalyzer;
30
- static VALUE cAsciiWhiteSpaceAnalyzer;
31
22
  static VALUE cWhiteSpaceAnalyzer;
32
- static VALUE cAsciiStandardAnalyzer;
33
23
  static VALUE cStandardAnalyzer;
34
24
  static VALUE cPerFieldAnalyzer;
35
25
  static VALUE cRegExpAnalyzer;
@@ -47,12 +37,10 @@ static ID id_token_stream;
47
37
 
48
38
  static VALUE object_space;
49
39
 
50
- extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int,
51
- int, struct re_registers *);
40
+ extern rb_encoding *utf8_encoding;
41
+ extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int, int, struct re_registers *);
52
42
 
53
- int
54
- frb_rb_hash_size(VALUE hash)
55
- {
43
+ int frb_rb_hash_size(VALUE hash) {
56
44
  #ifdef RHASH_SIZE
57
45
  return RHASH_SIZE(hash);
58
46
  #else
@@ -66,9 +54,7 @@ frb_rb_hash_size(VALUE hash)
66
54
  *
67
55
  ****************************************************************************/
68
56
 
69
- static char **
70
- get_stopwords(VALUE rstop_words)
71
- {
57
+ static char **get_stopwords(VALUE rstop_words) {
72
58
  char **stop_words;
73
59
  int i, len;
74
60
  VALUE rstr;
@@ -89,60 +75,43 @@ get_stopwords(VALUE rstop_words)
89
75
  *
90
76
  ****************************************************************************/
91
77
 
92
- typedef struct RToken {
93
- VALUE text;
94
- int start;
95
- int end;
96
- int pos_inc;
97
- } RToken;
98
-
99
- static void
100
- frb_token_free(void *p)
101
- {
78
+ static void frb_token_free(void *p) {
102
79
  free(p);
103
80
  }
104
81
 
105
- static void
106
- frb_token_mark(void *p)
107
- {
108
- RToken *token = (RToken *)p;
109
- rb_gc_mark(token->text);
110
- }
111
-
112
- static VALUE
113
- frb_token_alloc(VALUE klass)
114
- {
115
- return Data_Wrap_Struct(klass, &frb_token_mark, &frb_token_free,
116
- ALLOC(RToken));
117
- }
118
-
119
- static VALUE
120
- get_token(FrtToken *tk)
121
- {
122
- RToken *token = ALLOC(RToken);
123
-
124
- token->text = rb_str_new2(tk->text);
125
- token->start = tk->start;
126
- token->end = tk->end;
127
- token->pos_inc = tk->pos_inc;
128
- return Data_Wrap_Struct(cToken, &frb_token_mark, &frb_token_free, token);
129
- }
130
-
131
- FrtToken *
132
- frb_set_token(FrtToken *tk, VALUE rt)
133
- {
134
- RToken *rtk;
135
-
136
- if (rt == Qnil) return NULL;
137
-
138
- Data_Get_Struct(rt, RToken, rtk);
139
- frt_tk_set(tk, rs2s(rtk->text), RSTRING_LEN(rtk->text),
140
- rtk->start, rtk->end, rtk->pos_inc);
82
+ static size_t frb_token_size(const void *p) {
83
+ return sizeof(FrtToken);
84
+ (void)p;
85
+ }
86
+
87
+ const rb_data_type_t frb_token_t = {
88
+ .wrap_struct_name = "FrbToken",
89
+ .function = {
90
+ .dmark = NULL,
91
+ .dfree = frb_token_free,
92
+ .dsize = frb_token_size,
93
+ .dcompact = NULL,
94
+ .reserved = {0},
95
+ },
96
+ .parent = NULL,
97
+ .data = NULL,
98
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
99
+ };
100
+
101
+ static VALUE frb_token_alloc(VALUE rclass) {
102
+ FrtToken *tk = frt_tk_new();
103
+ return TypedData_Wrap_Struct(rclass, &frb_token_t, tk);
104
+ }
105
+
106
+ FrtToken *frb_set_token(FrtToken *tk, VALUE rt) {
107
+ FrtToken *rtk;
108
+ if (rt == Qnil)
109
+ return NULL;
110
+ TypedData_Get_Struct(rt, FrtToken, &frb_token_t, rtk);
111
+ frt_tk_set(tk, rtk->text, rtk->len, rtk->start, rtk->end, rtk->pos_inc, utf8_encoding);
141
112
  return tk;
142
113
  }
143
114
 
144
- #define GET_TK(tk, self) Data_Get_Struct(self, RToken, tk)
145
-
146
115
  /*
147
116
  * call-seq:
148
117
  * Token.new(text, start, end, pos_inc = 1) -> new Token
@@ -174,21 +143,17 @@ frb_set_token(FrtToken *tk, VALUE rt)
174
143
  * pos_inc:: the position increment of a token. See above.
175
144
  * return:: a newly created and assigned Token object
176
145
  */
177
- static VALUE
178
- frb_token_init(int argc, VALUE *argv, VALUE self)
179
- {
180
- RToken *token;
181
- VALUE rtext, rstart, rend, rpos_inc, rtype;
182
- GET_TK(token, self);
183
- token->pos_inc = 1;
184
- switch (rb_scan_args(argc, argv, "32", &rtext, &rstart,
185
- &rend, &rpos_inc, &rtype)) {
186
- case 5: /* type gets ignored at this stage */
187
- case 4: token->pos_inc = FIX2INT(rpos_inc);
146
+ static VALUE frb_token_init(int argc, VALUE *argv, VALUE self) {
147
+ FrtToken *tk;
148
+ char *text;
149
+ int pos_inc = 1;
150
+ VALUE rtext, rstart, rend, rpos_inc;
151
+ switch (rb_scan_args(argc, argv, "31", &rtext, &rstart, &rend, &rpos_inc)) {
152
+ case 4: pos_inc = FIX2INT(rpos_inc);
188
153
  }
189
- token->text = rb_obj_as_string(rtext);
190
- token->start = FIX2INT(rstart);
191
- token->end = FIX2INT(rend);
154
+ TypedData_Get_Struct(self, FrtToken, &frb_token_t, tk);
155
+ text = rs2s(rtext);
156
+ frt_tk_set(tk, text, strlen(text), FIX2INT(rstart), FIX2INT(rend), pos_inc, rb_enc_get(rtext));
192
157
  return self;
193
158
  }
194
159
 
@@ -204,27 +169,11 @@ frb_token_init(int argc, VALUE *argv, VALUE self)
204
169
  * pos_inc=) then, they are sorted by the end offset and then
205
170
  * lexically by the token text.
206
171
  */
207
- static VALUE
208
- frb_token_cmp(VALUE self, VALUE rother)
209
- {
210
- RToken *token, *other;
211
- int cmp;
212
- GET_TK(token, self);
213
- GET_TK(other, rother);
214
- if (token->start > other->start) {
215
- cmp = 1;
216
- } else if (token->start < other->start) {
217
- cmp = -1;
218
- } else {
219
- if (token->end > other->end) {
220
- cmp = 1;
221
- } else if (token->end < other->end) {
222
- cmp = -1;
223
- } else {
224
- cmp = strcmp(rs2s(token->text), rs2s(other->text));
225
- }
226
- }
227
- return INT2FIX(cmp);
172
+ static VALUE frb_token_cmp(VALUE self, VALUE rother) {
173
+ FrtToken *tk, *other;
174
+ TypedData_Get_Struct(self, FrtToken, &frb_token_t, tk);
175
+ TypedData_Get_Struct(rother, FrtToken, &frb_token_t, other);
176
+ return INT2FIX(frt_tk_cmp(tk, other));
228
177
  }
229
178
 
230
179
  /*
@@ -233,12 +182,12 @@ frb_token_cmp(VALUE self, VALUE rother)
233
182
  *
234
183
  * Returns the text that this token represents
235
184
  */
236
- static VALUE
237
- frb_token_get_text(VALUE self)
238
- {
239
- RToken *token;
240
- GET_TK(token, self);
241
- return token->text;
185
+ static VALUE frb_token_get_text(VALUE self) {
186
+ FrtToken *tk;
187
+ TypedData_Get_Struct(self, FrtToken, &frb_token_t, tk);
188
+ VALUE rtext = rb_str_new2(tk->text);
189
+ rb_enc_associate(rtext, utf8_encoding);
190
+ return rtext;
242
191
  }
243
192
 
244
193
  /*
@@ -247,13 +196,12 @@ frb_token_get_text(VALUE self)
247
196
  *
248
197
  * Set the text for this token.
249
198
  */
250
- static VALUE
251
- frb_token_set_text(VALUE self, VALUE rtext)
252
- {
253
- RToken *token;
254
- GET_TK(token, self);
255
- token->text = rtext;
256
- return rtext;
199
+ static VALUE frb_token_set_text(VALUE self, VALUE rtext) {
200
+ FrtToken *tk;
201
+ TypedData_Get_Struct(self, FrtToken, &frb_token_t, tk);
202
+ char *text = rs2s(rtext);
203
+ frt_tk_set(tk, text, strlen(text), tk->start, tk->end, tk->pos_inc, rb_enc_get(rtext));
204
+ return frb_token_get_text(self);
257
205
  }
258
206
 
259
207
  /*
@@ -262,12 +210,10 @@ frb_token_set_text(VALUE self, VALUE rtext)
262
210
  *
263
211
  * Start byte-position of this token
264
212
  */
265
- static VALUE
266
- frb_token_get_start_offset(VALUE self)
267
- {
268
- RToken *token;
269
- GET_TK(token, self);
270
- return INT2FIX(token->start);
213
+ static VALUE frb_token_get_start_offset(VALUE self) {
214
+ FrtToken *tk;
215
+ TypedData_Get_Struct(self, FrtToken, &frb_token_t, tk);
216
+ return INT2FIX(tk->start);
271
217
  }
272
218
 
273
219
  /*
@@ -276,12 +222,10 @@ frb_token_get_start_offset(VALUE self)
276
222
  *
277
223
  * End byte-position of this token
278
224
  */
279
- static VALUE
280
- frb_token_get_end_offset(VALUE self)
281
- {
282
- RToken *token;
283
- GET_TK(token, self);
284
- return INT2FIX(token->end);
225
+ static VALUE frb_token_get_end_offset(VALUE self) {
226
+ FrtToken *tk;
227
+ TypedData_Get_Struct(self, FrtToken, &frb_token_t, tk);
228
+ return INT2FIX(tk->end);
285
229
  }
286
230
 
287
231
  /*
@@ -290,12 +234,10 @@ frb_token_get_end_offset(VALUE self)
290
234
  *
291
235
  * Position Increment for this token
292
236
  */
293
- static VALUE
294
- frb_token_get_pos_inc(VALUE self)
295
- {
296
- RToken *token;
297
- GET_TK(token, self);
298
- return INT2FIX(token->pos_inc);
237
+ static VALUE frb_token_get_pos_inc(VALUE self) {
238
+ FrtToken *tk;
239
+ TypedData_Get_Struct(self, FrtToken, &frb_token_t, tk);
240
+ return INT2FIX(tk->pos_inc);
299
241
  }
300
242
 
301
243
  /*
@@ -304,12 +246,10 @@ frb_token_get_pos_inc(VALUE self)
304
246
  *
305
247
  * Set start byte-position of this token
306
248
  */
307
- static VALUE
308
- frb_token_set_start_offset(VALUE self, VALUE rstart)
309
- {
310
- RToken *token;
311
- GET_TK(token, self);
312
- token->start = FIX2INT(rstart);
249
+ static VALUE frb_token_set_start_offset(VALUE self, VALUE rstart) {
250
+ FrtToken *tk;
251
+ TypedData_Get_Struct(self, FrtToken, &frb_token_t, tk);
252
+ tk->start = FIX2INT(rstart);
313
253
  return rstart;
314
254
  }
315
255
 
@@ -319,12 +259,10 @@ frb_token_set_start_offset(VALUE self, VALUE rstart)
319
259
  *
320
260
  * Set end byte-position of this token
321
261
  */
322
- static VALUE
323
- frb_token_set_end_offset(VALUE self, VALUE rend)
324
- {
325
- RToken *token;
326
- GET_TK(token, self);
327
- token->end = FIX2INT(rend);
262
+ static VALUE frb_token_set_end_offset(VALUE self, VALUE rend) {
263
+ FrtToken *tk;
264
+ TypedData_Get_Struct(self, FrtToken, &frb_token_t, tk);
265
+ tk->end = FIX2INT(rend);
328
266
  return rend;
329
267
  }
330
268
 
@@ -355,12 +293,10 @@ frb_token_set_end_offset(VALUE self, VALUE rend)
355
293
  * when the terms occur with no intervening stop words.
356
294
  *
357
295
  */
358
- static VALUE
359
- frb_token_set_pos_inc(VALUE self, VALUE rpos_inc)
360
- {
361
- RToken *token;
362
- GET_TK(token, self);
363
- token->pos_inc = FIX2INT(rpos_inc);
296
+ static VALUE frb_token_set_pos_inc(VALUE self, VALUE rpos_inc) {
297
+ FrtToken *tk;
298
+ TypedData_Get_Struct(self, FrtToken, &frb_token_t, tk);
299
+ tk->pos_inc = FIX2INT(rpos_inc);
364
300
  return rpos_inc;
365
301
  }
366
302
 
@@ -370,16 +306,16 @@ frb_token_set_pos_inc(VALUE self, VALUE rpos_inc)
370
306
  *
371
307
  * Return a string representation of the token
372
308
  */
373
- static VALUE
374
- frb_token_to_s(VALUE self)
375
- {
376
- RToken *token;
309
+ static VALUE frb_token_to_s(VALUE self) {
310
+ FrtToken *tk;
377
311
  char *buf;
378
- GET_TK(token, self);
379
- buf = alloca(RSTRING_LEN(token->text) + 80);
380
- sprintf(buf, "token[\"%s\":%d:%d:%d]", rs2s(token->text),
381
- token->start, token->end, token->pos_inc);
382
- return rb_str_new2(buf);
312
+ VALUE rstr;
313
+ TypedData_Get_Struct(self, FrtToken, &frb_token_t, tk);
314
+ buf = alloca(strlen(tk->text) + 80);
315
+ sprintf(buf, "token[\"%s\":%d:%d:%d]", tk->text, (int)tk->start, (int)tk->end, tk->pos_inc);
316
+ rstr = rb_str_new2(buf);
317
+ rb_enc_associate(rstr, utf8_encoding);
318
+ return rstr;
383
319
  }
384
320
 
385
321
  /****************************************************************************
@@ -388,57 +324,77 @@ frb_token_to_s(VALUE self)
388
324
  *
389
325
  ****************************************************************************/
390
326
 
391
- #define GET_TS(ts, self) Data_Get_Struct(self, FrtTokenStream, ts)
392
-
393
- static void
394
- frb_ts_mark(void *p)
395
- {
396
- FrtTokenStream *ts = (FrtTokenStream *)p;
397
- if (ts->text) frb_gc_mark(&ts->text);
327
+ static void frb_ts_free(void *p) {
328
+ frt_ts_deref((FrtTokenStream *)p);
398
329
  }
399
330
 
400
- static void
401
- frb_ts_free(FrtTokenStream *ts)
402
- {
403
- if (object_get(&ts->text) != Qnil) {
404
- object_del(&ts->text);
405
- }
406
- object_del(ts);
407
- frt_ts_deref(ts);
331
+ static size_t frb_ts_size(const void *p) {
332
+ return sizeof(FrtTokenStream);
333
+ (void)p;
408
334
  }
409
335
 
410
- static void frb_rets_free(FrtTokenStream *ts);
411
- static void frb_rets_mark(FrtTokenStream *ts);
336
+ typedef struct RegExpTokenStream {
337
+ FrtTokenStream super;
338
+ VALUE regex;
339
+ VALUE proc;
340
+ long curr_ind;
341
+ } RegExpTokenStream;
342
+
343
+ static void frb_rets_mark(void *p) {
344
+ RegExpTokenStream *ts = (RegExpTokenStream *)p;
345
+ rb_gc_mark(ts->regex);
346
+ rb_gc_mark(ts->proc);
347
+ }
348
+
349
+ static size_t frb_rets_size(const void *p) {
350
+ return sizeof(RegExpTokenStream);
351
+ (void)p;
352
+ }
353
+
354
+ const rb_data_type_t frb_token_stream_t = {
355
+ .wrap_struct_name = "FrbTokenStream",
356
+ .function = {
357
+ .dmark = NULL,
358
+ .dfree = frb_ts_free,
359
+ .dsize = frb_ts_size,
360
+ .dcompact = NULL,
361
+ .reserved = {0},
362
+ },
363
+ .parent = NULL,
364
+ .data = NULL,
365
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
366
+ };
367
+
368
+ const rb_data_type_t frb_reg_exp_token_stream_t = {
369
+ .wrap_struct_name = "FrbRegExpTokenStream",
370
+ .function = {
371
+ .dmark = frb_rets_mark,
372
+ .dfree = frb_ts_free,
373
+ .dsize = frb_rets_size,
374
+ .dcompact = NULL,
375
+ .reserved = {0},
376
+ },
377
+ .parent = &frb_token_stream_t,
378
+ .data = NULL,
379
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
380
+ };
381
+
412
382
  static FrtToken *rets_next(FrtTokenStream *ts);
413
383
 
414
- static VALUE
415
- get_rb_token_stream(FrtTokenStream *ts)
416
- {
417
- VALUE rts = object_get(ts);
418
- if (rts == Qnil) {
384
+ static VALUE get_rb_token_stream(FrtTokenStream *ts) {
385
+ VALUE rts = ts->rts;
386
+
387
+ if (rts == 0 || rts == Qnil) {
419
388
  if (ts->next == &rets_next) {
420
- rts = Data_Wrap_Struct(cTokenStream, &frb_rets_mark,
421
- &frb_rets_free, ts);
389
+ rts = TypedData_Wrap_Struct(cTokenStream, &frb_reg_exp_token_stream_t, ts);
422
390
  } else {
423
- rts = Data_Wrap_Struct(cTokenStream, &frb_ts_mark,
424
- &frb_ts_free, ts);
391
+ rts = TypedData_Wrap_Struct(cTokenStream, &frb_token_stream_t, ts);
425
392
  }
426
- object_add(ts, rts);
393
+ ts->rts = rts;
427
394
  }
428
395
  return rts;
429
396
  }
430
397
 
431
- static VALUE
432
- get_wrapped_ts(VALUE self, VALUE rstr, FrtTokenStream *ts)
433
- {
434
- StringValue(rstr);
435
- ts->reset(ts, rs2s(rstr));
436
- Frt_Wrap_Struct(self, &frb_ts_mark, &frb_ts_free, ts);
437
- object_add(&ts->text, rstr);
438
- object_add(ts, self);
439
- return self;
440
- }
441
-
442
398
  /*
443
399
  * call-seq:
444
400
  * token_stream.text = text -> text
@@ -448,17 +404,12 @@ get_wrapped_ts(VALUE self, VALUE rstr, FrtTokenStream *ts)
448
404
  *
449
405
  * token_stream.text = File.read(file_name)
450
406
  */
451
- static VALUE
452
- frb_ts_set_text(VALUE self, VALUE rtext)
453
- {
407
+ static VALUE frb_ts_set_text(VALUE self, VALUE rtext) {
454
408
  FrtTokenStream *ts;
455
- Data_Get_Struct(self, FrtTokenStream, ts);
409
+ // TypedData_Get_Struct(self, FrtTokenStream, &frb_token_stream_t, ts);
410
+ ts = DATA_PTR(self);
456
411
  StringValue(rtext);
457
- ts->reset(ts, rs2s(rtext));
458
-
459
- /* prevent garbage collection */
460
- rb_ivar_set(self, id_text, rtext);
461
-
412
+ ts->reset(ts, rs2s(rtext), rb_enc_get(rtext));
462
413
  return rtext;
463
414
  }
464
415
 
@@ -468,17 +419,14 @@ frb_ts_set_text(VALUE self, VALUE rtext)
468
419
  *
469
420
  * Return the text that the TokenStream is tokenizing
470
421
  */
471
- static VALUE
472
- frb_ts_get_text(VALUE self)
473
- {
422
+ static VALUE frb_ts_get_text(VALUE self) {
474
423
  VALUE rtext = Qnil;
475
424
  FrtTokenStream *ts;
476
- Data_Get_Struct(self, FrtTokenStream, ts);
477
- if ((rtext = object_get(&ts->text)) == Qnil) {
478
- if (ts->text) {
479
- rtext = rb_str_new2(ts->text);
480
- object_set(&ts->text, rtext);
481
- }
425
+ // TypedData_Get_Struct(self, FrtTokenStream, &frb_token_stream_t, ts);
426
+ ts = DATA_PTR(self);
427
+ if (ts->text) {
428
+ rtext = rb_str_new2(ts->text);
429
+ rb_enc_associate(rtext, ts->encoding);
482
430
  }
483
431
  return rtext;
484
432
  }
@@ -490,18 +438,16 @@ frb_ts_get_text(VALUE self)
490
438
  * Return the next token from the TokenStream or nil if there are no more
491
439
  * tokens.
492
440
  */
493
- static VALUE
494
- frb_ts_next(VALUE self)
495
- {
496
- FrtTokenStream *ts;
497
- FrtToken *next;
498
- GET_TS(ts, self);
499
- next = ts->next(ts);
500
- if (next == NULL) {
441
+ static VALUE frb_ts_next(VALUE self) {
442
+ FrtTokenStream *ts = DATA_PTR(self);
443
+ FrtToken *next_tk;
444
+ FrtToken *tk = ts->next(ts);
445
+ if (tk == NULL) {
501
446
  return Qnil;
502
447
  }
503
-
504
- return get_token(next);
448
+ next_tk = frt_tk_new();
449
+ frt_tk_set(next_tk, tk->text, tk->len, tk->start, tk->end, tk->pos_inc, utf8_encoding);
450
+ return TypedData_Wrap_Struct(cToken, &frb_token_t, next_tk);
505
451
  }
506
452
 
507
453
  /****************************************************************************
@@ -510,83 +456,73 @@ frb_ts_next(VALUE self)
510
456
 
511
457
  #define TkFilt(filter) ((FrtTokenFilter *)(filter))
512
458
 
513
- static void
514
- frb_tf_mark(void *p)
515
- {
459
+ static void frb_tf_mark(void *p) {
516
460
  FrtTokenStream *ts = (FrtTokenStream *)p;
517
- if (TkFilt(ts)->sub_ts) {
518
- frb_gc_mark(&TkFilt(ts)->sub_ts);
519
- }
461
+ if (TkFilt(ts)->sub_ts->rts)
462
+ rb_gc_mark(TkFilt(ts)->sub_ts->rts);
520
463
  }
521
464
 
522
- static void
523
- frb_tf_free(FrtTokenStream *ts)
524
- {
525
- if (TkFilt(ts)->sub_ts && (object_get(&TkFilt(ts)->sub_ts) != Qnil)) {
526
- object_del(&TkFilt(ts)->sub_ts);
527
- }
528
- object_del(ts);
529
- frt_ts_deref(ts);
465
+ static void frb_tf_free(void *p) {
466
+ frt_ts_deref((FrtTokenStream *)p);
530
467
  }
531
468
 
469
+ static size_t frb_tf_size(const void *p) {
470
+ return sizeof(FrtTokenFilter);
471
+ (void)p;
472
+ }
473
+
474
+ const rb_data_type_t frb_token_filter_t = {
475
+ .wrap_struct_name = "FrbTokenFilter",
476
+ .function = {
477
+ .dmark = frb_tf_mark,
478
+ .dfree = frb_tf_free,
479
+ .dsize = frb_tf_size,
480
+ .dcompact = NULL,
481
+ .reserved = {0},
482
+ },
483
+ .parent = NULL,
484
+ .data = NULL,
485
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
486
+ };
532
487
 
533
488
  /****************************************************************************
534
- * CWrappedTokenStream
489
+ * Wrapped TokenStream
535
490
  ****************************************************************************/
536
491
 
537
- #define CachedTS(token_stream) ((FrtCachedTokenStream *)(token_stream))
538
- #define CWTS(token_stream) ((CWrappedTokenStream *)(token_stream))
539
-
540
- typedef struct CWrappedTokenStream {
541
- FrtCachedTokenStream super;
542
- VALUE rts;
543
- } CWrappedTokenStream;
544
-
545
- static void
546
- cwrts_destroy_i(FrtTokenStream *ts)
547
- {
548
- if (object_get(&ts->text) != Qnil) {
549
- object_del(&ts->text);
550
- }
492
+ static void cwrts_destroy_i(FrtTokenStream *ts) {
551
493
  rb_hash_delete(object_space, ((VALUE)ts)|1);
552
494
  free(ts);
553
495
  }
554
496
 
555
- static FrtToken *
556
- cwrts_next(FrtTokenStream *ts)
557
- {
558
- VALUE rtoken = rb_funcall(CWTS(ts)->rts, id_next, 0);
559
- return frb_set_token(&(CachedTS(ts)->token), rtoken);
497
+ static FrtToken *cwrts_next(FrtTokenStream *ts) {
498
+ VALUE rtoken = rb_funcall(ts->rts, id_next, 0);
499
+ return frb_set_token(&(ts->token), rtoken);
560
500
  }
561
501
 
562
- static FrtTokenStream *
563
- cwrts_reset(FrtTokenStream *ts, char *text)
564
- {
502
+ static FrtTokenStream *cwrts_reset(FrtTokenStream *ts, char *text, rb_encoding *encoding) {
565
503
  ts->t = ts->text = text;
566
- rb_funcall(CWTS(ts)->rts, id_reset, 1, rb_str_new2(text));
504
+ ts->length = strlen(text);
505
+ ts->encoding = encoding;
506
+ rb_funcall(ts->rts, id_reset, 1, rb_str_new2(text));
567
507
  return ts;
568
508
  }
569
509
 
570
- static FrtTokenStream *
571
- cwrts_clone_i(FrtTokenStream *orig_ts)
572
- {
573
- FrtTokenStream *new_ts = frt_ts_clone_size(orig_ts, sizeof(CWrappedTokenStream));
574
- VALUE rts = CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
510
+ static FrtTokenStream *cwrts_clone_i(FrtTokenStream *orig_ts) {
511
+ FrtTokenStream *new_ts = frt_ts_clone_size(orig_ts, sizeof(FrtTokenStream));
512
+ VALUE rts = new_ts->rts = rb_funcall(orig_ts->rts, id_clone, 0);
575
513
  rb_hash_aset(object_space, ((VALUE)new_ts)|1, rts);
576
514
  return new_ts;
577
515
  }
578
516
 
579
- static FrtTokenStream *
580
- frb_get_cwrapped_rts(VALUE rts)
581
- {
517
+ static FrtTokenStream *frb_get_cwrapped_rts(VALUE rts) {
582
518
  FrtTokenStream *ts;
583
519
  if (frb_is_cclass(rts) && DATA_PTR(rts)) {
584
- GET_TS(ts, rts);
520
+ // TypedData_Get_Struct(rts, FrtTokenStream, &frb_token_stream_t, ts);
521
+ ts = DATA_PTR(rts);
585
522
  FRT_REF(ts);
586
- }
587
- else {
588
- ts = frt_ts_new(CWrappedTokenStream);
589
- CWTS(ts)->rts = rts;
523
+ } else {
524
+ ts = frt_ts_new_i(sizeof(FrtTokenStream));
525
+ ts->rts = rts;
590
526
  ts->next = &cwrts_next;
591
527
  ts->reset = &cwrts_reset;
592
528
  ts->clone_i = &cwrts_clone_i;
@@ -619,43 +555,11 @@ static const char *TOKEN_RE =
619
555
  ")";
620
556
  static VALUE rtoken_re;
621
557
 
622
- typedef struct RegExpTokenStream {
623
- FrtCachedTokenStream super;
624
- VALUE rtext;
625
- VALUE regex;
626
- VALUE proc;
627
- long curr_ind;
628
- } RegExpTokenStream;
629
-
630
- static void
631
- rets_destroy_i(FrtTokenStream *ts)
632
- {
633
- if (object_get(&ts->text) != Qnil) {
634
- object_del(&ts->text);
635
- }
558
+ static void rets_destroy_i(FrtTokenStream *ts) {
636
559
  rb_hash_delete(object_space, ((VALUE)ts)|1);
637
560
  free(ts);
638
561
  }
639
562
 
640
- static void
641
- frb_rets_free(FrtTokenStream *ts)
642
- {
643
- if (object_get(&ts->text) != Qnil) {
644
- object_del(&ts->text);
645
- }
646
- object_del(ts);
647
- frt_ts_deref(ts);
648
- }
649
-
650
- static void
651
- frb_rets_mark(FrtTokenStream *ts)
652
- {
653
- if (ts->text) frb_gc_mark(&ts->text);
654
- rb_gc_mark(RETS(ts)->rtext);
655
- rb_gc_mark(RETS(ts)->regex);
656
- rb_gc_mark(RETS(ts)->proc);
657
- }
658
-
659
563
  /*
660
564
  * call-seq:
661
565
  * tokenizer.text = text -> text
@@ -663,17 +567,11 @@ frb_rets_mark(FrtTokenStream *ts)
663
567
  * Set the text to be tokenized by the tokenizer. The tokenizer gets reset to
664
568
  * tokenize the text from the beginning.
665
569
  */
666
- static VALUE
667
- frb_rets_set_text(VALUE self, VALUE rtext)
668
- {
570
+ static VALUE frb_rets_set_text(VALUE self, VALUE rtext) {
669
571
  FrtTokenStream *ts;
670
- GET_TS(ts, self);
671
-
672
- rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
572
+ TypedData_Get_Struct(self, FrtTokenStream, &frb_reg_exp_token_stream_t, ts);
673
573
  StringValue(rtext);
674
- RETS(ts)->rtext = rtext;
675
- RETS(ts)->curr_ind = 0;
676
-
574
+ ts->reset(ts, rs2s(rtext), rb_enc_get(rtext));
677
575
  return rtext;
678
576
  }
679
577
 
@@ -683,12 +581,12 @@ frb_rets_set_text(VALUE self, VALUE rtext)
683
581
  *
684
582
  * Get the text being tokenized by the tokenizer.
685
583
  */
686
- static VALUE
687
- frb_rets_get_text(VALUE self)
688
- {
584
+ static VALUE frb_rets_get_text(VALUE self) {
689
585
  FrtTokenStream *ts;
690
- GET_TS(ts, self);
691
- return RETS(ts)->rtext;
586
+ TypedData_Get_Struct(self, FrtTokenStream, &frb_reg_exp_token_stream_t, ts);
587
+ VALUE rstr = rb_str_new2(ts->text);
588
+ rb_enc_associate(rstr, ts->encoding);
589
+ return rstr;
692
590
  }
693
591
 
694
592
  // partly lifted from ruby 1.9 string.c
@@ -696,92 +594,84 @@ frb_rets_get_text(VALUE self)
696
594
  #define BEG(no) regs->beg[no]
697
595
  #define END(no) regs->end[no]
698
596
  #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
699
- static VALUE
700
- scan_once(VALUE str, VALUE pat, long *start)
701
- {
702
- VALUE match;
703
- struct re_registers *regs;
704
-
705
- if (rb_reg_search(pat, str, *start, 0) >= 0) {
706
- match = rb_backref_get();
707
- regs = RMATCH_REGS(match);
708
- if (BEG(0) == END(0)) {
709
- rb_encoding *enc = STR_ENC_GET(str);
710
- /*
711
- * Always consume at least one character of the input string
712
- */
713
- if (RSTRING_LEN(str) > END(0))
714
- *start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),
715
- RSTRING_END(str), enc);
716
- else
717
- *start = END(0)+1;
718
- }
719
- else {
720
- *start = END(0);
597
+ static VALUE scan_once(VALUE rstr, VALUE pat, long *start) {
598
+ VALUE match;
599
+ struct re_registers *regs;
600
+ if (rb_reg_search(pat, rstr, *start, 0) >= 0) {
601
+ match = rb_backref_get();
602
+ regs = RMATCH_REGS(match);
603
+ if (BEG(0) == END(0)) {
604
+ rb_encoding *enc = STR_ENC_GET(rstr);
605
+ /* Always consume at least one character of the input string */
606
+ if (RSTRING_LEN(rstr) > END(0))
607
+ *start = END(0)+rb_enc_mbclen(RSTRING_PTR(rstr)+END(0), RSTRING_END(rstr), enc);
608
+ else
609
+ *start = END(0)+1;
610
+ } else {
611
+ *start = END(0);
612
+ }
613
+ return rb_reg_nth_match(0, match);
721
614
  }
722
- return rb_reg_nth_match(0, match);
723
- }
724
615
  return Qnil;
725
616
  }
726
- //
727
617
 
728
- static FrtToken *
729
- rets_next(FrtTokenStream *ts)
730
- {
731
- VALUE ret;
732
- long rtok_len;
733
- int beg, end;
734
- Check_Type(RETS(ts)->regex, T_REGEXP);
735
- ret = scan_once(RETS(ts)->rtext, RETS(ts)->regex, &(RETS(ts)->curr_ind));
736
- if (NIL_P(ret)) return NULL;
737
-
738
- Check_Type(ret, T_STRING);
739
- rtok_len = RSTRING_LEN(ret);
740
- beg = RETS(ts)->curr_ind - rtok_len;
741
- end = RETS(ts)->curr_ind;
742
-
743
- if (NIL_P(RETS(ts)->proc)) {
744
- return frt_tk_set(&(CachedTS(ts)->token), rs2s(ret), rtok_len,
745
- beg, end, 1);
746
- } else {
747
- VALUE rtok;
748
- rtok = rb_funcall(RETS(ts)->proc, id_call, 1, ret);
749
- return frt_tk_set(&(CachedTS(ts)->token), rs2s(rtok),
750
- RSTRING_LEN(rtok), beg, end, 1);
751
- }
752
- }
753
-
754
- static FrtTokenStream *
755
- rets_reset(FrtTokenStream *ts, char *text)
756
- {
757
- RETS(ts)->rtext = rb_str_new2(text);
618
+ static FrtToken *rets_next(FrtTokenStream *ts) {
619
+ VALUE ret;
620
+ long rtok_len;
621
+ int beg, end;
622
+ Check_Type(RETS(ts)->regex, T_REGEXP);
623
+
624
+ VALUE rstr = rb_str_new_static(ts->text, ts->length);
625
+ rb_enc_associate(rstr, ts->encoding);
626
+
627
+ ret = scan_once(rstr, RETS(ts)->regex, &(RETS(ts)->curr_ind));
628
+
629
+ if (NIL_P(ret))
630
+ return NULL;
631
+
632
+ Check_Type(ret, T_STRING);
633
+ rtok_len = RSTRING_LEN(ret);
634
+ beg = RETS(ts)->curr_ind - rtok_len;
635
+ end = RETS(ts)->curr_ind;
636
+
637
+ if (NIL_P(RETS(ts)->proc)) {
638
+ return frt_tk_set(&(ts->token), rs2s(ret), rtok_len, beg, end, 1, rb_enc_get(ret));
639
+ } else {
640
+ VALUE rtok;
641
+ rtok = rb_funcall(RETS(ts)->proc, id_call, 1, ret);
642
+ return frt_tk_set(&(ts->token), rs2s(rtok), RSTRING_LEN(rtok), beg, end, 1, rb_enc_get(rtok));
643
+ }
644
+ }
645
+
646
+ static FrtTokenStream *rets_reset(FrtTokenStream *ts, char *text, rb_encoding *encoding) {
647
+ frt_ts_reset(ts, text, encoding);
758
648
  RETS(ts)->curr_ind = 0;
759
649
  return ts;
760
650
  }
761
651
 
762
- static FrtTokenStream *
763
- rets_clone_i(FrtTokenStream *orig_ts)
764
- {
652
+ static FrtTokenStream *rets_clone_i(FrtTokenStream *orig_ts) {
765
653
  FrtTokenStream *ts = frt_ts_clone_size(orig_ts, sizeof(RegExpTokenStream));
766
654
  return ts;
767
655
  }
768
656
 
769
- static FrtTokenStream *
770
- rets_new(VALUE rtext, VALUE regex, VALUE proc)
771
- {
772
- FrtTokenStream *ts = frt_ts_new(RegExpTokenStream);
657
+ FrtTokenStream *rets_alloc(void) {
658
+ return (FrtTokenStream *)frt_ecalloc(sizeof(RegExpTokenStream));
659
+ }
773
660
 
774
- if (rtext != Qnil) {
775
- rtext = StringValue(rtext);
776
- rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
777
- }
661
+ FrtTokenStream *rets_init(FrtTokenStream *ts, VALUE rtext, VALUE regex, VALUE proc) {
662
+ ts = frt_ts_init(ts);
778
663
  ts->reset = &rets_reset;
779
664
  ts->next = &rets_next;
780
665
  ts->clone_i = &rets_clone_i;
781
666
  ts->destroy_i = &rets_destroy_i;
782
667
 
783
- RETS(ts)->curr_ind = 0;
784
- RETS(ts)->rtext = rtext;
668
+ if (rtext != Qnil) {
669
+ rtext = StringValue(rtext);
670
+ ts->reset(ts, rs2s(rtext), rb_enc_get(rtext));
671
+ } else {
672
+ RETS(ts)->curr_ind = 0;
673
+ }
674
+
785
675
  RETS(ts)->proc = proc;
786
676
 
787
677
  if (NIL_P(regex)) {
@@ -790,10 +680,20 @@ rets_new(VALUE rtext, VALUE regex, VALUE proc)
790
680
  Check_Type(regex, T_REGEXP);
791
681
  RETS(ts)->regex = regex;
792
682
  }
793
-
794
683
  return ts;
795
684
  }
796
685
 
686
+ static FrtTokenStream *rets_new(VALUE rtext, VALUE regex, VALUE proc) {
687
+ FrtTokenStream *ts = rets_alloc();
688
+ return rets_init(ts, rtext, regex, proc);
689
+ }
690
+
691
+
692
+ static VALUE frb_reg_exp_tokenizer_alloc(VALUE rclass) {
693
+ FrtTokenStream *ts = rets_alloc();
694
+ return TypedData_Wrap_Struct(rclass, &frb_reg_exp_token_stream_t, ts);
695
+ }
696
+
797
697
  /*
798
698
  * call-seq:
799
699
  * RegExpTokenizer.new(input, /[[:alpha:]]+/)
@@ -803,18 +703,13 @@ rets_new(VALUE rtext, VALUE regex, VALUE proc)
803
703
  * input:: text to tokenize
804
704
  * regexp:: regular expression used to recognize tokens in the input
805
705
  */
806
- static VALUE
807
- frb_rets_init(int argc, VALUE *argv, VALUE self)
808
- {
706
+ static VALUE frb_rets_init(int argc, VALUE *argv, VALUE self) {
809
707
  VALUE rtext, regex, proc;
810
- FrtTokenStream *ts;
811
-
812
708
  rb_scan_args(argc, argv, "11&", &rtext, &regex, &proc);
813
-
814
- ts = rets_new(rtext, regex, proc);
815
-
816
- Frt_Wrap_Struct(self, &frb_rets_mark, &frb_rets_free, ts);
817
- object_add(ts, self);
709
+ FrtTokenStream *ts;
710
+ TypedData_Get_Struct(self, FrtTokenStream, &frb_reg_exp_token_stream_t, ts);
711
+ rets_init(ts, rtext, regex, proc);
712
+ ts->rts = self;
818
713
  return self;
819
714
  }
820
715
 
@@ -822,129 +717,72 @@ frb_rets_init(int argc, VALUE *argv, VALUE self)
822
717
  * Tokenizers
823
718
  ****************************************************************************/
824
719
 
825
- #define TS_ARGS(dflt) \
826
- bool lower;\
827
- VALUE rlower, rstr;\
828
- rb_scan_args(argc, argv, "11", &rstr, &rlower);\
829
- lower = (argc ? RTEST(rlower) : dflt)
830
-
831
720
  /*
832
721
  * call-seq:
833
- * AsciiLetterTokenizer.new() -> tokenizer
722
+ * LetterTokenizer.new() -> tokenizer
834
723
  *
835
- * Create a new AsciiLetterTokenizer
724
+ * Create a new LetterTokenizer.
836
725
  */
837
- static VALUE
838
- frb_a_letter_tokenizer_init(VALUE self, VALUE rstr)
839
- {
840
- return get_wrapped_ts(self, rstr, frt_letter_tokenizer_new());
726
+ static VALUE frb_letter_tokenizer_alloc(VALUE rclass) {
727
+ FrtTokenStream *ts = frt_letter_tokenizer_new();
728
+ return TypedData_Wrap_Struct(rclass, &frb_token_stream_t, ts);
841
729
  }
842
730
 
843
- /*
844
- * call-seq:
845
- * LetterTokenizer.new(lower = true) -> tokenizer
846
- *
847
- * Create a new LetterTokenizer which optionally downcases tokens. Downcasing
848
- * is done according the current locale.
849
- *
850
- * lower:: set to false if you don't wish to downcase tokens
851
- */
852
- static VALUE
853
- frb_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
854
- {
855
- TS_ARGS(false);
856
- #if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
857
- if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
858
- #endif
859
- return get_wrapped_ts(self, rstr, frt_mb_letter_tokenizer_new(lower));
731
+ static VALUE frb_letter_tokenizer_init(int argc, VALUE *argv, VALUE self) {
732
+ VALUE rstr;
733
+ rb_scan_args(argc, argv, "1", &rstr);
734
+ FrtTokenStream *ts;
735
+ TypedData_Get_Struct(self, FrtTokenStream, &frb_token_stream_t, ts);
736
+ ts->reset(ts, rs2s(rstr), rb_enc_get(rstr));
737
+ ts->rts = self;
738
+ return self;
860
739
  }
861
740
 
862
741
  /*
863
742
  * call-seq:
864
- * AsciiWhiteSpaceTokenizer.new() -> tokenizer
743
+ * WhiteSpaceTokenizer.new -> tokenizer
865
744
  *
866
- * Create a new AsciiWhiteSpaceTokenizer
745
+ * Create a new WhiteSpaceTokenizer.
867
746
  */
868
- static VALUE
869
- frb_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
870
- {
871
- return get_wrapped_ts(self, rstr, frt_whitespace_tokenizer_new());
747
+ static VALUE frb_whitespace_tokenizer_alloc(VALUE rclass) {
748
+ FrtTokenStream *ts = frt_whitespace_tokenizer_new();
749
+ return TypedData_Wrap_Struct(rclass, &frb_token_stream_t, ts);
872
750
  }
873
751
 
874
- /*
875
- * call-seq:
876
- * WhiteSpaceTokenizer.new(lower = true) -> tokenizer
877
- *
878
- * Create a new WhiteSpaceTokenizer which optionally downcases tokens.
879
- * Downcasing is done according the current locale.
880
- *
881
- * lower:: set to false if you don't wish to downcase tokens
882
- */
883
- static VALUE
884
- frb_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
885
- {
886
- TS_ARGS(false);
887
- #if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
888
- if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
889
- #endif
890
- return get_wrapped_ts(self, rstr, frt_mb_whitespace_tokenizer_new(lower));
752
+ static VALUE frb_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self) {
753
+ VALUE rstr;
754
+ rb_scan_args(argc, argv, "1", &rstr);
755
+ FrtTokenStream *ts;
756
+ TypedData_Get_Struct(self, FrtTokenStream, &frb_token_stream_t, ts);
757
+ ts->reset(ts, rs2s(rstr), rb_enc_get(rstr));
758
+ ts->rts = self;
759
+ return self;
891
760
  }
892
761
 
893
762
  /*
894
763
  * call-seq:
895
- * AsciiStandardTokenizer.new() -> tokenizer
764
+ * StandardTokenizer.new -> tokenizer
896
765
  *
897
- * Create a new AsciiStandardTokenizer
766
+ * Create a new StandardTokenizer.
898
767
  */
899
- static VALUE
900
- frb_a_standard_tokenizer_init(VALUE self, VALUE rstr)
901
- {
902
- return get_wrapped_ts(self, rstr, frt_standard_tokenizer_new());
768
+ static VALUE frb_standard_tokenizer_alloc(VALUE rclass) {
769
+ FrtTokenStream *ts = frt_standard_tokenizer_new();
770
+ return TypedData_Wrap_Struct(rclass, &frb_token_stream_t, ts);
903
771
  }
904
772
 
905
- /*
906
- * call-seq:
907
- * StandardTokenizer.new(lower = true) -> tokenizer
908
- *
909
- * Create a new StandardTokenizer which optionally downcases tokens.
910
- * Downcasing is done according the current locale.
911
- *
912
- * lower:: set to false if you don't wish to downcase tokens
913
- */
914
- static VALUE
915
- frb_standard_tokenizer_init(VALUE self, VALUE rstr)
916
- {
917
- #if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
918
- if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
919
- #endif
920
- return get_wrapped_ts(self, rstr, frt_mb_standard_tokenizer_new());
773
+ static VALUE frb_standard_tokenizer_init(int argc, VALUE *argv, VALUE self) {
774
+ VALUE rstr;
775
+ rb_scan_args(argc, argv, "1", &rstr);
776
+ FrtTokenStream *ts;
777
+ TypedData_Get_Struct(self, FrtTokenStream, &frb_token_stream_t, ts);
778
+ ts->reset(ts, rs2s(rstr), rb_enc_get(rstr));
779
+ ts->rts = self;
780
+ return self;
921
781
  }
922
782
 
923
783
  /****************************************************************************
924
784
  * Filters
925
785
  ****************************************************************************/
926
-
927
-
928
- /*
929
- * call-seq:
930
- * AsciiLowerCaseFilter.new(token_stream) -> token_stream
931
- *
932
- * Create an AsciiLowerCaseFilter which normalizes a token's text to
933
- * lowercase but only for ASCII characters. For other characters use
934
- * LowerCaseFilter.
935
- */
936
- static VALUE
937
- frb_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
938
- {
939
- FrtTokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
940
- ts = frt_lowercase_filter_new(ts);
941
- object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
942
-
943
- Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
944
- object_add(ts, self);
945
- return self;
946
- }
947
-
948
786
  /*
949
787
  * call-seq:
950
788
  * LowerCaseFilter.new(token_stream) -> token_stream
@@ -952,18 +790,33 @@ frb_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
952
790
  * Create a LowerCaseFilter which normalizes a token's text to
953
791
  * lowercase based on the current locale.
954
792
  */
955
- static VALUE
956
- frb_lowercase_filter_init(VALUE self, VALUE rsub_ts)
957
- {
958
- FrtTokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
959
- #if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
960
- if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
961
- #endif
962
- ts = frt_mb_lowercase_filter_new(ts);
963
- object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
964
793
 
965
- Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
966
- object_add(ts, self);
794
+ const rb_data_type_t frb_lowercase_filter_t = {
795
+ .wrap_struct_name = "FrbLowercaseFilter",
796
+ .function = {
797
+ .dmark = frb_tf_mark,
798
+ .dfree = frb_tf_free,
799
+ .dsize = frb_tf_size,
800
+ .dcompact = NULL,
801
+ .reserved = {0},
802
+ },
803
+ .parent = NULL,
804
+ .data = NULL,
805
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
806
+ };
807
+
808
+ static VALUE frb_lowercase_filter_alloc(VALUE rclass) {
809
+ FrtTokenStream *tf = frt_lowercase_filter_alloc();
810
+ return TypedData_Wrap_Struct(rclass, &frb_lowercase_filter_t, tf);
811
+ }
812
+
813
+ static VALUE frb_lowercase_filter_init(VALUE self, VALUE rsub_ts) {
814
+ FrtTokenStream *sub_ts = frb_get_cwrapped_rts(rsub_ts);
815
+ FrtTokenStream *ts;
816
+ TypedData_Get_Struct(self, FrtTokenStream, &frb_lowercase_filter_t, ts);
817
+ frt_lowercase_filter_init(ts, sub_ts);
818
+ TkFilt(ts)->sub_ts->rts = rsub_ts;
819
+ ts->rts = self;
967
820
  return self;
968
821
  }
969
822
 
@@ -977,15 +830,38 @@ frb_lowercase_filter_init(VALUE self, VALUE rsub_ts)
977
830
  * search for "e-mail", "email" and "mail" will all match. This filter is
978
831
  * used by default by the StandardAnalyzer.
979
832
  */
980
- static VALUE
981
- frb_hyphen_filter_init(VALUE self, VALUE rsub_ts)
982
- {
983
- FrtTokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
984
- ts = frt_hyphen_filter_new(ts);
985
- object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
986
833
 
987
- Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
988
- object_add(ts, self);
834
+ static size_t frb_hyphen_filter_size(const void *p) {
835
+ return sizeof(FrtHyphenFilter);
836
+ (void)p;
837
+ }
838
+
839
+ const rb_data_type_t frb_hyphen_filter_t = {
840
+ .wrap_struct_name = "FrbHyphenFilter",
841
+ .function = {
842
+ .dmark = frb_tf_mark,
843
+ .dfree = frb_tf_free,
844
+ .dsize = frb_hyphen_filter_size,
845
+ .dcompact = NULL,
846
+ .reserved = {0},
847
+ },
848
+ .parent = NULL,
849
+ .data = NULL,
850
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
851
+ };
852
+
853
+ static VALUE frb_hyphen_filter_alloc(VALUE rclass) {
854
+ FrtTokenStream *hf = frt_hyphen_filter_alloc();
855
+ return TypedData_Wrap_Struct(rclass, &frb_hyphen_filter_t, hf);
856
+ }
857
+
858
+ static VALUE frb_hyphen_filter_init(VALUE self, VALUE rsub_ts) {
859
+ FrtTokenStream *sub_ts = frb_get_cwrapped_rts(rsub_ts);
860
+ FrtTokenStream *ts;
861
+ TypedData_Get_Struct(self, FrtTokenStream, &frb_hyphen_filter_t, ts);
862
+ frt_hyphen_filter_init(ts, sub_ts);
863
+ TkFilt(ts)->sub_ts->rts = rsub_ts;
864
+ ts->rts = self;
989
865
  return self;
990
866
  }
991
867
 
@@ -1002,31 +878,54 @@ frb_hyphen_filter_init(VALUE self, VALUE rsub_ts)
1002
878
  * defaults to a list of English stop-words. The
1003
879
  * Ferret::Analysis contains a number of stop-word lists.
1004
880
  */
1005
- static VALUE
1006
- frb_stop_filter_init(int argc, VALUE *argv, VALUE self)
1007
- {
881
+
882
+ static size_t frb_stop_filter_size(const void *p) {
883
+ return sizeof(FrtStopFilter);
884
+ (void)p;
885
+ }
886
+
887
+ const rb_data_type_t frb_stop_filter_t = {
888
+ .wrap_struct_name = "FrbStopFilter",
889
+ .function = {
890
+ .dmark = frb_tf_mark,
891
+ .dfree = frb_tf_free,
892
+ .dsize = frb_stop_filter_size,
893
+ .dcompact = NULL,
894
+ .reserved = {0},
895
+ },
896
+ .parent = NULL,
897
+ .data = NULL,
898
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
899
+ };
900
+
901
+ static VALUE frb_stop_filter_alloc(VALUE rclass) {
902
+ FrtTokenStream *sf = frt_stop_filter_alloc();
903
+ return TypedData_Wrap_Struct(rclass, &frb_stop_filter_t, sf);
904
+ }
905
+
906
+ static VALUE frb_stop_filter_init(int argc, VALUE *argv, VALUE self) {
1008
907
  VALUE rsub_ts, rstop_words;
908
+ FrtTokenStream *sub_ts;
1009
909
  FrtTokenStream *ts;
1010
910
  rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
1011
- ts = frb_get_cwrapped_rts(rsub_ts);
911
+ sub_ts = frb_get_cwrapped_rts(rsub_ts);
912
+
913
+ TypedData_Get_Struct(self, FrtTokenStream, &frb_stop_filter_t, ts);
1012
914
  if (rstop_words != Qnil) {
1013
915
  char **stop_words = get_stopwords(rstop_words);
1014
- ts = frt_stop_filter_new_with_words(ts, (const char **)stop_words);
1015
-
916
+ frt_stop_filter_init(ts, sub_ts);
917
+ frt_stop_filter_set_words(ts, (const char **)stop_words);
1016
918
  free(stop_words);
1017
919
  } else {
1018
- ts = frt_stop_filter_new(ts);
920
+ frt_stop_filter_init(ts, sub_ts);
1019
921
  }
1020
- object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
922
+ TkFilt(ts)->sub_ts->rts = rsub_ts;
1021
923
 
1022
- Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
1023
- object_add(ts, self);
924
+ ts->rts = self;
1024
925
  return self;
1025
926
  }
1026
927
 
1027
- static void frb_add_mapping_i(FrtTokenStream *mf, VALUE from,
1028
- const char *to)
1029
- {
928
+ static void frb_add_mapping_i(FrtTokenStream *mf, VALUE from, const char *to) {
1030
929
  switch (TYPE(from)) {
1031
930
  case T_STRING:
1032
931
  frt_mapping_filter_add(mf, rs2s(from), to);
@@ -1042,8 +941,7 @@ static void frb_add_mapping_i(FrtTokenStream *mf, VALUE from,
1042
941
  }
1043
942
  }
1044
943
 
1045
- static int frb_add_mappings_i(VALUE key, VALUE value, VALUE arg)
1046
- {
944
+ static int frb_add_mappings_i(VALUE key, VALUE value, VALUE arg) {
1047
945
  if (key == Qundef) {
1048
946
  return ST_CONTINUE;
1049
947
  } else {
@@ -1067,8 +965,7 @@ static int frb_add_mappings_i(VALUE key, VALUE value, VALUE arg)
1067
965
  for (i = RARRAY_LEN(key) - 1; i >= 0; i--) {
1068
966
  frb_add_mapping_i(mf, RARRAY_PTR(key)[i], to);
1069
967
  }
1070
- }
1071
- else {
968
+ } else {
1072
969
  frb_add_mapping_i(mf, key, to);
1073
970
  }
1074
971
  }
@@ -1100,18 +997,40 @@ static int frb_add_mappings_i(VALUE key, VALUE value, VALUE arg)
1100
997
  * ['è','é','ê','ë','ē','ę'] => 'e'
1101
998
  * })
1102
999
  */
1103
- static VALUE
1104
- frb_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
1105
- {
1000
+
1001
+ static size_t frb_mapping_filter_size(const void *p) {
1002
+ return sizeof(FrtMappingFilter);
1003
+ (void)p;
1004
+ }
1005
+
1006
+ const rb_data_type_t frb_mapping_filter_t = {
1007
+ .wrap_struct_name = "FrbMappingFilter",
1008
+ .function = {
1009
+ .dmark = frb_tf_mark,
1010
+ .dfree = frb_tf_free,
1011
+ .dsize = frb_mapping_filter_size,
1012
+ .dcompact = NULL,
1013
+ .reserved = {0},
1014
+ },
1015
+ .parent = NULL,
1016
+ .data = NULL,
1017
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
1018
+ };
1019
+
1020
+ static VALUE frb_mapping_filter_alloc(VALUE rclass) {
1021
+ FrtTokenStream *mf = frt_mapping_filter_alloc();
1022
+ return TypedData_Wrap_Struct(rclass, &frb_mapping_filter_t, mf);
1023
+ }
1024
+
1025
+ static VALUE frb_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping) {
1106
1026
  FrtTokenStream *ts;
1107
- ts = frb_get_cwrapped_rts(rsub_ts);
1108
- ts = frt_mapping_filter_new(ts);
1027
+ FrtTokenStream *sub_ts = frb_get_cwrapped_rts(rsub_ts);
1028
+ TypedData_Get_Struct(self, FrtTokenStream, &frb_mapping_filter_t, ts);
1029
+ frt_mapping_filter_init(ts, sub_ts);
1109
1030
  rb_hash_foreach(mapping, frb_add_mappings_i, (VALUE)ts);
1110
1031
  frt_mulmap_compile(((FrtMappingFilter *)ts)->mapper);
1111
- object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1112
-
1113
- Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
1114
- object_add(ts, self);
1032
+ TkFilt(ts)->sub_ts->rts = rsub_ts;
1033
+ ts->rts = self;
1115
1034
  return self;
1116
1035
  }
1117
1036
 
@@ -1128,29 +1047,49 @@ frb_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
1128
1047
  *
1129
1048
  * token_stream:: TokenStream to be filtered
1130
1049
  * algorithm:: The algorithm (or language) to use
1131
- * encoding:: The encoding of the data (default: "UTF-8")
1132
1050
  */
1133
- static VALUE
1134
- frb_stem_filter_init(int argc, VALUE *argv, VALUE self)
1135
- {
1136
- VALUE rsub_ts, ralgorithm, rcharenc;
1051
+
1052
+ static size_t frb_stem_filter_size(const void *p) {
1053
+ return sizeof(FrtStemFilter);
1054
+ (void)p;
1055
+ }
1056
+
1057
+ const rb_data_type_t frb_stem_filter_t = {
1058
+ .wrap_struct_name = "FrbStemFilter",
1059
+ .function = {
1060
+ .dmark = frb_tf_mark,
1061
+ .dfree = frb_tf_free,
1062
+ .dsize = frb_stem_filter_size,
1063
+ .dcompact = NULL,
1064
+ .reserved = {0},
1065
+ },
1066
+ .parent = NULL,
1067
+ .data = NULL,
1068
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
1069
+ };
1070
+
1071
+ static VALUE frb_stem_filter_alloc(VALUE rclass) {
1072
+ FrtTokenStream *sf = frt_stem_filter_alloc();
1073
+ return TypedData_Wrap_Struct(rclass, &frb_stem_filter_t, sf);
1074
+ }
1075
+
1076
+ static VALUE frb_stem_filter_init(int argc, VALUE *argv, VALUE self) {
1077
+ VALUE rsub_ts, ralgorithm;
1137
1078
  const char *algorithm = "english";
1138
- char *charenc = NULL;
1079
+ FrtTokenStream *sub_ts;
1139
1080
  FrtTokenStream *ts;
1140
- rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
1141
- ts = frb_get_cwrapped_rts(rsub_ts);
1142
- switch (argc) {
1143
- case 3: charenc = rs2s(rb_obj_as_string(rcharenc));
1144
- case 2: algorithm = rs2s(rb_obj_as_string(ralgorithm));
1145
- }
1146
- ts = frt_stem_filter_new(ts, algorithm, charenc);
1147
- object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1081
+ TypedData_Get_Struct(self, FrtTokenStream, &frb_stem_filter_t, ts);
1082
+
1083
+ rb_scan_args(argc, argv, "11", &rsub_ts, &ralgorithm);
1084
+ sub_ts = frb_get_cwrapped_rts(rsub_ts);
1085
+ if (argc == 2)
1086
+ algorithm = rs2s(rb_obj_as_string(ralgorithm));
1148
1087
 
1149
- Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
1150
- object_add(ts, self);
1088
+ frt_stem_filter_init(ts, sub_ts, algorithm);
1089
+ TkFilt(ts)->sub_ts->rts = rsub_ts;
1090
+ ts->rts = self;
1151
1091
  if (((FrtStemFilter *)ts)->stemmer == NULL) {
1152
- rb_raise(rb_eArgError, "No stemmer could be found with the encoding "
1153
- "%s and the language %s", charenc, algorithm);
1092
+ rb_raise(rb_eArgError, "No stemmer could be found for the %s language.", algorithm);
1154
1093
  }
1155
1094
  return self;
1156
1095
  }
@@ -1165,79 +1104,82 @@ frb_stem_filter_init(int argc, VALUE *argv, VALUE self)
1165
1104
  * CWrappedAnalyzer Methods
1166
1105
  ****************************************************************************/
1167
1106
 
1168
- #define GET_A(a, self) Data_Get_Struct(self, FrtAnalyzer, a)
1169
-
1170
- #define CWA(analyzer) ((CWrappedAnalyzer *)(analyzer))
1171
- typedef struct CWrappedAnalyzer
1172
- {
1107
+ typedef struct CWrappedAnalyzer {
1173
1108
  FrtAnalyzer super;
1174
1109
  VALUE ranalyzer;
1175
1110
  } CWrappedAnalyzer;
1176
1111
 
1177
- static void
1178
- cwa_destroy_i(FrtAnalyzer *a)
1179
- {
1112
+ #define CWA(analyzer) ((CWrappedAnalyzer *)(analyzer))
1113
+
1114
+ static void frb_analyzer_free(void *p) {
1115
+ frt_a_deref((FrtAnalyzer *)p);
1116
+ }
1117
+
1118
+ static size_t frb_analyzer_size(const void *p) {
1119
+ return sizeof(CWrappedAnalyzer);
1120
+ (void)p;
1121
+ }
1122
+
1123
+ const rb_data_type_t frb_analyzer_t = {
1124
+ .wrap_struct_name = "FrbAnalyzer",
1125
+ .function = {
1126
+ .dmark = NULL,
1127
+ .dfree = frb_analyzer_free,
1128
+ .dsize = frb_analyzer_size,
1129
+ .dcompact = NULL,
1130
+ .reserved = {0},
1131
+ },
1132
+ .parent = NULL,
1133
+ .data = NULL,
1134
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
1135
+ };
1136
+
1137
+ static void cwa_destroy_i(FrtAnalyzer *a) {
1180
1138
  rb_hash_delete(object_space, ((VALUE)a)|1);
1181
1139
  free(a);
1182
1140
  }
1183
1141
 
1184
- static FrtTokenStream *
1185
- cwa_get_ts(FrtAnalyzer *a, FrtSymbol field, char *text)
1186
- {
1187
- VALUE rts = rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
1188
- rb_str_new_cstr(rb_id2name(field)), rb_str_new_cstr(text));
1142
+ static FrtTokenStream *cwa_get_ts(FrtAnalyzer *a, ID field, char *text, rb_encoding *encoding) {
1143
+ VALUE rstr = rb_str_new_cstr(text);
1144
+ rb_enc_associate(rstr, encoding);
1145
+ VALUE rts = rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2, rb_str_new_cstr(rb_id2name(field)), rstr);
1189
1146
  return frb_get_cwrapped_rts(rts);
1190
1147
  }
1191
1148
 
1192
- FrtAnalyzer *
1193
- frb_get_cwrapped_analyzer(VALUE ranalyzer)
1194
- {
1149
+ FrtAnalyzer *frb_get_cwrapped_analyzer(VALUE ranalyzer) {
1195
1150
  FrtAnalyzer *a = NULL;
1196
1151
  if (frb_is_cclass(ranalyzer) && DATA_PTR(ranalyzer)) {
1197
- Data_Get_Struct(ranalyzer, FrtAnalyzer, a);
1152
+ // TypedData_Get_Struct(ranalyzer, FrtAnalyzer, &frb_analyzer_t, a);
1153
+ a = DATA_PTR(ranalyzer);
1198
1154
  FRT_REF(a);
1199
- }
1200
- else {
1155
+ } else {
1201
1156
  a = (FrtAnalyzer *)frt_ecalloc(sizeof(CWrappedAnalyzer));
1202
1157
  a->destroy_i = &cwa_destroy_i;
1203
1158
  a->get_ts = &cwa_get_ts;
1204
1159
  a->ref_cnt = 1;
1205
- ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
1160
+ CWA(a)->ranalyzer = ranalyzer;
1206
1161
  /* prevent from being garbage collected */
1207
1162
  rb_hash_aset(object_space, ((VALUE)a)|1, ranalyzer);
1208
1163
  }
1209
1164
  return a;
1210
1165
  }
1211
1166
 
1212
- static void
1213
- frb_analyzer_free(FrtAnalyzer *a)
1214
- {
1215
- object_del(a);
1216
- frt_a_deref(a);
1217
- }
1218
-
1219
- VALUE
1220
- frb_get_analyzer(FrtAnalyzer *a)
1221
- {
1167
+ VALUE frb_get_analyzer(FrtAnalyzer *a) {
1222
1168
  VALUE self = Qnil;
1223
1169
  if (a) {
1224
- self = object_get(a);
1225
- if (self == Qnil) {
1226
- self = Data_Wrap_Struct(cAnalyzer, NULL, &frb_analyzer_free, a);
1170
+ self = a->ranalyzer;
1171
+ if (self == 0 || self == Qnil) {
1172
+ self = TypedData_Wrap_Struct(cAnalyzer, &frb_analyzer_t, a);
1227
1173
  FRT_REF(a);
1228
- object_add(a, self);
1174
+ a->ranalyzer = self;
1229
1175
  }
1230
1176
  }
1231
1177
  return self;
1232
1178
  }
1233
1179
 
1234
- VALUE
1235
- get_rb_ts_from_a(FrtAnalyzer *a, VALUE rfield, VALUE rstring)
1236
- {
1237
- FrtTokenStream *ts = frt_a_get_ts(a, frb_field(rfield), rs2s(rstring));
1238
-
1180
+ VALUE get_rb_ts_from_a(FrtAnalyzer *a, VALUE rfield, VALUE rstring) {
1181
+ FrtTokenStream *ts = frt_a_get_ts(a, frb_field(rfield), rs2s(rstring), rb_enc_get(rstring));
1239
1182
  /* Make sure that there is no entry already */
1240
- object_set(&ts->text, rstring);
1241
1183
  return get_rb_token_stream(ts);
1242
1184
  }
1243
1185
 
@@ -1252,16 +1194,11 @@ get_rb_ts_from_a(FrtAnalyzer *a, VALUE rfield, VALUE rstring)
1252
1194
  * field_name:: name of the field to be tokenized
1253
1195
  * input:: data from the field to be tokenized
1254
1196
  */
1255
- static VALUE
1256
- frb_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1257
- {
1197
+ static VALUE frb_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring) {
1258
1198
  /* NOTE: Any changes made to this method may also need to be applied to
1259
1199
  * frb_re_analyzer_token_stream */
1260
- FrtAnalyzer *a;
1261
- GET_A(a, self);
1262
-
1200
+ FrtAnalyzer *a = DATA_PTR(self);
1263
1201
  StringValue(rstring);
1264
-
1265
1202
  return get_rb_ts_from_a(a, rfield, rstring);
1266
1203
  }
1267
1204
 
@@ -1271,27 +1208,24 @@ VALUE rlower;\
1271
1208
  rb_scan_args(argc, argv, "01", &rlower);\
1272
1209
  lower = (argc ? RTEST(rlower) : dflt)
1273
1210
 
1274
- /*
1275
- * call-seq:
1276
- * AsciiWhiteSpaceAnalyzer.new(lower = false) -> analyzer
1277
- *
1278
- * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
1279
- * but can optionally leave case as is. Lowercasing will only be done to
1280
- * ASCII characters.
1281
- *
1282
- * lower:: set to false if you don't want the field's tokens to be downcased
1283
- */
1284
- static VALUE
1285
- frb_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1286
- {
1211
+ static VALUE frb_analyzer_alloc(VALUE rclass) {
1212
+ FrtAnalyzer *a = frt_letter_analyzer_alloc();
1213
+ return TypedData_Wrap_Struct(rclass, &frb_analyzer_t, a);
1214
+ }
1215
+
1216
+ static VALUE frb_analyzer_init(int argc, VALUE *argv, VALUE self) {
1287
1217
  FrtAnalyzer *a;
1288
- GET_LOWER(false);
1289
- a = frt_whitespace_analyzer_new(lower);
1290
- Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
1291
- object_add(a, self);
1218
+ GET_LOWER(true);
1219
+ TypedData_Get_Struct(self, FrtAnalyzer, &frb_analyzer_t, a);
1220
+ frt_letter_analyzer_init(a, lower);
1221
+ a->ranalyzer = self;
1292
1222
  return self;
1293
1223
  }
1294
1224
 
1225
+
1226
+ /*****************************************************************************/
1227
+ /*** WhiteSpaceAnalyzer ******************************************************/
1228
+ /*****************************************************************************/
1295
1229
  /*
1296
1230
  * call-seq:
1297
1231
  * WhiteSpaceAnalyzer.new(lower = false) -> analyzer
@@ -1302,38 +1236,18 @@ frb_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1302
1236
  *
1303
1237
  * lower:: set to false if you don't want the field's tokens to be downcased
1304
1238
  */
1305
- static VALUE
1306
- frb_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1307
- {
1308
- FrtAnalyzer *a;
1309
- GET_LOWER(false);
1310
- #if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
1311
- if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
1312
- #endif
1313
- a = frt_mb_whitespace_analyzer_new(lower);
1314
- Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
1315
- object_add(a, self);
1316
- return self;
1239
+
1240
+ static VALUE frb_whitespace_analyzer_alloc(VALUE rclass) {
1241
+ FrtAnalyzer *a = frt_whitespace_analyzer_alloc();
1242
+ return TypedData_Wrap_Struct(rclass, &frb_analyzer_t, a);
1317
1243
  }
1318
1244
 
1319
- /*
1320
- * call-seq:
1321
- * AsciiLetterAnalyzer.new(lower = true) -> analyzer
1322
- *
1323
- * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
1324
- * but can optionally leave case as is. Lowercasing will only be done to
1325
- * ASCII characters.
1326
- *
1327
- * lower:: set to false if you don't want the field's tokens to be downcased
1328
- */
1329
- static VALUE
1330
- frb_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
1331
- {
1245
+ static VALUE frb_whitespace_analyzer_init(int argc, VALUE *argv, VALUE self) {
1332
1246
  FrtAnalyzer *a;
1333
- GET_LOWER(true);
1334
- a = frt_letter_analyzer_new(lower);
1335
- Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
1336
- object_add(a, self);
1247
+ GET_LOWER(false);
1248
+ TypedData_Get_Struct(self, FrtAnalyzer, &frb_analyzer_t, a);
1249
+ frt_whitespace_analyzer_init(a, lower);
1250
+ a->ranalyzer = self;
1337
1251
  return self;
1338
1252
  }
1339
1253
 
@@ -1347,17 +1261,18 @@ frb_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
1347
1261
  *
1348
1262
  * lower:: set to false if you don't want the field's tokens to be downcased
1349
1263
  */
1350
- static VALUE
1351
- frb_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
1352
- {
1264
+
1265
+ static VALUE frb_letter_analyzer_alloc(VALUE rclass) {
1266
+ FrtAnalyzer *a = frt_letter_analyzer_alloc();
1267
+ return TypedData_Wrap_Struct(rclass, &frb_analyzer_t, a);
1268
+ }
1269
+
1270
+ static VALUE frb_letter_analyzer_init(int argc, VALUE *argv, VALUE self) {
1353
1271
  FrtAnalyzer *a;
1354
1272
  GET_LOWER(true);
1355
- #if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
1356
- if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
1357
- #endif
1358
- a = frt_mb_letter_analyzer_new(lower);
1359
- Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
1360
- object_add(a, self);
1273
+ TypedData_Get_Struct(self, FrtAnalyzer, &frb_analyzer_t, a);
1274
+ frt_letter_analyzer_init(a, lower);
1275
+ a->ranalyzer = self;
1361
1276
  return self;
1362
1277
  }
1363
1278
 
@@ -1374,39 +1289,6 @@ get_rstopwords(const char **stop_words)
1374
1289
  return rstopwords;
1375
1290
  }
1376
1291
 
1377
- /*
1378
- * call-seq:
1379
- * AsciiStandardAnalyzer.new(lower = true, stop_words = FRT_FULL_ENGLISH_STOP_WORDS)
1380
- * -> analyzer
1381
- *
1382
- * Create a new AsciiStandardAnalyzer which downcases tokens by default but
1383
- * can optionally leave case as is. Lowercasing will be done based on the
1384
- * current locale. You can also set the list of stop-words to be used by the
1385
- * StopFilter.
1386
- *
1387
- * lower:: set to false if you don't want the field's tokens to be downcased
1388
- * stop_words:: list of stop-words to pass to the StopFilter
1389
- */
1390
- static VALUE
1391
- frb_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1392
- {
1393
- bool lower;
1394
- VALUE rlower, rstop_words;
1395
- FrtAnalyzer *a;
1396
- rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
1397
- lower = ((rlower == Qnil) ? true : RTEST(rlower));
1398
- if (rstop_words != Qnil) {
1399
- char **stop_words = get_stopwords(rstop_words);
1400
- a = frt_standard_analyzer_new_with_words((const char **)stop_words, lower);
1401
- free(stop_words);
1402
- } else {
1403
- a = frt_standard_analyzer_new(lower);
1404
- }
1405
- Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
1406
- object_add(a, self);
1407
- return self;
1408
- }
1409
-
1410
1292
  /*
1411
1293
  * call-seq:
1412
1294
  * StandardAnalyzer.new(stop_words = FRT_FULL_ENGLISH_STOP_WORDS, lower=true)
@@ -1420,39 +1302,37 @@ frb_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1420
1302
  * lower:: set to false if you don't want the field's tokens to be downcased
1421
1303
  * stop_words:: list of stop-words to pass to the StopFilter
1422
1304
  */
1423
- static VALUE
1424
- frb_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1425
- {
1305
+ static VALUE frb_standard_analyzer_alloc(VALUE rclass) {
1306
+ FrtAnalyzer *a = frt_standard_analyzer_alloc();
1307
+ return TypedData_Wrap_Struct(rclass, &frb_analyzer_t, a);
1308
+ }
1309
+
1310
+ static VALUE frb_standard_analyzer_init(int argc, VALUE *argv, VALUE self) {
1426
1311
  bool lower;
1427
1312
  VALUE rlower, rstop_words;
1428
1313
  FrtAnalyzer *a;
1429
- #if !defined POSH_OS_WIN32 && !defined POSH_OS_WIN64
1430
- if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
1431
- #endif
1432
1314
  rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
1433
1315
  lower = ((rlower == Qnil) ? true : RTEST(rlower));
1316
+ TypedData_Get_Struct(self, FrtAnalyzer, &frb_analyzer_t, a);
1434
1317
  if (rstop_words != Qnil) {
1435
1318
  char **stop_words = get_stopwords(rstop_words);
1436
- a = frt_mb_standard_analyzer_new_with_words((const char **)stop_words, lower);
1319
+ frt_standard_analyzer_init(a, lower, (const char **)stop_words);
1437
1320
  free(stop_words);
1438
1321
  } else {
1439
- a = frt_mb_standard_analyzer_new(lower);
1322
+ frt_standard_analyzer_init(a, lower, FRT_FULL_ENGLISH_STOP_WORDS);
1440
1323
  }
1441
- Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
1442
- object_add(a, self);
1324
+ a->ranalyzer = self;
1443
1325
  return self;
1444
1326
  }
1445
1327
 
1446
- static void
1447
- frb_h_mark_values_i(void *key, void *value, void *arg)
1448
- {
1449
- frb_gc_mark(value);
1328
+ static void frb_h_mark_values_i(void *key, void *value, void *arg) {
1329
+ if (((FrtAnalyzer *)value)->ranalyzer)
1330
+ rb_gc_mark(((FrtAnalyzer *)value)->ranalyzer);
1450
1331
  }
1451
1332
 
1452
- static void
1453
- frb_pfa_mark(void *p)
1454
- {
1455
- frb_gc_mark(PFA(p)->default_a);
1333
+ static void frb_pfa_mark(void *p) {
1334
+ if (PFA(p)->default_a->ranalyzer)
1335
+ rb_gc_mark(PFA(p)->default_a->ranalyzer);
1456
1336
  frt_h_each(PFA(p)->dict, &frb_h_mark_values_i, NULL);
1457
1337
  }
1458
1338
 
@@ -1468,13 +1348,37 @@ frb_pfa_mark(void *p)
1468
1348
  * default_analyzer:: analyzer to be used on fields that aren't otherwise
1469
1349
  * specified
1470
1350
  */
1471
- static VALUE
1472
- frb_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
1473
- {
1351
+
1352
+ static size_t frb_per_field_analyzer_size(const void *p) {
1353
+ return sizeof(FrtPerFieldAnalyzer);
1354
+ (void)p;
1355
+ }
1356
+
1357
+ const rb_data_type_t frb_per_field_analyzer_t = {
1358
+ .wrap_struct_name = "FrbPerFieldAnalyzer",
1359
+ .function = {
1360
+ .dmark = frb_pfa_mark,
1361
+ .dfree = frb_analyzer_free,
1362
+ .dsize = frb_per_field_analyzer_size,
1363
+ .dcompact = NULL,
1364
+ .reserved = {0},
1365
+ },
1366
+ .parent = NULL,
1367
+ .data = NULL,
1368
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
1369
+ };
1370
+
1371
+ static VALUE frb_per_field_analyzer_alloc(VALUE rclass) {
1372
+ FrtAnalyzer *a = frt_per_field_analyzer_alloc();
1373
+ return TypedData_Wrap_Struct(rclass, &frb_per_field_analyzer_t, a);
1374
+ }
1375
+
1376
+ static VALUE frb_per_field_analyzer_init(VALUE self, VALUE ranalyzer) {
1474
1377
  FrtAnalyzer *def = frb_get_cwrapped_analyzer(ranalyzer);
1475
- FrtAnalyzer *a = frt_per_field_analyzer_new(def);
1476
- Frt_Wrap_Struct(self, &frb_pfa_mark, &frb_analyzer_free, a);
1477
- object_add(a, self);
1378
+ FrtAnalyzer *a;
1379
+ TypedData_Get_Struct(self, FrtAnalyzer, &frb_per_field_analyzer_t, a);
1380
+ frt_per_field_analyzer_init(a, def);
1381
+ a->ranalyzer = self;
1478
1382
  return self;
1479
1383
  }
1480
1384
 
@@ -1489,11 +1393,9 @@ frb_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
1489
1393
  * field_name:: field we wish to set the analyzer for
1490
1394
  * analyzer:: analyzer to be used on +field_name+
1491
1395
  */
1492
- static VALUE
1493
- frb_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
1494
- {
1396
+ static VALUE frb_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer) {
1495
1397
  FrtAnalyzer *pfa, *a;
1496
- Data_Get_Struct(self, FrtAnalyzer, pfa);
1398
+ TypedData_Get_Struct(self, FrtAnalyzer, &frb_per_field_analyzer_t, pfa);
1497
1399
  a = frb_get_cwrapped_analyzer(ranalyzer);
1498
1400
 
1499
1401
  frt_pfa_add_field(pfa, frb_field(rfield), a);
@@ -1510,12 +1412,10 @@ frb_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
1510
1412
  * field_name:: name of the field to be tokenized
1511
1413
  * input:: data from the field to be tokenized
1512
1414
  */
1513
- static VALUE
1514
- frb_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1515
- {
1415
+ static VALUE frb_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring) {
1516
1416
  FrtAnalyzer *pfa, *a;
1517
- FrtSymbol field = frb_field(rfield);
1518
- GET_A(pfa, self);
1417
+ ID field = frb_field(rfield);
1418
+ TypedData_Get_Struct(self, FrtAnalyzer, &frb_per_field_analyzer_t, pfa);
1519
1419
 
1520
1420
  StringValue(rstring);
1521
1421
  a = (FrtAnalyzer *)frt_h_get(PFA(pfa)->dict, (void *)field);
@@ -1523,29 +1423,46 @@ frb_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1523
1423
  a = PFA(pfa)->default_a;
1524
1424
  }
1525
1425
  if (a->get_ts == cwa_get_ts) {
1426
+ VALUE rstr = rb_str_new_cstr(rs2s(rstring));
1427
+ rb_enc_associate(rstr, rb_enc_get(rstring));
1526
1428
  return rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
1527
- rb_str_new_cstr(rb_id2name(field)), rb_str_new_cstr(rs2s(rstring)));
1528
- }
1529
- else {
1429
+ rb_str_new_cstr(rb_id2name(field)), rstr);
1430
+ } else {
1530
1431
  return get_rb_ts_from_a(a, rfield, rstring);
1531
1432
  }
1532
1433
  }
1533
1434
 
1534
1435
  /*** RegExpAnalyzer ***/
1535
1436
 
1536
- static void
1537
- frb_re_analyzer_mark(FrtAnalyzer *a)
1538
- {
1539
- frb_gc_mark(a->current_ts);
1437
+ static void frb_re_analyzer_mark(void *p) {
1438
+ if (((FrtAnalyzer *)p)->current_ts->rts)
1439
+ rb_gc_mark(((FrtAnalyzer *)p)->current_ts->rts);
1540
1440
  }
1541
1441
 
1542
- static void
1543
- re_analyzer_destroy_i(FrtAnalyzer *a)
1544
- {
1442
+ static void re_analyzer_destroy_i(FrtAnalyzer *a) {
1545
1443
  frt_ts_deref(a->current_ts);
1546
1444
  free(a);
1547
1445
  }
1548
1446
 
1447
+ const rb_data_type_t frb_reg_exp_analyzer_t = {
1448
+ .wrap_struct_name = "FrbRegExpAnalyzer",
1449
+ .function = {
1450
+ .dmark = frb_re_analyzer_mark,
1451
+ .dfree = frb_analyzer_free,
1452
+ .dsize = frb_analyzer_size,
1453
+ .dcompact = 0,
1454
+ .reserved = {0},
1455
+ },
1456
+ .parent = 0,
1457
+ .data = 0,
1458
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
1459
+ };
1460
+
1461
+ static VALUE frb_reg_exp_analyzer_alloc(VALUE rclass) {
1462
+ FrtAnalyzer *a;
1463
+ return TypedData_Make_Struct(rclass, FrtAnalyzer, &frb_reg_exp_analyzer_t, a);
1464
+ }
1465
+
1549
1466
  /*
1550
1467
  * call-seq:
1551
1468
  * RegExpAnalyzer.new(reg_exp, lower = true) -> analyzer
@@ -1556,27 +1473,26 @@ re_analyzer_destroy_i(FrtAnalyzer *a)
1556
1473
  * reg_exp:: the token matcher for the tokenizer to use
1557
1474
  * lower:: set to false if you don't want to downcase the tokens
1558
1475
  */
1559
- static VALUE
1560
- frb_re_analyzer_init(int argc, VALUE *argv, VALUE self)
1561
- {
1476
+ static VALUE frb_re_analyzer_init(int argc, VALUE *argv, VALUE self) {
1562
1477
  VALUE lower, rets, regex, proc;
1563
1478
  FrtAnalyzer *a;
1564
1479
  FrtTokenStream *ts;
1565
1480
  rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
1566
1481
 
1567
1482
  ts = rets_new(Qnil, regex, proc);
1568
- rets = Data_Wrap_Struct(cRegExpTokenizer, &frb_rets_mark, &frb_rets_free, ts);
1569
- object_add(ts, rets);
1483
+ rets = TypedData_Wrap_Struct(cRegExpTokenizer, &frb_reg_exp_token_stream_t, ts);
1484
+ ts->rts = rets;
1570
1485
 
1571
1486
  if (lower != Qfalse) {
1572
- rets = frb_lowercase_filter_init(frb_data_alloc(cLowerCaseFilter), rets);
1487
+ rets = frb_lowercase_filter_init(frb_lowercase_filter_alloc(cLowerCaseFilter), rets);
1573
1488
  ts = DATA_PTR(rets);
1489
+ ts->rts = rets;
1574
1490
  }
1575
1491
  FRT_REF(ts);
1576
1492
 
1577
- a = frt_analyzer_new(ts, &re_analyzer_destroy_i, NULL);
1578
- Frt_Wrap_Struct(self, &frb_re_analyzer_mark, &frb_analyzer_free, a);
1579
- object_add(a, self);
1493
+ TypedData_Get_Struct(self, FrtAnalyzer, &frb_reg_exp_analyzer_t, a);
1494
+ frt_analyzer_init(a, ts, &re_analyzer_destroy_i, NULL);
1495
+ a->ranalyzer = self;
1580
1496
  return self;
1581
1497
  }
1582
1498
 
@@ -1591,63 +1507,16 @@ frb_re_analyzer_init(int argc, VALUE *argv, VALUE self)
1591
1507
  * field_name:: name of the field to be tokenized
1592
1508
  * input:: data from the field to be tokenized
1593
1509
  */
1594
- static VALUE
1595
- frb_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
1596
- {
1510
+ static VALUE frb_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext) {
1597
1511
  FrtTokenStream *ts;
1598
1512
  FrtAnalyzer *a;
1599
- GET_A(a, self);
1600
-
1601
- StringValue(rtext);
1513
+ TypedData_Get_Struct(self, FrtAnalyzer, &frb_reg_exp_analyzer_t, a);
1602
1514
 
1603
- ts = frt_a_get_ts(a, frb_field(rfield), rs2s(rtext));
1515
+ ts = frt_a_get_ts(a, frb_field(rfield), rs2s(rtext), rb_enc_get(rtext));
1604
1516
 
1605
- /* Make sure that there is no entry already */
1606
- object_set(&ts->text, rtext);
1607
- if (ts->next == &rets_next) {
1608
- RETS(ts)->rtext = rtext;
1609
- rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
1610
- }
1611
- else {
1612
- RETS(((FrtTokenFilter*)ts)->sub_ts)->rtext = rtext;
1613
- rb_hash_aset(object_space, ((VALUE)((FrtTokenFilter*)ts)->sub_ts)|1, rtext);
1614
- }
1615
1517
  return get_rb_token_stream(ts);
1616
1518
  }
1617
1519
 
1618
- /****************************************************************************
1619
- *
1620
- * Locale stuff
1621
- *
1622
- ****************************************************************************/
1623
-
1624
- /*
1625
- * call-seq:
1626
- * Ferret.locale -> locale_str
1627
- *
1628
- * Returns a string corresponding to the locale set. For example;
1629
- *
1630
- * puts Ferret.locale #=> "en_US.UTF-8"
1631
- */
1632
- static VALUE frb_get_locale(VALUE self)
1633
- {
1634
- return (frb_locale ? rb_str_new2(frb_locale) : Qnil);
1635
- }
1636
-
1637
- /*
1638
- * call-seq:
1639
- * Ferret.locale = "en_US.UTF-8"
1640
- *
1641
- * Set the global locale. You should use this method to set different locales
1642
- * when indexing documents with different encodings.
1643
- */
1644
- static VALUE frb_set_locale(VALUE self, VALUE locale)
1645
- {
1646
- char *l = ((locale == Qnil) ? NULL : rs2s(rb_obj_as_string(locale)));
1647
- frb_locale = setlocale(LC_CTYPE, l);
1648
- return frb_locale ? rb_str_new2(frb_locale) : Qnil;
1649
- }
1650
-
1651
1520
  /****************************************************************************
1652
1521
  *
1653
1522
  * Init Functions
@@ -1680,8 +1549,7 @@ static VALUE frb_set_locale(VALUE self, VALUE locale)
1680
1549
  * equal to @text.length(), as the term text may have been
1681
1550
  * altered by a stemmer or some other filter.
1682
1551
  */
1683
- static void Init_Token(void)
1684
- {
1552
+ static void Init_Token(void) {
1685
1553
  cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
1686
1554
  rb_define_alloc_func(cToken, frb_token_alloc);
1687
1555
  rb_include_module(cToken, rb_mComparable);
@@ -1712,8 +1580,7 @@ static void Init_Token(void)
1712
1580
  * Tokenizer:: a TokenStream whose input is a string
1713
1581
  * TokenFilter:: a TokenStream whose input is another TokenStream
1714
1582
  */
1715
- static void Init_TokenStream(void)
1716
- {
1583
+ static void Init_TokenStream(void) {
1717
1584
  cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
1718
1585
  frb_mark_cclass(cTokenStream);
1719
1586
  rb_define_method(cTokenStream, "next", frb_ts_next, 0);
@@ -1721,30 +1588,6 @@ static void Init_TokenStream(void)
1721
1588
  rb_define_method(cTokenStream, "text", frb_ts_get_text, 0);
1722
1589
  }
1723
1590
 
1724
- /*
1725
- * Document-class: Ferret::Analysis::AsciiLetterTokenizer
1726
- *
1727
- * == Summary
1728
- *
1729
- * A LetterTokenizer is a tokenizer that divides text at non-ASCII letters.
1730
- * That is to say, it defines tokens as maximal strings of adjacent letters,
1731
- * as defined by the regular expression _/[A-Za-z]+/_.
1732
- *
1733
- * === Example
1734
- *
1735
- * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1736
- * => ["Dave", "s", "r", "sum", "at", "http", "www", "davebalmain", "com"]
1737
- */
1738
- static void Init_AsciiLetterTokenizer(void)
1739
- {
1740
- cAsciiLetterTokenizer =
1741
- rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
1742
- frb_mark_cclass(cAsciiLetterTokenizer);
1743
- rb_define_alloc_func(cAsciiLetterTokenizer, frb_data_alloc);
1744
- rb_define_method(cAsciiLetterTokenizer, "initialize",
1745
- frb_a_letter_tokenizer_init, 1);
1746
- }
1747
-
1748
1591
  /*
1749
1592
  * Document-class: Ferret::Analysis::LetterTokenizer
1750
1593
  *
@@ -1760,38 +1603,11 @@ static void Init_AsciiLetterTokenizer(void)
1760
1603
  * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1761
1604
  * => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
1762
1605
  */
1763
- static void Init_LetterTokenizer(void)
1764
- {
1765
- cLetterTokenizer =
1766
- rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
1606
+ static void Init_LetterTokenizer(void) {
1607
+ cLetterTokenizer = rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
1767
1608
  frb_mark_cclass(cLetterTokenizer);
1768
- rb_define_alloc_func(cLetterTokenizer, frb_data_alloc);
1769
- rb_define_method(cLetterTokenizer, "initialize",
1770
- frb_letter_tokenizer_init, -1);
1771
- }
1772
-
1773
- /*
1774
- * Document-class: Ferret::Analysis::AsciiWhiteSpaceTokenizer
1775
- *
1776
- * == Summary
1777
- *
1778
- * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
1779
- * Adjacent sequences of non-WhiteSpace characters form tokens.
1780
- *
1781
- * === Example
1782
- *
1783
- * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1784
- * => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
1785
- */
1786
- static void Init_AsciiWhiteSpaceTokenizer(void)
1787
- {
1788
- cAsciiWhiteSpaceTokenizer =
1789
- rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
1790
- cTokenStream);
1791
- frb_mark_cclass(cAsciiWhiteSpaceTokenizer);
1792
- rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frb_data_alloc);
1793
- rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
1794
- frb_a_whitespace_tokenizer_init, 1);
1609
+ rb_define_alloc_func(cLetterTokenizer, frb_letter_tokenizer_alloc);
1610
+ rb_define_method(cLetterTokenizer, "initialize", frb_letter_tokenizer_init, -1);
1795
1611
  }
1796
1612
 
1797
1613
  /*
@@ -1807,38 +1623,11 @@ static void Init_AsciiWhiteSpaceTokenizer(void)
1807
1623
  * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1808
1624
  * => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
1809
1625
  */
1810
- static void Init_WhiteSpaceTokenizer(void)
1811
- {
1812
- cWhiteSpaceTokenizer =
1813
- rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
1626
+ static void Init_WhiteSpaceTokenizer(void) {
1627
+ cWhiteSpaceTokenizer = rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
1814
1628
  frb_mark_cclass(cWhiteSpaceTokenizer);
1815
- rb_define_alloc_func(cWhiteSpaceTokenizer, frb_data_alloc);
1816
- rb_define_method(cWhiteSpaceTokenizer, "initialize",
1817
- frb_whitespace_tokenizer_init, -1);
1818
- }
1819
-
1820
- /*
1821
- * Document-class: Ferret::Analysis::AsciiStandardTokenizer
1822
- *
1823
- * == Summary
1824
- *
1825
- * The standard tokenizer is an advanced tokenizer which tokenizes most
1826
- * words correctly as well as tokenizing things like email addresses, web
1827
- * addresses, phone numbers, etc.
1828
- *
1829
- * === Example
1830
- *
1831
- * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1832
- * => ["Dave's", "r", "sum", "at", "http://www.davebalmain.com", "1234"]
1833
- */
1834
- static void Init_AsciiStandardTokenizer(void)
1835
- {
1836
- cAsciiStandardTokenizer =
1837
- rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
1838
- frb_mark_cclass(cAsciiStandardTokenizer);
1839
- rb_define_alloc_func(cAsciiStandardTokenizer, frb_data_alloc);
1840
- rb_define_method(cAsciiStandardTokenizer, "initialize",
1841
- frb_a_standard_tokenizer_init, 1);
1629
+ rb_define_alloc_func(cWhiteSpaceTokenizer, frb_whitespace_tokenizer_alloc);
1630
+ rb_define_method(cWhiteSpaceTokenizer, "initialize", frb_whitespace_tokenizer_init, -1);
1842
1631
  }
1843
1632
 
1844
1633
  /*
@@ -1855,14 +1644,11 @@ static void Init_AsciiStandardTokenizer(void)
1855
1644
  * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1856
1645
  * => ["Dave's", "résumé", "at", "http://www.davebalmain.com", "1234"]
1857
1646
  */
1858
- static void Init_StandardTokenizer(void)
1859
- {
1860
- cStandardTokenizer =
1861
- rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
1647
+ static void Init_StandardTokenizer(void) {
1648
+ cStandardTokenizer = rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
1862
1649
  frb_mark_cclass(cStandardTokenizer);
1863
- rb_define_alloc_func(cStandardTokenizer, frb_data_alloc);
1864
- rb_define_method(cStandardTokenizer, "initialize",
1865
- frb_standard_tokenizer_init, 1);
1650
+ rb_define_alloc_func(cStandardTokenizer, frb_standard_tokenizer_alloc);
1651
+ rb_define_method(cStandardTokenizer, "initialize", frb_standard_tokenizer_init, -1);
1866
1652
  }
1867
1653
 
1868
1654
  /*
@@ -1885,16 +1671,13 @@ static void Init_StandardTokenizer(void)
1885
1671
  * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1886
1672
  * => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
1887
1673
  */
1888
- static void Init_RegExpTokenizer(void)
1889
- {
1890
- cRegExpTokenizer =
1891
- rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
1674
+ static void Init_RegExpTokenizer(void) {
1675
+ cRegExpTokenizer = rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
1892
1676
  frb_mark_cclass(cRegExpTokenizer);
1893
1677
  rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
1894
1678
  rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
1895
- rb_define_alloc_func(cRegExpTokenizer, frb_data_alloc);
1896
- rb_define_method(cRegExpTokenizer, "initialize",
1897
- frb_rets_init, -1);
1679
+ rb_define_alloc_func(cRegExpTokenizer, frb_reg_exp_tokenizer_alloc);
1680
+ rb_define_method(cRegExpTokenizer, "initialize", frb_rets_init, -1);
1898
1681
  rb_define_method(cRegExpTokenizer, "text=", frb_rets_set_text, 1);
1899
1682
  rb_define_method(cRegExpTokenizer, "text", frb_rets_get_text, 0);
1900
1683
  }
@@ -1902,30 +1685,6 @@ static void Init_RegExpTokenizer(void)
1902
1685
  /***************/
1903
1686
  /*** Filters ***/
1904
1687
  /***************/
1905
-
1906
- /*
1907
- * Document-class: Ferret::Analysis::AsciiLowerCaseFilter
1908
- *
1909
- * == Summary
1910
- *
1911
- * AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
1912
- * ASCII characters. For other characters use LowerCaseFilter.
1913
- *
1914
- * === Example
1915
- *
1916
- * ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "rÉsumÉ"]
1917
- *
1918
- */
1919
- static void Init_AsciiLowerCaseFilter(void)
1920
- {
1921
- cAsciiLowerCaseFilter =
1922
- rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
1923
- frb_mark_cclass(cAsciiLowerCaseFilter);
1924
- rb_define_alloc_func(cAsciiLowerCaseFilter, frb_data_alloc);
1925
- rb_define_method(cAsciiLowerCaseFilter, "initialize",
1926
- frb_a_lowercase_filter_init, 1);
1927
- }
1928
-
1929
1688
  /*
1930
1689
  * Document-class: Ferret::Analysis::LowerCaseFilter
1931
1690
  *
@@ -1939,14 +1698,11 @@ static void Init_AsciiLowerCaseFilter(void)
1939
1698
  * ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "résumé"]
1940
1699
  *
1941
1700
  */
1942
- static void Init_LowerCaseFilter(void)
1943
- {
1944
- cLowerCaseFilter =
1945
- rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
1701
+ static void Init_LowerCaseFilter(void) {
1702
+ cLowerCaseFilter = rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
1946
1703
  frb_mark_cclass(cLowerCaseFilter);
1947
- rb_define_alloc_func(cLowerCaseFilter, frb_data_alloc);
1948
- rb_define_method(cLowerCaseFilter, "initialize",
1949
- frb_lowercase_filter_init, 1);
1704
+ rb_define_alloc_func(cLowerCaseFilter, frb_lowercase_filter_alloc);
1705
+ rb_define_method(cLowerCaseFilter, "initialize", frb_lowercase_filter_init, 1);
1950
1706
  }
1951
1707
 
1952
1708
  /*
@@ -1964,12 +1720,10 @@ static void Init_LowerCaseFilter(void)
1964
1720
  * ["e-mail", "set-up"] => ["email", "e", "mail", "setup", "set", "up"]
1965
1721
  *
1966
1722
  */
1967
- static void Init_HyphenFilter(void)
1968
- {
1969
- cHyphenFilter =
1970
- rb_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
1723
+ static void Init_HyphenFilter(void) {
1724
+ cHyphenFilter = rb_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
1971
1725
  frb_mark_cclass(cHyphenFilter);
1972
- rb_define_alloc_func(cHyphenFilter, frb_data_alloc);
1726
+ rb_define_alloc_func(cHyphenFilter, frb_hyphen_filter_alloc);
1973
1727
  rb_define_method(cHyphenFilter, "initialize", frb_hyphen_filter_init, 1);
1974
1728
  }
1975
1729
 
@@ -2014,14 +1768,11 @@ static void Init_HyphenFilter(void)
2014
1768
  * }
2015
1769
  * filt = MappingFilter.new(token_stream, mapping)
2016
1770
  */
2017
- static void Init_MappingFilter(void)
2018
- {
2019
- cMappingFilter =
2020
- rb_define_class_under(mAnalysis, "MappingFilter", cTokenStream);
1771
+ static void Init_MappingFilter(void) {
1772
+ cMappingFilter = rb_define_class_under(mAnalysis, "MappingFilter", cTokenStream);
2021
1773
  frb_mark_cclass(cMappingFilter);
2022
- rb_define_alloc_func(cMappingFilter, frb_data_alloc);
2023
- rb_define_method(cMappingFilter, "initialize",
2024
- frb_mapping_filter_init, 2);
1774
+ rb_define_alloc_func(cMappingFilter, frb_mapping_filter_alloc);
1775
+ rb_define_method(cMappingFilter, "initialize", frb_mapping_filter_init, 2);
2025
1776
  }
2026
1777
 
2027
1778
  /*
@@ -2037,14 +1788,11 @@ static void Init_MappingFilter(void)
2037
1788
  *
2038
1789
  * ["the", "pig", "and", "whistle"] => ["pig", "whistle"]
2039
1790
  */
2040
- static void Init_StopFilter(void)
2041
- {
2042
- cStopFilter =
2043
- rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
1791
+ static void Init_StopFilter(void) {
1792
+ cStopFilter = rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
2044
1793
  frb_mark_cclass(cStopFilter);
2045
- rb_define_alloc_func(cStopFilter, frb_data_alloc);
2046
- rb_define_method(cStopFilter, "initialize",
2047
- frb_stop_filter_init, -1);
1794
+ rb_define_alloc_func(cStopFilter, frb_stop_filter_alloc);
1795
+ rb_define_method(cStopFilter, "initialize", frb_stop_filter_init, -1);
2048
1796
  }
2049
1797
 
2050
1798
  /*
@@ -2056,28 +1804,41 @@ static void Init_StopFilter(void)
2056
1804
  * stemming algorithm. Note: the input to the stemming filter must already
2057
1805
  * be in lower case, so you will need to use LowerCaseFilter or lowercasing
2058
1806
  * Tokenizer further down the Tokenizer chain in order for this to work
2059
- * properly!
2060
- *
2061
- * === Available algorithms and encodings
2062
- *
2063
- * Algorithm Algorithm Pseudonyms Encoding
2064
- * ----------------------------------------------------------------
2065
- * "danish", | "da", "dan" | "ISO_8859_1", "UTF_8"
2066
- * "dutch", | "dut", "nld" | "ISO_8859_1", "UTF_8"
2067
- * "english", | "en", "eng" | "ISO_8859_1", "UTF_8"
2068
- * "finnish", | "fi", "fin" | "ISO_8859_1", "UTF_8"
2069
- * "french", | "fr", "fra", "fre" | "ISO_8859_1", "UTF_8"
2070
- * "german", | "de", "deu", "ge", "ger" | "ISO_8859_1", "UTF_8"
2071
- * "hungarian", | "hu", "hun" | "ISO_8859_1", "UTF_8"
2072
- * "italian", | "it", "ita" | "ISO_8859_1", "UTF_8"
2073
- * "norwegian", | "nl", "no" | "ISO_8859_1", "UTF_8"
2074
- * "porter", | | "ISO_8859_1", "UTF_8"
2075
- * "portuguese", | "por", "pt" | "ISO_8859_1", "UTF_8"
2076
- * "romanian", | "ro", "ron", "rum" | "ISO_8859_2", "UTF_8"
2077
- * "russian", | "ru", "rus" | "KOI8_R", "UTF_8"
2078
- * "spanish", | "es", "esl" | "ISO_8859_1", "UTF_8"
2079
- * "swedish", | "sv", "swe" | "ISO_8859_1", "UTF_8"
2080
- * "turkish", | "tr", "tur" | "UTF_8"
1807
+ * properly! Stemmers work on UTF-8 encoding only.
1808
+ *
1809
+ * === Available algorithms
1810
+ *
1811
+ * Algorithm Algorithm Pseudonyms
1812
+ * ------------------------------------------
1813
+ * "arabic" | "ar", "ara"
1814
+ * "armenian" | "arm", "hy", "hye"
1815
+ * "basque" | "baq", "eu", "eus"
1816
+ * "catalan" | "ca", "cat"
1817
+ * "danish" | "da", "dan"
1818
+ * "dutch" | "dut", "nl", "nld"
1819
+ * "english" | "en", "eng"
1820
+ * "finnish" | "fi", "fin"
1821
+ * "french" | "fr", "fra", "fre"
1822
+ * "german" | "de", "deu", "ge", "ger"
1823
+ * "greek" | "el", "ell", "gre"
1824
+ * "hindi" | "hi", "hin"
1825
+ * "hungarian" | "hu", "hun"
1826
+ * "indonesian" | "id", "ind"
1827
+ * "italian" | "it", "ita"
1828
+ * "irish" | "ga", "gle"
1829
+ * "lithuanian" | "lit"
1830
+ * "nepali" | "ne", "nep"
1831
+ * "norwegian" | "no", "nor"
1832
+ * "porter" |
1833
+ * "portuguese" | "por", "pt"
1834
+ * "romanian" | "ro", "ron", "rum"
1835
+ * "russian" | "ru", "rus"
1836
+ * "serbian" | "sr", "srp"
1837
+ * "spanish" | "es", "esl", "spa"
1838
+ * "swedish" | "sv", "swe"
1839
+ * "tamil" | "ta", "tam"
1840
+ * "turkish" | "tr", "tur"
1841
+ * "yiddish" | "yi", "yid"
2081
1842
  *
2082
1843
  *
2083
1844
  * === New Stemmers
@@ -2109,14 +1870,11 @@ static void Init_StopFilter(void)
2109
1870
  * algorithm:: The algorithm (or language) to use (default: "english")
2110
1871
  * encoding:: The encoding of the data (default: "UTF-8")
2111
1872
  */
2112
- static void Init_StemFilter(void)
2113
- {
2114
- cStemFilter =
2115
- rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
1873
+ static void Init_StemFilter(void) {
1874
+ cStemFilter = rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
2116
1875
  frb_mark_cclass(cStemFilter);
2117
- rb_define_alloc_func(cStemFilter, frb_data_alloc);
2118
- rb_define_method(cStemFilter, "initialize",
2119
- frb_stem_filter_init, -1);
1876
+ rb_define_alloc_func(cStemFilter, frb_stem_filter_alloc);
1877
+ rb_define_method(cStemFilter, "initialize", frb_stem_filter_init, -1);
2120
1878
  }
2121
1879
 
2122
1880
  /*************************/
@@ -2153,61 +1911,21 @@ static void Init_StemFilter(void)
2153
1911
  * end
2154
1912
  * end
2155
1913
  */
2156
- static void Init_Analyzer(void)
2157
- {
2158
- cAnalyzer =
2159
- rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
1914
+ static void Init_Analyzer(void) {
1915
+ cAnalyzer = rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
2160
1916
  frb_mark_cclass(cAnalyzer);
2161
- rb_define_alloc_func(cAnalyzer, frb_data_alloc);
2162
- rb_define_method(cAnalyzer, "initialize", frb_letter_analyzer_init, -1);
1917
+ rb_define_alloc_func(cAnalyzer, frb_analyzer_alloc);
1918
+ rb_define_method(cAnalyzer, "initialize", frb_analyzer_init, -1);
2163
1919
  rb_define_method(cAnalyzer, "token_stream", frb_analyzer_token_stream, 2);
2164
1920
  }
2165
1921
 
2166
- /*
2167
- * Document-class: Ferret::Analysis::AsciiLetterAnalyzer
2168
- *
2169
- * == Summary
2170
- *
2171
- * An AsciiLetterAnalyzer creates a TokenStream that splits the input up into
2172
- * maximal strings of ASCII characters. If implemented in Ruby it would look
2173
- * like;
2174
- *
2175
- * class AsciiLetterAnalyzer
2176
- * def initialize(lower = true)
2177
- * @lower = lower
2178
- * end
2179
- *
2180
- * def token_stream(field, str)
2181
- * if @lower
2182
- * return AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(str))
2183
- * else
2184
- * return AsciiLetterTokenizer.new(str)
2185
- * end
2186
- * end
2187
- * end
2188
- *
2189
- * As you can see it makes use of the AsciiLetterTokenizer and
2190
- * AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ASCII
2191
- * characters so you should use the LetterAnalyzer is you want to analyze
2192
- * multi-byte data like "UTF-8".
2193
- */
2194
- static void Init_AsciiLetterAnalyzer(void)
2195
- {
2196
- cAsciiLetterAnalyzer =
2197
- rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
2198
- frb_mark_cclass(cAsciiLetterAnalyzer);
2199
- rb_define_alloc_func(cAsciiLetterAnalyzer, frb_data_alloc);
2200
- rb_define_method(cAsciiLetterAnalyzer, "initialize",
2201
- frb_a_letter_analyzer_init, -1);
2202
- }
2203
-
2204
1922
  /*
2205
1923
  * Document-class: Ferret::Analysis::LetterAnalyzer
2206
1924
  *
2207
1925
  * == Summary
2208
1926
  *
2209
1927
  * A LetterAnalyzer creates a TokenStream that splits the input up into
2210
- * maximal strings of characters as recognized by the current locale. If
1928
+ * maximal strings of characters as recognized by the current encoding. If
2211
1929
  * implemented in Ruby it would look like;
2212
1930
  *
2213
1931
  * class LetterAnalyzer
@@ -2222,51 +1940,11 @@ static void Init_AsciiLetterAnalyzer(void)
2222
1940
  *
2223
1941
  * As you can see it makes use of the LetterTokenizer.
2224
1942
  */
2225
- static void Init_LetterAnalyzer(void)
2226
- {
2227
- cLetterAnalyzer =
2228
- rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
1943
+ static void Init_LetterAnalyzer(void) {
1944
+ cLetterAnalyzer = rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
2229
1945
  frb_mark_cclass(cLetterAnalyzer);
2230
- rb_define_alloc_func(cLetterAnalyzer, frb_data_alloc);
2231
- rb_define_method(cLetterAnalyzer, "initialize",
2232
- frb_letter_analyzer_init, -1);
2233
- }
2234
-
2235
- /*
2236
- * Document-class: Ferret::Analysis::AsciiWhiteSpaceAnalyzer
2237
- *
2238
- * == Summary
2239
- *
2240
- * The AsciiWhiteSpaceAnalyzer recognizes tokens as maximal strings of
2241
- * non-whitespace characters. If implemented in Ruby the
2242
- * AsciiWhiteSpaceAnalyzer would look like;
2243
- *
2244
- * class AsciiWhiteSpaceAnalyzer
2245
- * def initialize(lower = true)
2246
- * @lower = lower
2247
- * end
2248
- *
2249
- * def token_stream(field, str)
2250
- * if @lower
2251
- * return AsciiLowerCaseFilter.new(AsciiWhiteSpaceTokenizer.new(str))
2252
- * else
2253
- * return AsciiWhiteSpaceTokenizer.new(str)
2254
- * end
2255
- * end
2256
- * end
2257
- *
2258
- * As you can see it makes use of the AsciiWhiteSpaceTokenizer. You should
2259
- * use WhiteSpaceAnalyzer if you want to recognize multibyte encodings such
2260
- * as "UTF-8".
2261
- */
2262
- static void Init_AsciiWhiteSpaceAnalyzer(void)
2263
- {
2264
- cAsciiWhiteSpaceAnalyzer =
2265
- rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
2266
- frb_mark_cclass(cAsciiWhiteSpaceAnalyzer);
2267
- rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frb_data_alloc);
2268
- rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
2269
- frb_a_white_space_analyzer_init, -1);
1946
+ rb_define_alloc_func(cLetterAnalyzer, frb_letter_analyzer_alloc);
1947
+ rb_define_method(cLetterAnalyzer, "initialize", frb_letter_analyzer_init, -1);
2270
1948
  }
2271
1949
 
2272
1950
  /*
@@ -2290,51 +1968,11 @@ static void Init_AsciiWhiteSpaceAnalyzer(void)
2290
1968
  *
2291
1969
  * As you can see it makes use of the WhiteSpaceTokenizer.
2292
1970
  */
2293
- static void Init_WhiteSpaceAnalyzer(void)
2294
- {
2295
- cWhiteSpaceAnalyzer =
2296
- rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
1971
+ static void Init_WhiteSpaceAnalyzer(void) {
1972
+ cWhiteSpaceAnalyzer = rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
2297
1973
  frb_mark_cclass(cWhiteSpaceAnalyzer);
2298
- rb_define_alloc_func(cWhiteSpaceAnalyzer, frb_data_alloc);
2299
- rb_define_method(cWhiteSpaceAnalyzer, "initialize",
2300
- frb_white_space_analyzer_init, -1);
2301
- }
2302
-
2303
- /*
2304
- * Document-class: Ferret::Analysis::AsciiStandardAnalyzer
2305
- *
2306
- * == Summary
2307
- *
2308
- * The AsciiStandardAnalyzer is the most advanced of the available
2309
- * ASCII-analyzers. If it were implemented in Ruby it would look like this;
2310
- *
2311
- * class AsciiStandardAnalyzer
2312
- * def initialize(stop_words = FRT_FULL_ENGLISH_STOP_WORDS, lower = true)
2313
- * @lower = lower
2314
- * @stop_words = stop_words
2315
- * end
2316
- *
2317
- * def token_stream(field, str)
2318
- * ts = AsciiStandardTokenizer.new(str)
2319
- * ts = AsciiLowerCaseFilter.new(ts) if @lower
2320
- * ts = StopFilter.new(ts, @stop_words)
2321
- * ts = HyphenFilter.new(ts)
2322
- * end
2323
- * end
2324
- *
2325
- * As you can see it makes use of the AsciiStandardTokenizer and you can also
2326
- * add your own list of stop-words if you wish. Note that this tokenizer
2327
- * won't recognize non-ASCII characters so you should use the
2328
- * StandardAnalyzer is you want to analyze multi-byte data like "UTF-8".
2329
- */
2330
- static void Init_AsciiStandardAnalyzer(void)
2331
- {
2332
- cAsciiStandardAnalyzer =
2333
- rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
2334
- frb_mark_cclass(cAsciiStandardAnalyzer);
2335
- rb_define_alloc_func(cAsciiStandardAnalyzer, frb_data_alloc);
2336
- rb_define_method(cAsciiStandardAnalyzer, "initialize",
2337
- frb_a_standard_analyzer_init, -1);
1974
+ rb_define_alloc_func(cWhiteSpaceAnalyzer, frb_whitespace_analyzer_alloc);
1975
+ rb_define_method(cWhiteSpaceAnalyzer, "initialize", frb_whitespace_analyzer_init, -1);
2338
1976
  }
2339
1977
 
2340
1978
  /*
@@ -2362,14 +2000,11 @@ static void Init_AsciiStandardAnalyzer(void)
2362
2000
  * As you can see it makes use of the StandardTokenizer and you can also add
2363
2001
  * your own list of stopwords if you wish.
2364
2002
  */
2365
- static void Init_StandardAnalyzer(void)
2366
- {
2367
- cStandardAnalyzer =
2368
- rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
2003
+ static void Init_StandardAnalyzer(void) {
2004
+ cStandardAnalyzer = rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
2369
2005
  frb_mark_cclass(cStandardAnalyzer);
2370
- rb_define_alloc_func(cStandardAnalyzer, frb_data_alloc);
2371
- rb_define_method(cStandardAnalyzer, "initialize",
2372
- frb_standard_analyzer_init, -1);
2006
+ rb_define_alloc_func(cStandardAnalyzer, frb_standard_analyzer_alloc);
2007
+ rb_define_method(cStandardAnalyzer, "initialize", frb_standard_analyzer_init, -1);
2373
2008
  }
2374
2009
 
2375
2010
  /*
@@ -2392,20 +2027,14 @@ static void Init_StandardAnalyzer(void)
2392
2027
  * # Use a custom analyzer on the :created_at field
2393
2028
  * pfa[:created_at] = DateAnalyzer.new
2394
2029
  */
2395
- static void Init_PerFieldAnalyzer(void)
2396
- {
2397
- cPerFieldAnalyzer =
2398
- rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
2030
+ static void Init_PerFieldAnalyzer(void) {
2031
+ cPerFieldAnalyzer = rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
2399
2032
  frb_mark_cclass(cPerFieldAnalyzer);
2400
- rb_define_alloc_func(cPerFieldAnalyzer, frb_data_alloc);
2401
- rb_define_method(cPerFieldAnalyzer, "initialize",
2402
- frb_per_field_analyzer_init, 1);
2403
- rb_define_method(cPerFieldAnalyzer, "add_field",
2404
- frb_per_field_analyzer_add_field, 2);
2405
- rb_define_method(cPerFieldAnalyzer, "[]=",
2406
- frb_per_field_analyzer_add_field, 2);
2407
- rb_define_method(cPerFieldAnalyzer, "token_stream",
2408
- frb_pfa_analyzer_token_stream, 2);
2033
+ rb_define_alloc_func(cPerFieldAnalyzer, frb_per_field_analyzer_alloc);
2034
+ rb_define_method(cPerFieldAnalyzer, "initialize", frb_per_field_analyzer_init, 1);
2035
+ rb_define_method(cPerFieldAnalyzer, "add_field", frb_per_field_analyzer_add_field, 2);
2036
+ rb_define_method(cPerFieldAnalyzer, "[]=", frb_per_field_analyzer_add_field, 2);
2037
+ rb_define_method(cPerFieldAnalyzer, "token_stream", frb_pfa_analyzer_token_stream, 2);
2409
2038
  }
2410
2039
 
2411
2040
  /*
@@ -2435,16 +2064,12 @@ static void Init_PerFieldAnalyzer(void)
2435
2064
  *
2436
2065
  * csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)
2437
2066
  */
2438
- static void Init_RegExpAnalyzer(void)
2439
- {
2440
- cRegExpAnalyzer =
2441
- rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
2067
+ static void Init_RegExpAnalyzer(void) {
2068
+ cRegExpAnalyzer = rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
2442
2069
  frb_mark_cclass(cRegExpAnalyzer);
2443
- rb_define_alloc_func(cRegExpAnalyzer, frb_data_alloc);
2444
- rb_define_method(cRegExpAnalyzer, "initialize",
2445
- frb_re_analyzer_init, -1);
2446
- rb_define_method(cRegExpAnalyzer, "token_stream",
2447
- frb_re_analyzer_token_stream, 2);
2070
+ rb_define_alloc_func(cRegExpAnalyzer, frb_reg_exp_analyzer_alloc);
2071
+ rb_define_method(cRegExpAnalyzer, "initialize", frb_re_analyzer_init, -1);
2072
+ rb_define_method(cRegExpAnalyzer, "token_stream", frb_re_analyzer_token_stream, 2);
2448
2073
  }
2449
2074
 
2450
2075
  /* rdoc hack
@@ -2491,9 +2116,7 @@ extern VALUE mFerret = rb_define_module("Ferret");
2491
2116
  * offsets of 10 and 19 respectively ("Beginning".length == 9) but Token#text
2492
2117
  * might be "begin" (after stemming).
2493
2118
  */
2494
- void
2495
- Init_Analysis(void)
2496
- {
2119
+ void Init_Analysis(void) {
2497
2120
  mAnalysis = rb_define_module_under(mFerret, "Analysis");
2498
2121
 
2499
2122
  /* TokenStream Methods */
@@ -2508,56 +2131,30 @@ Init_Analysis(void)
2508
2131
  object_space = rb_hash_new();
2509
2132
  rb_define_const(mFerret, "OBJECT_SPACE", object_space);
2510
2133
 
2511
- /*** * * Locale stuff * * ***/
2512
- rb_define_singleton_method(mFerret, "locale=", frb_set_locale, 1);
2513
- rb_define_singleton_method(mFerret, "locale", frb_get_locale, 0);
2514
-
2515
- rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
2516
- get_rstopwords(FRT_ENGLISH_STOP_WORDS));
2517
- rb_define_const(mAnalysis, "FULL_ENGLISH_STOP_WORDS",
2518
- get_rstopwords(FRT_FULL_ENGLISH_STOP_WORDS));
2519
- rb_define_const(mAnalysis, "EXTENDED_ENGLISH_STOP_WORDS",
2520
- get_rstopwords(FRT_EXTENDED_ENGLISH_STOP_WORDS));
2521
- rb_define_const(mAnalysis, "FULL_FRENCH_STOP_WORDS",
2522
- get_rstopwords(FRT_FULL_FRENCH_STOP_WORDS));
2523
- rb_define_const(mAnalysis, "FULL_SPANISH_STOP_WORDS",
2524
- get_rstopwords(FRT_FULL_SPANISH_STOP_WORDS));
2525
- rb_define_const(mAnalysis, "FULL_PORTUGUESE_STOP_WORDS",
2526
- get_rstopwords(FRT_FULL_PORTUGUESE_STOP_WORDS));
2527
- rb_define_const(mAnalysis, "FULL_ITALIAN_STOP_WORDS",
2528
- get_rstopwords(FRT_FULL_ITALIAN_STOP_WORDS));
2529
- rb_define_const(mAnalysis, "FULL_GERMAN_STOP_WORDS",
2530
- get_rstopwords(FRT_FULL_GERMAN_STOP_WORDS));
2531
- rb_define_const(mAnalysis, "FULL_DUTCH_STOP_WORDS",
2532
- get_rstopwords(FRT_FULL_DUTCH_STOP_WORDS));
2533
- rb_define_const(mAnalysis, "FULL_SWEDISH_STOP_WORDS",
2534
- get_rstopwords(FRT_FULL_SWEDISH_STOP_WORDS));
2535
- rb_define_const(mAnalysis, "FULL_NORWEGIAN_STOP_WORDS",
2536
- get_rstopwords(FRT_FULL_NORWEGIAN_STOP_WORDS));
2537
- rb_define_const(mAnalysis, "FULL_DANISH_STOP_WORDS",
2538
- get_rstopwords(FRT_FULL_DANISH_STOP_WORDS));
2539
- rb_define_const(mAnalysis, "FULL_RUSSIAN_STOP_WORDS",
2540
- get_rstopwords(FRT_FULL_RUSSIAN_STOP_WORDS));
2541
- rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
2542
- get_rstopwords(FRT_FULL_FINNISH_STOP_WORDS));
2543
- rb_define_const(mAnalysis, "FULL_HUNGARIAN_STOP_WORDS",
2544
- get_rstopwords(FRT_FULL_HUNGARIAN_STOP_WORDS));
2134
+ rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS", get_rstopwords(FRT_ENGLISH_STOP_WORDS));
2135
+ rb_define_const(mAnalysis, "FULL_ENGLISH_STOP_WORDS", get_rstopwords(FRT_FULL_ENGLISH_STOP_WORDS));
2136
+ rb_define_const(mAnalysis, "EXTENDED_ENGLISH_STOP_WORDS", get_rstopwords(FRT_EXTENDED_ENGLISH_STOP_WORDS));
2137
+ rb_define_const(mAnalysis, "FULL_FRENCH_STOP_WORDS", get_rstopwords(FRT_FULL_FRENCH_STOP_WORDS));
2138
+ rb_define_const(mAnalysis, "FULL_SPANISH_STOP_WORDS", get_rstopwords(FRT_FULL_SPANISH_STOP_WORDS));
2139
+ rb_define_const(mAnalysis, "FULL_PORTUGUESE_STOP_WORDS", get_rstopwords(FRT_FULL_PORTUGUESE_STOP_WORDS));
2140
+ rb_define_const(mAnalysis, "FULL_ITALIAN_STOP_WORDS", get_rstopwords(FRT_FULL_ITALIAN_STOP_WORDS));
2141
+ rb_define_const(mAnalysis, "FULL_GERMAN_STOP_WORDS", get_rstopwords(FRT_FULL_GERMAN_STOP_WORDS));
2142
+ rb_define_const(mAnalysis, "FULL_DUTCH_STOP_WORDS", get_rstopwords(FRT_FULL_DUTCH_STOP_WORDS));
2143
+ rb_define_const(mAnalysis, "FULL_SWEDISH_STOP_WORDS", get_rstopwords(FRT_FULL_SWEDISH_STOP_WORDS));
2144
+ rb_define_const(mAnalysis, "FULL_NORWEGIAN_STOP_WORDS", get_rstopwords(FRT_FULL_NORWEGIAN_STOP_WORDS));
2145
+ rb_define_const(mAnalysis, "FULL_DANISH_STOP_WORDS", get_rstopwords(FRT_FULL_DANISH_STOP_WORDS));
2146
+ rb_define_const(mAnalysis, "FULL_RUSSIAN_STOP_WORDS", get_rstopwords(FRT_FULL_RUSSIAN_STOP_WORDS));
2147
+ rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS", get_rstopwords(FRT_FULL_FINNISH_STOP_WORDS));
2148
+ rb_define_const(mAnalysis, "FULL_HUNGARIAN_STOP_WORDS", get_rstopwords(FRT_FULL_HUNGARIAN_STOP_WORDS));
2545
2149
 
2546
2150
  Init_Token();
2547
2151
  Init_TokenStream();
2548
2152
 
2549
- Init_AsciiLetterTokenizer();
2550
2153
  Init_LetterTokenizer();
2551
-
2552
- Init_AsciiWhiteSpaceTokenizer();
2553
2154
  Init_WhiteSpaceTokenizer();
2554
-
2555
- Init_AsciiStandardTokenizer();
2556
2155
  Init_StandardTokenizer();
2557
-
2558
2156
  Init_RegExpTokenizer();
2559
2157
 
2560
- Init_AsciiLowerCaseFilter();
2561
2158
  Init_LowerCaseFilter();
2562
2159
  Init_HyphenFilter();
2563
2160
  Init_StopFilter();
@@ -2565,13 +2162,9 @@ Init_Analysis(void)
2565
2162
  Init_StemFilter();
2566
2163
 
2567
2164
  Init_Analyzer();
2568
- Init_AsciiLetterAnalyzer();
2569
2165
  Init_LetterAnalyzer();
2570
- Init_AsciiWhiteSpaceAnalyzer();
2571
2166
  Init_WhiteSpaceAnalyzer();
2572
- Init_AsciiStandardAnalyzer();
2573
2167
  Init_StandardAnalyzer();
2574
2168
  Init_PerFieldAnalyzer();
2575
2169
  Init_RegExpAnalyzer();
2576
-
2577
2170
  }