ferret 0.11.6 → 0.11.8.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (185)
  1. data/README +10 -22
  2. data/RELEASE_CHANGES +137 -0
  3. data/RELEASE_NOTES +60 -0
  4. data/Rakefile +379 -274
  5. data/TODO +100 -8
  6. data/bin/ferret-browser +0 -0
  7. data/ext/BZLIB_blocksort.c +1094 -0
  8. data/ext/BZLIB_bzlib.c +1578 -0
  9. data/ext/BZLIB_compress.c +672 -0
  10. data/ext/BZLIB_crctable.c +104 -0
  11. data/ext/BZLIB_decompress.c +626 -0
  12. data/ext/BZLIB_huffman.c +205 -0
  13. data/ext/BZLIB_randtable.c +84 -0
  14. data/ext/{api.c → STEMMER_api.c} +7 -10
  15. data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
  16. data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
  17. data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
  18. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  19. data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
  20. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  21. data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
  22. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  25. data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
  26. data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
  27. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  29. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  30. data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
  31. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  32. data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
  33. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  34. data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
  35. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  36. data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
  37. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  38. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  39. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  40. data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
  41. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  42. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  43. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  44. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  45. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  46. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  47. data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
  48. data/ext/analysis.c +276 -121
  49. data/ext/analysis.h +190 -143
  50. data/ext/api.h +3 -4
  51. data/ext/array.c +5 -3
  52. data/ext/array.h +52 -43
  53. data/ext/bitvector.c +38 -482
  54. data/ext/bitvector.h +446 -124
  55. data/ext/bzlib.h +282 -0
  56. data/ext/bzlib_private.h +503 -0
  57. data/ext/compound_io.c +23 -22
  58. data/ext/config.h +21 -11
  59. data/ext/document.c +43 -40
  60. data/ext/document.h +31 -21
  61. data/ext/except.c +20 -38
  62. data/ext/except.h +89 -76
  63. data/ext/extconf.rb +3 -2
  64. data/ext/ferret.c +49 -35
  65. data/ext/ferret.h +14 -11
  66. data/ext/field_index.c +262 -0
  67. data/ext/field_index.h +52 -0
  68. data/ext/filter.c +11 -10
  69. data/ext/fs_store.c +65 -47
  70. data/ext/global.c +245 -165
  71. data/ext/global.h +252 -54
  72. data/ext/hash.c +200 -243
  73. data/ext/hash.h +205 -163
  74. data/ext/hashset.c +118 -96
  75. data/ext/hashset.h +110 -82
  76. data/ext/header.h +19 -19
  77. data/ext/helper.c +11 -10
  78. data/ext/helper.h +14 -6
  79. data/ext/index.c +745 -366
  80. data/ext/index.h +503 -529
  81. data/ext/internal.h +1020 -0
  82. data/ext/lang.c +10 -0
  83. data/ext/lang.h +35 -15
  84. data/ext/mempool.c +5 -4
  85. data/ext/mempool.h +30 -22
  86. data/ext/modules.h +35 -7
  87. data/ext/multimapper.c +43 -2
  88. data/ext/multimapper.h +32 -23
  89. data/ext/posh.c +0 -0
  90. data/ext/posh.h +4 -38
  91. data/ext/priorityqueue.c +10 -12
  92. data/ext/priorityqueue.h +33 -21
  93. data/ext/q_boolean.c +22 -9
  94. data/ext/q_const_score.c +3 -2
  95. data/ext/q_filtered_query.c +15 -12
  96. data/ext/q_fuzzy.c +147 -135
  97. data/ext/q_match_all.c +3 -2
  98. data/ext/q_multi_term.c +28 -32
  99. data/ext/q_parser.c +451 -173
  100. data/ext/q_phrase.c +158 -79
  101. data/ext/q_prefix.c +16 -18
  102. data/ext/q_range.c +363 -31
  103. data/ext/q_span.c +130 -141
  104. data/ext/q_term.c +21 -21
  105. data/ext/q_wildcard.c +19 -23
  106. data/ext/r_analysis.c +369 -242
  107. data/ext/r_index.c +421 -434
  108. data/ext/r_qparser.c +142 -92
  109. data/ext/r_search.c +790 -407
  110. data/ext/r_store.c +44 -44
  111. data/ext/r_utils.c +264 -96
  112. data/ext/ram_store.c +29 -23
  113. data/ext/scanner.c +895 -0
  114. data/ext/scanner.h +36 -0
  115. data/ext/scanner_mb.c +6701 -0
  116. data/ext/scanner_utf8.c +4415 -0
  117. data/ext/search.c +210 -87
  118. data/ext/search.h +556 -488
  119. data/ext/similarity.c +17 -16
  120. data/ext/similarity.h +51 -44
  121. data/ext/sort.c +157 -354
  122. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  123. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  124. data/ext/stem_UTF_8_hungarian.h +16 -0
  125. data/ext/stem_UTF_8_romanian.h +16 -0
  126. data/ext/stem_UTF_8_turkish.h +16 -0
  127. data/ext/stopwords.c +287 -278
  128. data/ext/store.c +57 -51
  129. data/ext/store.h +308 -286
  130. data/ext/symbol.c +10 -0
  131. data/ext/symbol.h +23 -0
  132. data/ext/term_vectors.c +14 -293
  133. data/ext/threading.h +22 -22
  134. data/ext/win32.h +12 -4
  135. data/lib/ferret.rb +2 -1
  136. data/lib/ferret/browser.rb +1 -1
  137. data/lib/ferret/field_symbol.rb +94 -0
  138. data/lib/ferret/index.rb +221 -34
  139. data/lib/ferret/number_tools.rb +6 -6
  140. data/lib/ferret/version.rb +3 -0
  141. data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
  142. data/test/test_helper.rb +7 -2
  143. data/test/test_installed.rb +1 -0
  144. data/test/threading/thread_safety_index_test.rb +10 -1
  145. data/test/threading/thread_safety_read_write_test.rb +4 -7
  146. data/test/threading/thread_safety_test.rb +0 -0
  147. data/test/unit/analysis/tc_analyzer.rb +29 -27
  148. data/test/unit/analysis/tc_token_stream.rb +23 -16
  149. data/test/unit/index/tc_index.rb +116 -11
  150. data/test/unit/index/tc_index_reader.rb +27 -27
  151. data/test/unit/index/tc_index_writer.rb +10 -0
  152. data/test/unit/index/th_doc.rb +38 -21
  153. data/test/unit/search/tc_filter.rb +31 -10
  154. data/test/unit/search/tc_index_searcher.rb +6 -0
  155. data/test/unit/search/tm_searcher.rb +53 -1
  156. data/test/unit/store/tc_fs_store.rb +40 -2
  157. data/test/unit/store/tc_ram_store.rb +0 -0
  158. data/test/unit/store/tm_store.rb +0 -0
  159. data/test/unit/store/tm_store_lock.rb +7 -6
  160. data/test/unit/tc_field_symbol.rb +26 -0
  161. data/test/unit/ts_analysis.rb +0 -0
  162. data/test/unit/ts_index.rb +0 -0
  163. data/test/unit/ts_store.rb +0 -0
  164. data/test/unit/ts_utils.rb +0 -0
  165. data/test/unit/utils/tc_number_tools.rb +0 -0
  166. data/test/utils/content_generator.rb +226 -0
  167. metadata +262 -221
  168. data/ext/inc/lang.h +0 -48
  169. data/ext/inc/threading.h +0 -31
  170. data/ext/stem_ISO_8859_1_english.c +0 -1156
  171. data/ext/stem_ISO_8859_1_french.c +0 -1276
  172. data/ext/stem_ISO_8859_1_italian.c +0 -1091
  173. data/ext/stem_ISO_8859_1_norwegian.c +0 -296
  174. data/ext/stem_ISO_8859_1_spanish.c +0 -1119
  175. data/ext/stem_ISO_8859_1_swedish.c +0 -307
  176. data/ext/stem_UTF_8_danish.c +0 -344
  177. data/ext/stem_UTF_8_english.c +0 -1176
  178. data/ext/stem_UTF_8_french.c +0 -1296
  179. data/ext/stem_UTF_8_italian.c +0 -1113
  180. data/ext/stem_UTF_8_norwegian.c +0 -302
  181. data/ext/stem_UTF_8_portuguese.c +0 -1055
  182. data/ext/stem_UTF_8_russian.c +0 -709
  183. data/ext/stem_UTF_8_spanish.c +0 -1137
  184. data/ext/stem_UTF_8_swedish.c +0 -313
  185. data/lib/ferret_version.rb +0 -3
@@ -1,5 +1,7 @@
1
+ #include "symbol.h"
1
2
  #include <string.h>
2
3
  #include "search.h"
4
+ #include "internal.h"
3
5
 
4
6
  #define TQ(query) ((TermQuery *)(query))
5
7
  #define TSc(scorer) ((TermScorer *)(scorer))
@@ -103,7 +105,7 @@ static Explanation *tsc_explain(Scorer *self, int doc_num)
103
105
  }
104
106
  return expl_new(sim_tf(self->similarity, (float)tf),
105
107
  "tf(term_freq(%s:%s)=%d)",
106
- TQ(query)->field, TQ(query)->term, tf);
108
+ S(TQ(query)->field), TQ(query)->term, tf);
107
109
  }
108
110
 
109
111
  static void tsc_destroy(Scorer *self)
@@ -144,9 +146,8 @@ static Scorer *tw_scorer(Weight *self, IndexReader *ir)
144
146
  {
145
147
  TermQuery *tq = TQ(self->query);
146
148
  TermDocEnum *tde = ir_term_docs_for(ir, tq->field, tq->term);
147
- if (!tde) {
148
- return NULL;
149
- }
149
+ /* ir_term_docs_for should always return a TermDocEnum */
150
+ assert(NULL != tde);
150
151
 
151
152
  return tsc_new(self, tde, ir_get_norms(ir, tq->field));
152
153
  }
@@ -161,10 +162,9 @@ static Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
161
162
  float field_norm;
162
163
  Explanation *field_norm_expl;
163
164
 
164
- char *query_str = self->query->to_s(self->query, "");
165
+ char *query_str = self->query->to_s(self->query, NULL);
165
166
  TermQuery *tq = TQ(self->query);
166
167
  char *term = tq->term;
167
- char *field = tq->field;
168
168
 
169
169
  Explanation *expl = expl_new(0.0, "weight(%s in %d), product of:",
170
170
  query_str, doc_num);
@@ -172,9 +172,9 @@ static Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
172
172
  /* We need two of these as it's included in both the query explanation
173
173
  * and the field explanation */
174
174
  Explanation *idf_expl1 = expl_new(self->idf, "idf(doc_freq=%d)",
175
- ir_doc_freq(ir, field, term));
175
+ ir_doc_freq(ir, tq->field, term));
176
176
  Explanation *idf_expl2 = expl_new(self->idf, "idf(doc_freq=%d)",
177
- ir_doc_freq(ir, field, term));
177
+ ir_doc_freq(ir, tq->field, term));
178
178
 
179
179
  /* explain query weight */
180
180
  Explanation *query_expl = expl_new(0.0, "query_weight(%s), product of:",
@@ -197,7 +197,7 @@ static Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
197
197
 
198
198
  /* explain field weight */
199
199
  field_expl = expl_new(0.0, "field_weight(%s:%s in %d), product of:",
200
- field, term, doc_num);
200
+ S(tq->field), term, doc_num);
201
201
 
202
202
  scorer = self->scorer(self, ir);
203
203
  tf_expl = scorer->explain(scorer, doc_num);
@@ -205,12 +205,12 @@ static Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
205
205
  expl_add_detail(field_expl, tf_expl);
206
206
  expl_add_detail(field_expl, idf_expl2);
207
207
 
208
- field_norms = ir_get_norms(ir, field);
208
+ field_norms = ir_get_norms(ir, tq->field);
209
209
  field_norm = (field_norms
210
210
  ? sim_decode_norm(self->similarity, field_norms[doc_num])
211
211
  : (float)0.0);
212
212
  field_norm_expl = expl_new(field_norm, "field_norm(field=%s, doc=%d)",
213
- field, doc_num);
213
+ S(tq->field), doc_num);
214
214
 
215
215
  expl_add_detail(field_expl, field_norm_expl);
216
216
 
@@ -259,18 +259,18 @@ static Weight *tw_new(Query *query, Searcher *searcher)
259
259
  static void tq_destroy(Query *self)
260
260
  {
261
261
  free(TQ(self)->term);
262
- free(TQ(self)->field);
263
262
  q_destroy_i(self);
264
263
  }
265
264
 
266
- static char *tq_to_s(Query *self, const char *field)
265
+ static char *tq_to_s(Query *self, Symbol default_field)
267
266
  {
268
- size_t flen = strlen(TQ(self)->field);
267
+ const char *field = S(TQ(self)->field);
268
+ size_t flen = strlen(field);
269
269
  size_t tlen = strlen(TQ(self)->term);
270
270
  char *buffer = ALLOC_N(char, 34 + flen + tlen);
271
271
  char *b = buffer;
272
- if (strcmp(field, TQ(self)->field) != 0) {
273
- memcpy(b, TQ(self)->field, sizeof(char) * flen);
272
+ if (default_field != TQ(self)->field) {
273
+ memcpy(b, field, sizeof(char) * flen);
274
274
  b[flen] = ':';
275
275
  b += flen + 1;
276
276
  }
@@ -291,19 +291,19 @@ static void tq_extract_terms(Query *self, HashSet *terms)
291
291
 
292
292
  static unsigned long tq_hash(Query *self)
293
293
  {
294
- return str_hash(TQ(self)->term) ^ str_hash(TQ(self)->field);
294
+ return str_hash(TQ(self)->term) ^ sym_hash(TQ(self)->field);
295
295
  }
296
296
 
297
297
  static int tq_eq(Query *self, Query *o)
298
298
  {
299
299
  return (strcmp(TQ(self)->term, TQ(o)->term) == 0)
300
- && (strcmp(TQ(self)->field, TQ(o)->field) == 0);
300
+ && (TQ(self)->field == TQ(o)->field);
301
301
  }
302
302
 
303
303
  static MatchVector *tq_get_matchv_i(Query *self, MatchVector *mv,
304
304
  TermVector *tv)
305
305
  {
306
- if (strcmp(tv->field, TQ(self)->field) == 0) {
306
+ if (tv->field == TQ(self)->field) {
307
307
  int i;
308
308
  TVTerm *tv_term = tv_get_tv_term(tv, TQ(self)->term);
309
309
  if (tv_term) {
@@ -316,11 +316,11 @@ static MatchVector *tq_get_matchv_i(Query *self, MatchVector *mv,
316
316
  return mv;
317
317
  }
318
318
 
319
- Query *tq_new(const char *field, const char *term)
319
+ Query *tq_new(Symbol field, const char *term)
320
320
  {
321
321
  Query *self = q_new(TermQuery);
322
322
 
323
- TQ(self)->field = estrdup(field);
323
+ TQ(self)->field = field;
324
324
  TQ(self)->term = estrdup(term);
325
325
 
326
326
  self->type = TERM_QUERY;
@@ -1,5 +1,7 @@
1
1
  #include <string.h>
2
2
  #include "search.h"
3
+ #include "symbol.h"
4
+ #include "internal.h"
3
5
 
4
6
  /****************************************************************************
5
7
  *
@@ -9,21 +11,17 @@
9
11
 
10
12
  #define WCQ(query) ((WildCardQuery *)(query))
11
13
 
12
- static char *wcq_to_s(Query *self, const char *current_field)
14
+ static char *wcq_to_s(Query *self, Symbol default_field)
13
15
  {
14
16
  char *buffer, *bptr;
15
- const char *field = WCQ(self)->field;
17
+ const char *field_str = S(WCQ(self)->field);
16
18
  const char *pattern = WCQ(self)->pattern;
17
- size_t flen = strlen(field);
18
- size_t plen = strlen(pattern);
19
- bptr = buffer = ALLOC_N(char, plen + flen + 35);
19
+ bptr = buffer = ALLOC_N(char, strlen(pattern) + strlen(field_str) + 35);
20
20
 
21
- if (strcmp(field, current_field) != 0) {
22
- sprintf(bptr, "%s:", field);
23
- bptr += flen + 1;
21
+ if (WCQ(self)->field != default_field) {
22
+ bptr += sprintf(bptr, "%s:", field_str);
24
23
  }
25
- sprintf(bptr, "%s", pattern);
26
- bptr += plen;
24
+ bptr += sprintf(bptr, "%s", pattern);
27
25
 
28
26
  if (self->boost != 1.0) {
29
27
  *bptr = '^';
@@ -35,7 +33,7 @@ static char *wcq_to_s(Query *self, const char *current_field)
35
33
 
36
34
  bool wc_match(const char *pattern, const char *text)
37
35
  {
38
- const char *p = pattern, *t = text, *xt;
36
+ const char *p = pattern, *t = text, *xt;
39
37
 
40
38
  /* include '\0' as we need to match empty string */
41
39
  const char *text_last = t + strlen(t);
@@ -83,18 +81,17 @@ bool wc_match(const char *pattern, const char *text)
83
81
  static Query *wcq_rewrite(Query *self, IndexReader *ir)
84
82
  {
85
83
  Query *q;
86
- const char *field = WCQ(self)->field;
87
84
  const char *pattern = WCQ(self)->pattern;
88
85
  const char *first_star = strchr(pattern, WILD_STRING);
89
86
  const char *first_ques = strchr(pattern, WILD_CHAR);
90
87
 
91
88
  if (NULL == first_star && NULL == first_ques) {
92
- q = tq_new(field, pattern);
89
+ q = tq_new(WCQ(self)->field, pattern);
93
90
  q->boost = self->boost;
94
91
  }
95
92
  else {
96
- const int field_num = fis_get_field_num(ir->fis, field);
97
- q = multi_tq_new_conf(field, MTQMaxTerms(self), 0.0);
93
+ const int field_num = fis_get_field_num(ir->fis, WCQ(self)->field);
94
+ q = multi_tq_new_conf(WCQ(self)->field, MTQMaxTerms(self), 0.0);
98
95
 
99
96
  if (field_num >= 0) {
100
97
  TermEnum *te;
@@ -116,8 +113,8 @@ static Query *wcq_rewrite(Query *self, IndexReader *ir)
116
113
  if (te != NULL) {
117
114
  const char *term = te->curr_term;
118
115
  const char *pat_term = term + prefix_len;
119
- do {
120
- if (prefix && strncmp(term, prefix, prefix_len) != 0) {
116
+ do {
117
+ if (prefix[0] && strncmp(term, prefix, prefix_len) != 0) {
121
118
  break;
122
119
  }
123
120
 
@@ -135,27 +132,26 @@ static Query *wcq_rewrite(Query *self, IndexReader *ir)
135
132
 
136
133
  static void wcq_destroy(Query *self)
137
134
  {
138
- free(WCQ(self)->field);
139
135
  free(WCQ(self)->pattern);
140
136
  q_destroy_i(self);
141
137
  }
142
138
 
143
139
  static unsigned long wcq_hash(Query *self)
144
140
  {
145
- return str_hash(WCQ(self)->field) ^ str_hash(WCQ(self)->pattern);
141
+ return sym_hash(WCQ(self)->field) ^ str_hash(WCQ(self)->pattern);
146
142
  }
147
143
 
148
144
  static int wcq_eq(Query *self, Query *o)
149
145
  {
150
- return (strcmp(WCQ(self)->pattern, WCQ(o)->pattern) == 0)
151
- && (strcmp(WCQ(self)->field, WCQ(o)->field) == 0);
146
+ return (strcmp(WCQ(self)->pattern, WCQ(o)->pattern) == 0)
147
+ && (WCQ(self)->field == WCQ(o)->field);
152
148
  }
153
149
 
154
- Query *wcq_new(const char *field, const char *pattern)
150
+ Query *wcq_new(Symbol field, const char *pattern)
155
151
  {
156
152
  Query *self = q_new(WildCardQuery);
157
153
 
158
- WCQ(self)->field = estrdup(field);
154
+ WCQ(self)->field = field;
159
155
  WCQ(self)->pattern = estrdup(pattern);
160
156
  MTQMaxTerms(self) = WILD_CARD_QUERY_MAX_TERMS;
161
157
 
@@ -1,10 +1,19 @@
1
- #include <regex.h>
1
+ #include "lang.h"
2
+ #ifdef FRT_RUBY_VERSION_1_9
3
+ # include <ruby/re.h>
4
+ #else
5
+ # include <regex.h>
6
+ #endif
2
7
  #include <locale.h>
3
- #include <st.h>
8
+ #ifdef FRT_RUBY_VERSION_1_9
9
+ # include <ruby/st.h>
10
+ #else
11
+ # include <st.h>
12
+ #endif
4
13
  #include "ferret.h"
5
14
  #include "analysis.h"
6
15
 
7
- static char *frt_locale = NULL;
16
+ static char *frb_locale = NULL;
8
17
 
9
18
  static VALUE mAnalysis;
10
19
 
@@ -47,13 +56,19 @@ static ID id_token_stream;
47
56
 
48
57
  static VALUE object_space;
49
58
 
59
+ #ifndef FRT_RUBY_VERSION_1_9
50
60
  extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int,
51
61
  int, struct re_registers *);
62
+ #endif
52
63
 
53
64
  int
54
- frt_rb_hash_size(VALUE hash)
65
+ frb_rb_hash_size(VALUE hash)
55
66
  {
67
+ #ifdef FRT_RUBY_VERSION_1_9
68
+ return RHASH(hash)->ntbl->num_entries;
69
+ #else
56
70
  return RHASH(hash)->tbl->num_entries;
71
+ #endif
57
72
  }
58
73
 
59
74
  /****************************************************************************
@@ -69,11 +84,11 @@ get_stopwords(VALUE rstop_words)
69
84
  int i, len;
70
85
  VALUE rstr;
71
86
  Check_Type(rstop_words, T_ARRAY);
72
- len = RARRAY(rstop_words)->len;
73
- stop_words = ALLOC_N(char *, RARRAY(rstop_words)->len + 1);
87
+ len = RARRAY_LEN(rstop_words);
88
+ stop_words = ALLOC_N(char *, RARRAY_LEN(rstop_words) + 1);
74
89
  stop_words[len] = NULL;
75
90
  for (i = 0; i < len; i++) {
76
- rstr = rb_obj_as_string(RARRAY(rstop_words)->ptr[i]);
91
+ rstr = rb_obj_as_string(RARRAY_PTR(rstop_words)[i]);
77
92
  stop_words[i] = rs2s(rstr);
78
93
  }
79
94
  return stop_words;
@@ -93,22 +108,22 @@ typedef struct RToken {
93
108
  } RToken;
94
109
 
95
110
  static void
96
- frt_token_free(void *p)
111
+ frb_token_free(void *p)
97
112
  {
98
113
  free(p);
99
114
  }
100
115
 
101
116
  static void
102
- frt_token_mark(void *p)
117
+ frb_token_mark(void *p)
103
118
  {
104
119
  RToken *token = (RToken *)p;
105
120
  rb_gc_mark(token->text);
106
121
  }
107
122
 
108
123
  static VALUE
109
- frt_token_alloc(VALUE klass)
124
+ frb_token_alloc(VALUE klass)
110
125
  {
111
- return Data_Wrap_Struct(klass, &frt_token_mark, &frt_token_free,
126
+ return Data_Wrap_Struct(klass, &frb_token_mark, &frb_token_free,
112
127
  ALLOC(RToken));
113
128
  }
114
129
 
@@ -121,18 +136,18 @@ get_token(Token *tk)
121
136
  token->start = tk->start;
122
137
  token->end = tk->end;
123
138
  token->pos_inc = tk->pos_inc;
124
- return Data_Wrap_Struct(cToken, &frt_token_mark, &frt_token_free, token);
139
+ return Data_Wrap_Struct(cToken, &frb_token_mark, &frb_token_free, token);
125
140
  }
126
141
 
127
142
  Token *
128
- frt_set_token(Token *tk, VALUE rt)
143
+ frb_set_token(Token *tk, VALUE rt)
129
144
  {
130
145
  RToken *rtk;
131
146
 
132
147
  if (rt == Qnil) return NULL;
133
148
 
134
149
  Data_Get_Struct(rt, RToken, rtk);
135
- tk_set(tk, rs2s(rtk->text), RSTRING(rtk->text)->len,
150
+ tk_set(tk, rs2s(rtk->text), RSTRING_LEN(rtk->text),
136
151
  rtk->start, rtk->end, rtk->pos_inc);
137
152
  return tk;
138
153
  }
@@ -171,7 +186,7 @@ frt_set_token(Token *tk, VALUE rt)
171
186
  * return:: a newly created and assigned Token object
172
187
  */
173
188
  static VALUE
174
- frt_token_init(int argc, VALUE *argv, VALUE self)
189
+ frb_token_init(int argc, VALUE *argv, VALUE self)
175
190
  {
176
191
  RToken *token;
177
192
  VALUE rtext, rstart, rend, rpos_inc, rtype;
@@ -201,7 +216,7 @@ frt_token_init(int argc, VALUE *argv, VALUE self)
201
216
  * lexically by the token text.
202
217
  */
203
218
  static VALUE
204
- frt_token_cmp(VALUE self, VALUE rother)
219
+ frb_token_cmp(VALUE self, VALUE rother)
205
220
  {
206
221
  RToken *token, *other;
207
222
  int cmp;
@@ -230,7 +245,7 @@ frt_token_cmp(VALUE self, VALUE rother)
230
245
  * Returns the text that this token represents
231
246
  */
232
247
  static VALUE
233
- frt_token_get_text(VALUE self)
248
+ frb_token_get_text(VALUE self)
234
249
  {
235
250
  RToken *token;
236
251
  GET_TK(token, self);
@@ -244,7 +259,7 @@ frt_token_get_text(VALUE self)
244
259
  * Set the text for this token.
245
260
  */
246
261
  static VALUE
247
- frt_token_set_text(VALUE self, VALUE rtext)
262
+ frb_token_set_text(VALUE self, VALUE rtext)
248
263
  {
249
264
  RToken *token;
250
265
  GET_TK(token, self);
@@ -259,7 +274,7 @@ frt_token_set_text(VALUE self, VALUE rtext)
259
274
  * Start byte-position of this token
260
275
  */
261
276
  static VALUE
262
- frt_token_get_start_offset(VALUE self)
277
+ frb_token_get_start_offset(VALUE self)
263
278
  {
264
279
  RToken *token;
265
280
  GET_TK(token, self);
@@ -273,7 +288,7 @@ frt_token_get_start_offset(VALUE self)
273
288
  * End byte-position of this token
274
289
  */
275
290
  static VALUE
276
- frt_token_get_end_offset(VALUE self)
291
+ frb_token_get_end_offset(VALUE self)
277
292
  {
278
293
  RToken *token;
279
294
  GET_TK(token, self);
@@ -287,7 +302,7 @@ frt_token_get_end_offset(VALUE self)
287
302
  * Position Increment for this token
288
303
  */
289
304
  static VALUE
290
- frt_token_get_pos_inc(VALUE self)
305
+ frb_token_get_pos_inc(VALUE self)
291
306
  {
292
307
  RToken *token;
293
308
  GET_TK(token, self);
@@ -301,7 +316,7 @@ frt_token_get_pos_inc(VALUE self)
301
316
  * Set start byte-position of this token
302
317
  */
303
318
  static VALUE
304
- frt_token_set_start_offset(VALUE self, VALUE rstart)
319
+ frb_token_set_start_offset(VALUE self, VALUE rstart)
305
320
  {
306
321
  RToken *token;
307
322
  GET_TK(token, self);
@@ -316,7 +331,7 @@ frt_token_set_start_offset(VALUE self, VALUE rstart)
316
331
  * Set end byte-position of this token
317
332
  */
318
333
  static VALUE
319
- frt_token_set_end_offset(VALUE self, VALUE rend)
334
+ frb_token_set_end_offset(VALUE self, VALUE rend)
320
335
  {
321
336
  RToken *token;
322
337
  GET_TK(token, self);
@@ -352,7 +367,7 @@ frt_token_set_end_offset(VALUE self, VALUE rend)
352
367
  *
353
368
  */
354
369
  static VALUE
355
- frt_token_set_pos_inc(VALUE self, VALUE rpos_inc)
370
+ frb_token_set_pos_inc(VALUE self, VALUE rpos_inc)
356
371
  {
357
372
  RToken *token;
358
373
  GET_TK(token, self);
@@ -367,12 +382,12 @@ frt_token_set_pos_inc(VALUE self, VALUE rpos_inc)
367
382
  * Return a string representation of the token
368
383
  */
369
384
  static VALUE
370
- frt_token_to_s(VALUE self)
385
+ frb_token_to_s(VALUE self)
371
386
  {
372
387
  RToken *token;
373
388
  char *buf;
374
389
  GET_TK(token, self);
375
- buf = alloca(RSTRING(token->text)->len + 80);
390
+ buf = alloca(RSTRING_LEN(token->text) + 80);
376
391
  sprintf(buf, "token[\"%s\":%d:%d:%d]", rs2s(token->text),
377
392
  token->start, token->end, token->pos_inc);
378
393
  return rb_str_new2(buf);
@@ -387,14 +402,14 @@ frt_token_to_s(VALUE self)
387
402
  #define GET_TS(ts, self) Data_Get_Struct(self, TokenStream, ts)
388
403
 
389
404
  static void
390
- frt_ts_mark(void *p)
405
+ frb_ts_mark(void *p)
391
406
  {
392
407
  TokenStream *ts = (TokenStream *)p;
393
- if (ts->text) frt_gc_mark(&ts->text);
408
+ if (ts->text) frb_gc_mark(&ts->text);
394
409
  }
395
410
 
396
411
  static void
397
- frt_ts_free(TokenStream *ts)
412
+ frb_ts_free(TokenStream *ts)
398
413
  {
399
414
  if (object_get(&ts->text) != Qnil) {
400
415
  object_del(&ts->text);
@@ -403,8 +418,8 @@ frt_ts_free(TokenStream *ts)
403
418
  ts_deref(ts);
404
419
  }
405
420
 
406
- static void frt_rets_free(TokenStream *ts);
407
- static void frt_rets_mark(TokenStream *ts);
421
+ static void frb_rets_free(TokenStream *ts);
422
+ static void frb_rets_mark(TokenStream *ts);
408
423
  static Token *rets_next(TokenStream *ts);
409
424
 
410
425
  static VALUE
@@ -413,11 +428,11 @@ get_rb_token_stream(TokenStream *ts)
413
428
  VALUE rts = object_get(ts);
414
429
  if (rts == Qnil) {
415
430
  if (ts->next == &rets_next) {
416
- rts = Data_Wrap_Struct(cTokenStream, &frt_rets_mark,
417
- &frt_rets_free, ts);
431
+ rts = Data_Wrap_Struct(cTokenStream, &frb_rets_mark,
432
+ &frb_rets_free, ts);
418
433
  } else {
419
- rts = Data_Wrap_Struct(cTokenStream, &frt_ts_mark,
420
- &frt_ts_free, ts);
434
+ rts = Data_Wrap_Struct(cTokenStream, &frb_ts_mark,
435
+ &frb_ts_free, ts);
421
436
  }
422
437
  object_add(ts, rts);
423
438
  }
@@ -429,7 +444,7 @@ get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
429
444
  {
430
445
  StringValue(rstr);
431
446
  ts->reset(ts, rs2s(rstr));
432
- Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
447
+ Frt_Wrap_Struct(self, &frb_ts_mark, &frb_ts_free, ts);
433
448
  object_add(&ts->text, rstr);
434
449
  object_add(ts, self);
435
450
  return self;
@@ -445,7 +460,7 @@ get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
445
460
  * token_stream.text = File.read(file_name)
446
461
  */
447
462
  static VALUE
448
- frt_ts_set_text(VALUE self, VALUE rtext)
463
+ frb_ts_set_text(VALUE self, VALUE rtext)
449
464
  {
450
465
  TokenStream *ts;
451
466
  Data_Get_Struct(self, TokenStream, ts);
@@ -465,7 +480,7 @@ frt_ts_set_text(VALUE self, VALUE rtext)
465
480
  * Return the text that the TokenStream is tokenizing
466
481
  */
467
482
  static VALUE
468
- frt_ts_get_text(VALUE self)
483
+ frb_ts_get_text(VALUE self)
469
484
  {
470
485
  VALUE rtext = Qnil;
471
486
  TokenStream *ts;
@@ -487,7 +502,7 @@ frt_ts_get_text(VALUE self)
487
502
  * tokens.
488
503
  */
489
504
  static VALUE
490
- frt_ts_next(VALUE self)
505
+ frb_ts_next(VALUE self)
491
506
  {
492
507
  TokenStream *ts;
493
508
  Token *next;
@@ -507,16 +522,16 @@ frt_ts_next(VALUE self)
507
522
  #define TkFilt(filter) ((TokenFilter *)(filter))
508
523
 
509
524
  static void
510
- frt_tf_mark(void *p)
525
+ frb_tf_mark(void *p)
511
526
  {
512
527
  TokenStream *ts = (TokenStream *)p;
513
528
  if (TkFilt(ts)->sub_ts) {
514
- frt_gc_mark(&TkFilt(ts)->sub_ts);
529
+ frb_gc_mark(&TkFilt(ts)->sub_ts);
515
530
  }
516
531
  }
517
532
 
518
533
  static void
519
- frt_tf_free(TokenStream *ts)
534
+ frb_tf_free(TokenStream *ts)
520
535
  {
521
536
  if (TkFilt(ts)->sub_ts && (object_get(&TkFilt(ts)->sub_ts) != Qnil)) {
522
537
  object_del(&TkFilt(ts)->sub_ts);
@@ -545,7 +560,7 @@ cwrts_destroy_i(TokenStream *ts)
545
560
  object_del(&ts->text);
546
561
  }
547
562
  rb_hash_delete(object_space, ((VALUE)ts)|1);
548
- /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
563
+ /*printf("rb_hash_size = %d\n", frb_rb_hash_size(object_space)); */
549
564
  free(ts);
550
565
  }
551
566
 
@@ -553,7 +568,7 @@ static Token *
553
568
  cwrts_next(TokenStream *ts)
554
569
  {
555
570
  VALUE rtoken = rb_funcall(CWTS(ts)->rts, id_next, 0);
556
- return frt_set_token(&(CachedTS(ts)->token), rtoken);
571
+ return frb_set_token(&(CachedTS(ts)->token), rtoken);
557
572
  }
558
573
 
559
574
  static TokenStream *
@@ -574,10 +589,10 @@ cwrts_clone_i(TokenStream *orig_ts)
574
589
  }
575
590
 
576
591
  static TokenStream *
577
- frt_get_cwrapped_rts(VALUE rts)
592
+ frb_get_cwrapped_rts(VALUE rts)
578
593
  {
579
594
  TokenStream *ts;
580
- if (frt_is_cclass(rts) && DATA_PTR(rts)) {
595
+ if (frb_is_cclass(rts) && DATA_PTR(rts)) {
581
596
  GET_TS(ts, rts);
582
597
  REF(ts);
583
598
  }
@@ -621,7 +636,7 @@ typedef struct RegExpTokenStream {
621
636
  VALUE rtext;
622
637
  VALUE regex;
623
638
  VALUE proc;
624
- int curr_ind;
639
+ long curr_ind;
625
640
  } RegExpTokenStream;
626
641
 
627
642
  static void
@@ -631,12 +646,12 @@ rets_destroy_i(TokenStream *ts)
631
646
  object_del(&ts->text);
632
647
  }
633
648
  rb_hash_delete(object_space, ((VALUE)ts)|1);
634
- /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
649
+ /*printf("rb_hash_size = %d\n", frb_rb_hash_size(object_space)); */
635
650
  free(ts);
636
651
  }
637
652
 
638
653
  static void
639
- frt_rets_free(TokenStream *ts)
654
+ frb_rets_free(TokenStream *ts)
640
655
  {
641
656
  if (object_get(&ts->text) != Qnil) {
642
657
  object_del(&ts->text);
@@ -646,9 +661,9 @@ frt_rets_free(TokenStream *ts)
646
661
  }
647
662
 
648
663
  static void
649
- frt_rets_mark(TokenStream *ts)
664
+ frb_rets_mark(TokenStream *ts)
650
665
  {
651
- if (ts->text) frt_gc_mark(&ts->text);
666
+ if (ts->text) frb_gc_mark(&ts->text);
652
667
  rb_gc_mark(RETS(ts)->rtext);
653
668
  rb_gc_mark(RETS(ts)->regex);
654
669
  rb_gc_mark(RETS(ts)->proc);
@@ -662,7 +677,7 @@ frt_rets_mark(TokenStream *ts)
662
677
  * tokenize the text from the beginning.
663
678
  */
664
679
  static VALUE
665
- frt_rets_set_text(VALUE self, VALUE rtext)
680
+ frb_rets_set_text(VALUE self, VALUE rtext)
666
681
  {
667
682
  TokenStream *ts;
668
683
  GET_TS(ts, self);
@@ -682,23 +697,88 @@ frt_rets_set_text(VALUE self, VALUE rtext)
682
697
  * Get the text being tokenized by the tokenizer.
683
698
  */
684
699
  static VALUE
685
- frt_rets_get_text(VALUE self)
700
+ frb_rets_get_text(VALUE self)
686
701
  {
687
702
  TokenStream *ts;
688
703
  GET_TS(ts, self);
689
704
  return RETS(ts)->rtext;
690
705
  }
691
706
 
707
+ #ifdef FRT_RUBY_VERSION_1_9
708
+
709
+ // partly lifted from ruby 1.9 string.c
710
+ #include <ruby/encoding.h>
711
+ #define BEG(no) regs->beg[no]
712
+ #define END(no) regs->end[no]
713
+ #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
714
+ static VALUE
715
+ scan_once(VALUE str, VALUE pat, long *start)
716
+ {
717
+ VALUE match;
718
+ struct re_registers *regs;
719
+
720
+ if (rb_reg_search(pat, str, *start, 0) >= 0) {
721
+ match = rb_backref_get();
722
+ regs = RMATCH_REGS(match);
723
+ if (BEG(0) == END(0)) {
724
+ rb_encoding *enc = STR_ENC_GET(str);
725
+ /*
726
+ * Always consume at least one character of the input string
727
+ */
728
+ if (RSTRING_LEN(str) > END(0))
729
+ *start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),
730
+ RSTRING_END(str), enc);
731
+ else
732
+ *start = END(0)+1;
733
+ }
734
+ else {
735
+ *start = END(0);
736
+ }
737
+ return rb_reg_nth_match(0, match);
738
+ }
739
+ return Qnil;
740
+ }
741
+ //
742
+
743
+ static Token *
744
+ rets_next(TokenStream *ts)
745
+ {
746
+ VALUE ret;
747
+ long rtok_len;
748
+ int beg, end;
749
+ Check_Type(RETS(ts)->regex, T_REGEXP);
750
+ ret = scan_once(RETS(ts)->rtext, RETS(ts)->regex, &(RETS(ts)->curr_ind));
751
+ if (NIL_P(ret)) return NULL;
752
+
753
+ Check_Type(ret, T_STRING);
754
+ rtok_len = RSTRING_LEN(ret);
755
+ beg = RETS(ts)->curr_ind - rtok_len;
756
+ end = RETS(ts)->curr_ind;
757
+
758
+ if (NIL_P(RETS(ts)->proc)) {
759
+ return tk_set(&(CachedTS(ts)->token), rs2s(ret), rtok_len,
760
+ beg, end, 1);
761
+ } else {
762
+ VALUE rtok;
763
+ rtok = rb_funcall(RETS(ts)->proc, id_call, 1, ret);
764
+ return tk_set(&(CachedTS(ts)->token), rs2s(rtok),
765
+ RSTRING_LEN(rtok), beg, end, 1);
766
+ }
767
+ }
768
+
769
+ #else
770
+
692
771
  static Token *
693
772
  rets_next(TokenStream *ts)
694
773
  {
695
774
  static struct re_registers regs;
696
775
  int ret, beg, end;
697
- struct RString *rtext = RSTRING(RETS(ts)->rtext);
776
+ long rtext_len = RSTRING_LEN(RETS(ts)->rtext);
777
+ char *rtext_ptr = RSTRING_PTR(RETS(ts)->rtext);
698
778
  Check_Type(RETS(ts)->regex, T_REGEXP);
699
779
  ret = ruby_re_search(RREGEXP(RETS(ts)->regex)->ptr,
700
- rtext->ptr, rtext->len,
701
- RETS(ts)->curr_ind, rtext->len - RETS(ts)->curr_ind,
780
+ rtext_ptr, rtext_len,
781
+ RETS(ts)->curr_ind, rtext_len - RETS(ts)->curr_ind,
702
782
  &regs);
703
783
 
704
784
  if (ret == -2) rb_raise(rb_eStandardError, "regexp buffer overflow");
@@ -707,16 +787,18 @@ rets_next(TokenStream *ts)
707
787
  beg = regs.beg[0];
708
788
  RETS(ts)->curr_ind = end = regs.end[0];
709
789
  if (NIL_P(RETS(ts)->proc)) {
710
- return tk_set(&(CachedTS(ts)->token), rtext->ptr + beg, end - beg,
790
+ return tk_set(&(CachedTS(ts)->token), rtext_ptr + beg, end - beg,
711
791
  beg, end, 1);
712
792
  } else {
713
- VALUE rtok = rb_str_new(rtext->ptr + beg, end - beg);
793
+ VALUE rtok = rb_str_new(rtext_ptr + beg, end - beg);
714
794
  rtok = rb_funcall(RETS(ts)->proc, id_call, 1, rtok);
715
795
  return tk_set(&(CachedTS(ts)->token), rs2s(rtok),
716
- RSTRING(rtok)->len, beg, end, 1);
796
+ RSTRING_LEN(rtok), beg, end, 1);
717
797
  }
718
798
  }
719
799
 
800
+ #endif
801
+
720
802
  static TokenStream *
721
803
  rets_reset(TokenStream *ts, char *text)
722
804
  {
@@ -770,7 +852,7 @@ rets_new(VALUE rtext, VALUE regex, VALUE proc)
770
852
  * regexp:: regular expression used to recognize tokens in the input
771
853
  */
772
854
  static VALUE
773
- frt_rets_init(int argc, VALUE *argv, VALUE self)
855
+ frb_rets_init(int argc, VALUE *argv, VALUE self)
774
856
  {
775
857
  VALUE rtext, regex, proc;
776
858
  TokenStream *ts;
@@ -779,7 +861,7 @@ frt_rets_init(int argc, VALUE *argv, VALUE self)
779
861
 
780
862
  ts = rets_new(rtext, regex, proc);
781
863
 
782
- Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
864
+ Frt_Wrap_Struct(self, &frb_rets_mark, &frb_rets_free, ts);
783
865
  object_add(ts, self);
784
866
  return self;
785
867
  }
@@ -801,7 +883,7 @@ lower = (argc ? RTEST(rlower) : dflt)
801
883
  * Create a new AsciiLetterTokenizer
802
884
  */
803
885
  static VALUE
804
- frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
886
+ frb_a_letter_tokenizer_init(VALUE self, VALUE rstr)
805
887
  {
806
888
  return get_wrapped_ts(self, rstr, letter_tokenizer_new());
807
889
  }
@@ -816,11 +898,11 @@ frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
816
898
  * lower:: set to false if you don't wish to downcase tokens
817
899
  */
818
900
  static VALUE
819
- frt_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
901
+ frb_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
820
902
  {
821
903
  TS_ARGS(false);
822
904
  #ifndef POSH_OS_WIN32
823
- if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
905
+ if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
824
906
  #endif
825
907
  return get_wrapped_ts(self, rstr, mb_letter_tokenizer_new(lower));
826
908
  }
@@ -832,7 +914,7 @@ frt_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
832
914
  * Create a new AsciiWhiteSpaceTokenizer
833
915
  */
834
916
  static VALUE
835
- frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
917
+ frb_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
836
918
  {
837
919
  return get_wrapped_ts(self, rstr, whitespace_tokenizer_new());
838
920
  }
@@ -847,11 +929,11 @@ frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
847
929
  * lower:: set to false if you don't wish to downcase tokens
848
930
  */
849
931
  static VALUE
850
- frt_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
932
+ frb_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
851
933
  {
852
934
  TS_ARGS(false);
853
935
  #ifndef POSH_OS_WIN32
854
- if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
936
+ if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
855
937
  #endif
856
938
  return get_wrapped_ts(self, rstr, mb_whitespace_tokenizer_new(lower));
857
939
  }
@@ -863,7 +945,7 @@ frt_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
863
945
  * Create a new AsciiStandardTokenizer
864
946
  */
865
947
  static VALUE
866
- frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
948
+ frb_a_standard_tokenizer_init(VALUE self, VALUE rstr)
867
949
  {
868
950
  return get_wrapped_ts(self, rstr, standard_tokenizer_new());
869
951
  }
@@ -878,10 +960,10 @@ frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
878
960
  * lower:: set to false if you don't wish to downcase tokens
879
961
  */
880
962
  static VALUE
881
- frt_standard_tokenizer_init(VALUE self, VALUE rstr)
963
+ frb_standard_tokenizer_init(VALUE self, VALUE rstr)
882
964
  {
883
965
  #ifndef POSH_OS_WIN32
884
- if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
966
+ if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
885
967
  #endif
886
968
  return get_wrapped_ts(self, rstr, mb_standard_tokenizer_new());
887
969
  }
@@ -900,13 +982,13 @@ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
900
982
  * LowerCaseFilter.
901
983
  */
902
984
  static VALUE
903
- frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
985
+ frb_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
904
986
  {
905
- TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
987
+ TokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
906
988
  ts = lowercase_filter_new(ts);
907
989
  object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
908
990
 
909
- Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
991
+ Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
910
992
  object_add(ts, self);
911
993
  return self;
912
994
  }
@@ -919,16 +1001,16 @@ frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
919
1001
  * lowercase based on the current locale.
920
1002
  */
921
1003
  static VALUE
922
- frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
1004
+ frb_lowercase_filter_init(VALUE self, VALUE rsub_ts)
923
1005
  {
924
- TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
1006
+ TokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
925
1007
  #ifndef POSH_OS_WIN32
926
- if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1008
+ if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
927
1009
  #endif
928
1010
  ts = mb_lowercase_filter_new(ts);
929
1011
  object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
930
1012
 
931
- Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1013
+ Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
932
1014
  object_add(ts, self);
933
1015
  return self;
934
1016
  }
@@ -944,13 +1026,13 @@ frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
944
1026
  * used by default by the StandardAnalyzer.
945
1027
  */
946
1028
  static VALUE
947
- frt_hyphen_filter_init(VALUE self, VALUE rsub_ts)
1029
+ frb_hyphen_filter_init(VALUE self, VALUE rsub_ts)
948
1030
  {
949
- TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
1031
+ TokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
950
1032
  ts = hyphen_filter_new(ts);
951
1033
  object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
952
1034
 
953
- Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1035
+ Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
954
1036
  object_add(ts, self);
955
1037
  return self;
956
1038
  }
@@ -969,12 +1051,12 @@ frt_hyphen_filter_init(VALUE self, VALUE rsub_ts)
969
1051
  * Ferret::Analysis contains a number of stop-word lists.
970
1052
  */
971
1053
  static VALUE
972
- frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
1054
+ frb_stop_filter_init(int argc, VALUE *argv, VALUE self)
973
1055
  {
974
1056
  VALUE rsub_ts, rstop_words;
975
1057
  TokenStream *ts;
976
1058
  rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
977
- ts = frt_get_cwrapped_rts(rsub_ts);
1059
+ ts = frb_get_cwrapped_rts(rsub_ts);
978
1060
  if (rstop_words != Qnil) {
979
1061
  char **stop_words = get_stopwords(rstop_words);
980
1062
  ts = stop_filter_new_with_words(ts, (const char **)stop_words);
@@ -985,12 +1067,13 @@ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
985
1067
  }
986
1068
  object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
987
1069
 
988
- Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1070
+ Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
989
1071
  object_add(ts, self);
990
1072
  return self;
991
1073
  }
992
1074
 
993
- static INLINE void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
1075
+ static INLINE void frb_add_mapping_i(TokenStream *mf, VALUE from,
1076
+ const char *to)
994
1077
  {
995
1078
  switch (TYPE(from)) {
996
1079
  case T_STRING:
@@ -1007,13 +1090,13 @@ static INLINE void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
1007
1090
  }
1008
1091
  }
1009
1092
 
1010
- static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
1093
+ static int frb_add_mappings_i(VALUE key, VALUE value, VALUE arg)
1011
1094
  {
1012
1095
  if (key == Qundef) {
1013
1096
  return ST_CONTINUE;
1014
1097
  } else {
1015
1098
  TokenStream *mf = (TokenStream *)arg;
1016
- char *to;
1099
+ const char *to;
1017
1100
  switch (TYPE(value)) {
1018
1101
  case T_STRING:
1019
1102
  to = rs2s(value);
@@ -1029,12 +1112,12 @@ static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
1029
1112
  }
1030
1113
  if (TYPE(key) == T_ARRAY) {
1031
1114
  int i;
1032
- for (i = RARRAY(key)->len - 1; i >= 0; i--) {
1033
- frt_add_mapping_i(mf, RARRAY(key)->ptr[i], to);
1115
+ for (i = RARRAY_LEN(key) - 1; i >= 0; i--) {
1116
+ frb_add_mapping_i(mf, RARRAY_PTR(key)[i], to);
1034
1117
  }
1035
1118
  }
1036
1119
  else {
1037
- frt_add_mapping_i(mf, key, to);
1120
+ frb_add_mapping_i(mf, key, to);
1038
1121
  }
1039
1122
  }
1040
1123
  return ST_CONTINUE;
@@ -1066,16 +1149,16 @@ static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
1066
1149
  * })
1067
1150
  */
1068
1151
  static VALUE
1069
- frt_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
1152
+ frb_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
1070
1153
  {
1071
1154
  TokenStream *ts;
1072
- ts = frt_get_cwrapped_rts(rsub_ts);
1155
+ ts = frb_get_cwrapped_rts(rsub_ts);
1073
1156
  ts = mapping_filter_new(ts);
1074
- rb_hash_foreach(mapping, frt_add_mappings_i, (VALUE)ts);
1157
+ rb_hash_foreach(mapping, frb_add_mappings_i, (VALUE)ts);
1075
1158
  mulmap_compile(((MappingFilter *)ts)->mapper);
1076
1159
  object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1077
1160
 
1078
- Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1161
+ Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
1079
1162
  object_add(ts, self);
1080
1163
  return self;
1081
1164
  }
@@ -1096,14 +1179,14 @@ frt_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
1096
1179
  * encoding:: The encoding of the data (default: "UTF-8")
1097
1180
  */
1098
1181
  static VALUE
1099
- frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
1182
+ frb_stem_filter_init(int argc, VALUE *argv, VALUE self)
1100
1183
  {
1101
1184
  VALUE rsub_ts, ralgorithm, rcharenc;
1102
1185
  char *algorithm = "english";
1103
1186
  char *charenc = NULL;
1104
1187
  TokenStream *ts;
1105
1188
  rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
1106
- ts = frt_get_cwrapped_rts(rsub_ts);
1189
+ ts = frb_get_cwrapped_rts(rsub_ts);
1107
1190
  switch (argc) {
1108
1191
  case 3: charenc = rs2s(rb_obj_as_string(rcharenc));
1109
1192
  case 2: algorithm = rs2s(rb_obj_as_string(ralgorithm));
@@ -1111,8 +1194,12 @@ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
1111
1194
  ts = stem_filter_new(ts, algorithm, charenc);
1112
1195
  object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
1113
1196
 
1114
- Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
1197
+ Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
1115
1198
  object_add(ts, self);
1199
+ if (((StemFilter *)ts)->stemmer == NULL) {
1200
+ rb_raise(rb_eArgError, "No stemmer could be found with the encoding "
1201
+ "%s and the language %s", charenc, algorithm);
1202
+ }
1116
1203
  return self;
1117
1204
  }
1118
1205
 
@@ -1139,28 +1226,28 @@ static void
1139
1226
  cwa_destroy_i(Analyzer *a)
1140
1227
  {
1141
1228
  rb_hash_delete(object_space, ((VALUE)a)|1);
1142
- /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
1229
+ /*printf("rb_hash_size = %d\n", frb_rb_hash_size(object_space)); */
1143
1230
  free(a);
1144
1231
  }
1145
1232
 
1146
1233
  static TokenStream *
1147
- cwa_get_ts(Analyzer *a, char *field, char *text)
1234
+ cwa_get_ts(Analyzer *a, Symbol field, char *text)
1148
1235
  {
1149
1236
  VALUE rts = rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
1150
- ID2SYM(rb_intern(field)), rb_str_new2(text));
1151
- return frt_get_cwrapped_rts(rts);
1237
+ FSYM2SYM(field), rb_str_new2(text));
1238
+ return frb_get_cwrapped_rts(rts);
1152
1239
  }
1153
1240
 
1154
1241
  Analyzer *
1155
- frt_get_cwrapped_analyzer(VALUE ranalyzer)
1242
+ frb_get_cwrapped_analyzer(VALUE ranalyzer)
1156
1243
  {
1157
1244
  Analyzer *a = NULL;
1158
- if (frt_is_cclass(ranalyzer) && DATA_PTR(ranalyzer)) {
1245
+ if (frb_is_cclass(ranalyzer) && DATA_PTR(ranalyzer)) {
1159
1246
  Data_Get_Struct(ranalyzer, Analyzer, a);
1160
1247
  REF(a);
1161
1248
  }
1162
1249
  else {
1163
- a = (Analyzer *)ecalloc(sizeof(CWrappedAnalyzer));
1250
+ a = (Analyzer *)frt_ecalloc(sizeof(CWrappedAnalyzer));
1164
1251
  a->destroy_i = &cwa_destroy_i;
1165
1252
  a->get_ts = &cwa_get_ts;
1166
1253
  a->ref_cnt = 1;
@@ -1172,20 +1259,20 @@ frt_get_cwrapped_analyzer(VALUE ranalyzer)
1172
1259
  }
1173
1260
 
1174
1261
  static void
1175
- frt_analyzer_free(Analyzer *a)
1262
+ frb_analyzer_free(Analyzer *a)
1176
1263
  {
1177
1264
  object_del(a);
1178
1265
  a_deref(a);
1179
1266
  }
1180
1267
 
1181
1268
  VALUE
1182
- frt_get_analyzer(Analyzer *a)
1269
+ frb_get_analyzer(Analyzer *a)
1183
1270
  {
1184
1271
  VALUE self = Qnil;
1185
1272
  if (a) {
1186
1273
  self = object_get(a);
1187
1274
  if (self == Qnil) {
1188
- self = Data_Wrap_Struct(cAnalyzer, NULL, &frt_analyzer_free, a);
1275
+ self = Data_Wrap_Struct(cAnalyzer, NULL, &frb_analyzer_free, a);
1189
1276
  REF(a);
1190
1277
  object_add(a, self);
1191
1278
  }
@@ -1196,7 +1283,7 @@ frt_get_analyzer(Analyzer *a)
1196
1283
  INLINE VALUE
1197
1284
  get_rb_ts_from_a(Analyzer *a, VALUE rfield, VALUE rstring)
1198
1285
  {
1199
- TokenStream *ts = a_get_ts(a, frt_field(rfield), rs2s(rstring));
1286
+ TokenStream *ts = a_get_ts(a, frb_field(rfield), rs2s(rstring));
1200
1287
 
1201
1288
  /* Make sure that there is no entry already */
1202
1289
  object_set(&ts->text, rstring);
@@ -1215,10 +1302,10 @@ get_rb_ts_from_a(Analyzer *a, VALUE rfield, VALUE rstring)
1215
1302
  * input:: data from the field to be tokenized
1216
1303
  */
1217
1304
  static VALUE
1218
- frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1305
+ frb_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1219
1306
  {
1220
1307
  /* NOTE: Any changes made to this method may also need to be applied to
1221
- * frt_re_analyzer_token_stream */
1308
+ * frb_re_analyzer_token_stream */
1222
1309
  Analyzer *a;
1223
1310
  GET_A(a, self);
1224
1311
 
@@ -1244,12 +1331,12 @@ lower = (argc ? RTEST(rlower) : dflt)
1244
1331
  * lower:: set to false if you don't want the field's tokens to be downcased
1245
1332
  */
1246
1333
  static VALUE
1247
- frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1334
+ frb_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1248
1335
  {
1249
1336
  Analyzer *a;
1250
1337
  GET_LOWER(false);
1251
1338
  a = whitespace_analyzer_new(lower);
1252
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1339
+ Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
1253
1340
  object_add(a, self);
1254
1341
  return self;
1255
1342
  }
@@ -1265,15 +1352,15 @@ frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1265
1352
  * lower:: set to false if you don't want the field's tokens to be downcased
1266
1353
  */
1267
1354
  static VALUE
1268
- frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1355
+ frb_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1269
1356
  {
1270
1357
  Analyzer *a;
1271
1358
  GET_LOWER(false);
1272
1359
  #ifndef POSH_OS_WIN32
1273
- if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1360
+ if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
1274
1361
  #endif
1275
1362
  a = mb_whitespace_analyzer_new(lower);
1276
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1363
+ Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
1277
1364
  object_add(a, self);
1278
1365
  return self;
1279
1366
  }
@@ -1289,12 +1376,12 @@ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
1289
1376
  * lower:: set to false if you don't want the field's tokens to be downcased
1290
1377
  */
1291
1378
  static VALUE
1292
- frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
1379
+ frb_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
1293
1380
  {
1294
1381
  Analyzer *a;
1295
1382
  GET_LOWER(true);
1296
1383
  a = letter_analyzer_new(lower);
1297
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1384
+ Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
1298
1385
  object_add(a, self);
1299
1386
  return self;
1300
1387
  }
@@ -1310,15 +1397,15 @@ frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
1310
1397
  * lower:: set to false if you don't want the field's tokens to be downcased
1311
1398
  */
1312
1399
  static VALUE
1313
- frt_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
1400
+ frb_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
1314
1401
  {
1315
1402
  Analyzer *a;
1316
1403
  GET_LOWER(true);
1317
1404
  #ifndef POSH_OS_WIN32
1318
- if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1405
+ if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
1319
1406
  #endif
1320
1407
  a = mb_letter_analyzer_new(lower);
1321
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1408
+ Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
1322
1409
  object_add(a, self);
1323
1410
  return self;
1324
1411
  }
@@ -1350,7 +1437,7 @@ get_rstopwords(const char **stop_words)
1350
1437
  * stop_words:: list of stop-words to pass to the StopFilter
1351
1438
  */
1352
1439
  static VALUE
1353
- frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1440
+ frb_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1354
1441
  {
1355
1442
  bool lower;
1356
1443
  VALUE rlower, rstop_words;
@@ -1364,7 +1451,7 @@ frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1364
1451
  } else {
1365
1452
  a = standard_analyzer_new(lower);
1366
1453
  }
1367
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1454
+ Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
1368
1455
  object_add(a, self);
1369
1456
  return self;
1370
1457
  }
@@ -1383,13 +1470,13 @@ frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1383
1470
  * stop_words:: list of stop-words to pass to the StopFilter
1384
1471
  */
1385
1472
  static VALUE
1386
- frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1473
+ frb_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1387
1474
  {
1388
1475
  bool lower;
1389
1476
  VALUE rlower, rstop_words;
1390
1477
  Analyzer *a;
1391
1478
  #ifndef POSH_OS_WIN32
1392
- if (!frt_locale) frt_locale = setlocale(LC_CTYPE, "");
1479
+ if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
1393
1480
  #endif
1394
1481
  rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
1395
1482
  lower = ((rlower == Qnil) ? true : RTEST(rlower));
@@ -1400,22 +1487,22 @@ frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
1400
1487
  } else {
1401
1488
  a = mb_standard_analyzer_new(lower);
1402
1489
  }
1403
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1490
+ Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
1404
1491
  object_add(a, self);
1405
1492
  return self;
1406
1493
  }
1407
1494
 
1408
1495
  static void
1409
- frt_h_mark_values_i(void *key, void *value, void *arg)
1496
+ frb_h_mark_values_i(void *key, void *value, void *arg)
1410
1497
  {
1411
- frt_gc_mark(value);
1498
+ frb_gc_mark(value);
1412
1499
  }
1413
1500
 
1414
1501
  static void
1415
- frt_pfa_mark(void *p)
1502
+ frb_pfa_mark(void *p)
1416
1503
  {
1417
- frt_gc_mark(PFA(p)->default_a);
1418
- h_each(PFA(p)->dict, &frt_h_mark_values_i, NULL);
1504
+ frb_gc_mark(PFA(p)->default_a);
1505
+ h_each(PFA(p)->dict, &frb_h_mark_values_i, NULL);
1419
1506
  }
1420
1507
 
1421
1508
  /*** PerFieldAnalyzer ***/
@@ -1431,11 +1518,11 @@ frt_pfa_mark(void *p)
1431
1518
  * specified
1432
1519
  */
1433
1520
  static VALUE
1434
- frt_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
1521
+ frb_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
1435
1522
  {
1436
- Analyzer *def = frt_get_cwrapped_analyzer(ranalyzer);
1523
+ Analyzer *def = frb_get_cwrapped_analyzer(ranalyzer);
1437
1524
  Analyzer *a = per_field_analyzer_new(def);
1438
- Frt_Wrap_Struct(self, &frt_pfa_mark, &frt_analyzer_free, a);
1525
+ Frt_Wrap_Struct(self, &frb_pfa_mark, &frb_analyzer_free, a);
1439
1526
  object_add(a, self);
1440
1527
  return self;
1441
1528
  }
@@ -1452,13 +1539,13 @@ frt_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
1452
1539
  * analyzer:: analyzer to be used on +field_name+
1453
1540
  */
1454
1541
  static VALUE
1455
- frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
1542
+ frb_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
1456
1543
  {
1457
1544
  Analyzer *pfa, *a;
1458
1545
  Data_Get_Struct(self, Analyzer, pfa);
1459
- a = frt_get_cwrapped_analyzer(ranalyzer);
1546
+ a = frb_get_cwrapped_analyzer(ranalyzer);
1460
1547
 
1461
- pfa_add_field(pfa, frt_field(rfield), a);
1548
+ pfa_add_field(pfa, frb_field(rfield), a);
1462
1549
  return self;
1463
1550
  }
1464
1551
 
@@ -1473,10 +1560,10 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
1473
1560
  * input:: data from the field to be tokenized
1474
1561
  */
1475
1562
  static VALUE
1476
- frt_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1563
+ frb_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1477
1564
  {
1478
1565
  Analyzer *pfa, *a;
1479
- char *field = frt_field(rfield);
1566
+ Symbol field = frb_field(rfield);
1480
1567
  GET_A(pfa, self);
1481
1568
 
1482
1569
  StringValue(rstring);
@@ -1486,7 +1573,7 @@ frt_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1486
1573
  }
1487
1574
  if (a->get_ts == cwa_get_ts) {
1488
1575
  return rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
1489
- ID2SYM(rb_intern(field)), rb_str_new2(rs2s(rstring)));
1576
+ FSYM2SYM(field), rb_str_new2(rs2s(rstring)));
1490
1577
  }
1491
1578
  else {
1492
1579
  return get_rb_ts_from_a(a, rfield, rstring);
@@ -1496,9 +1583,9 @@ frt_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
1496
1583
  /*** RegExpAnalyzer ***/
1497
1584
 
1498
1585
  static void
1499
- frt_re_analyzer_mark(Analyzer *a)
1586
+ frb_re_analyzer_mark(Analyzer *a)
1500
1587
  {
1501
- frt_gc_mark(a->current_ts);
1588
+ frb_gc_mark(a->current_ts);
1502
1589
  }
1503
1590
 
1504
1591
  static void
@@ -1519,7 +1606,7 @@ re_analyzer_destroy_i(Analyzer *a)
1519
1606
  * lower:: set to false if you don't want to downcase the tokens
1520
1607
  */
1521
1608
  static VALUE
1522
- frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
1609
+ frb_re_analyzer_init(int argc, VALUE *argv, VALUE self)
1523
1610
  {
1524
1611
  VALUE lower, rets, regex, proc;
1525
1612
  Analyzer *a;
@@ -1527,17 +1614,17 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
1527
1614
  rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
1528
1615
 
1529
1616
  ts = rets_new(Qnil, regex, proc);
1530
- rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
1617
+ rets = Data_Wrap_Struct(cRegExpTokenizer, &frb_rets_mark, &frb_rets_free, ts);
1531
1618
  object_add(ts, rets);
1532
1619
 
1533
1620
  if (lower != Qfalse) {
1534
- rets = frt_lowercase_filter_init(frt_data_alloc(cLowerCaseFilter), rets);
1621
+ rets = frb_lowercase_filter_init(frb_data_alloc(cLowerCaseFilter), rets);
1535
1622
  ts = DATA_PTR(rets);
1536
1623
  }
1537
1624
  REF(ts);
1538
1625
 
1539
1626
  a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
1540
- Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
1627
+ Frt_Wrap_Struct(self, &frb_re_analyzer_mark, &frb_analyzer_free, a);
1541
1628
  object_add(a, self);
1542
1629
  return self;
1543
1630
  }
@@ -1554,7 +1641,7 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
1554
1641
  * input:: data from the field to be tokenized
1555
1642
  */
1556
1643
  static VALUE
1557
- frt_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
1644
+ frb_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
1558
1645
  {
1559
1646
  TokenStream *ts;
1560
1647
  Analyzer *a;
@@ -1562,7 +1649,7 @@ frt_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
1562
1649
 
1563
1650
  StringValue(rtext);
1564
1651
 
1565
- ts = a_get_ts(a, frt_field(rfield), rs2s(rtext));
1652
+ ts = a_get_ts(a, frb_field(rfield), rs2s(rtext));
1566
1653
 
1567
1654
  /* Make sure that there is no entry already */
1568
1655
  object_set(&ts->text, rtext);
@@ -1591,9 +1678,9 @@ frt_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
1591
1678
  *
1592
1679
  * puts Ferret.locale #=> "en_US.UTF-8"
1593
1680
  */
1594
- static VALUE frt_get_locale(VALUE self, VALUE locale)
1681
+ static VALUE frb_get_locale(VALUE self, VALUE locale)
1595
1682
  {
1596
- return (frt_locale ? rb_str_new2(frt_locale) : Qnil);
1683
+ return (frb_locale ? rb_str_new2(frb_locale) : Qnil);
1597
1684
  }
1598
1685
 
1599
1686
  /*
@@ -1603,11 +1690,11 @@ static VALUE frt_get_locale(VALUE self, VALUE locale)
1603
1690
  * Set the global locale. You should use this method to set different locales
1604
1691
  * when indexing documents with different encodings.
1605
1692
  */
1606
- static VALUE frt_set_locale(VALUE self, VALUE locale)
1693
+ static VALUE frb_set_locale(VALUE self, VALUE locale)
1607
1694
  {
1608
1695
  char *l = ((locale == Qnil) ? NULL : rs2s(rb_obj_as_string(locale)));
1609
- frt_locale = setlocale(LC_CTYPE, l);
1610
- return frt_locale ? rb_str_new2(frt_locale) : Qnil;
1696
+ frb_locale = setlocale(LC_CTYPE, l);
1697
+ return frb_locale ? rb_str_new2(frb_locale) : Qnil;
1611
1698
  }
1612
1699
 
1613
1700
  /****************************************************************************
@@ -1645,25 +1732,27 @@ static VALUE frt_set_locale(VALUE self, VALUE locale)
1645
1732
  static void Init_Token(void)
1646
1733
  {
1647
1734
  cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
1648
- rb_define_alloc_func(cToken, frt_token_alloc);
1735
+ rb_define_alloc_func(cToken, frb_token_alloc);
1649
1736
  rb_include_module(cToken, rb_mComparable);
1650
1737
 
1651
- rb_define_method(cToken, "initialize", frt_token_init, -1);
1652
- rb_define_method(cToken, "<=>", frt_token_cmp, 1);
1653
- rb_define_method(cToken, "text", frt_token_get_text, 0);
1654
- rb_define_method(cToken, "text=", frt_token_set_text, 1);
1655
- rb_define_method(cToken, "start", frt_token_get_start_offset, 0);
1656
- rb_define_method(cToken, "start=", frt_token_set_start_offset, 1);
1657
- rb_define_method(cToken, "end", frt_token_get_end_offset, 0);
1658
- rb_define_method(cToken, "end=", frt_token_set_end_offset, 1);
1659
- rb_define_method(cToken, "pos_inc", frt_token_get_pos_inc, 0);
1660
- rb_define_method(cToken, "pos_inc=", frt_token_set_pos_inc, 1);
1661
- rb_define_method(cToken, "to_s", frt_token_to_s, 0);
1738
+ rb_define_method(cToken, "initialize", frb_token_init, -1);
1739
+ rb_define_method(cToken, "<=>", frb_token_cmp, 1);
1740
+ rb_define_method(cToken, "text", frb_token_get_text, 0);
1741
+ rb_define_method(cToken, "text=", frb_token_set_text, 1);
1742
+ rb_define_method(cToken, "start", frb_token_get_start_offset, 0);
1743
+ rb_define_method(cToken, "start=", frb_token_set_start_offset, 1);
1744
+ rb_define_method(cToken, "end", frb_token_get_end_offset, 0);
1745
+ rb_define_method(cToken, "end=", frb_token_set_end_offset, 1);
1746
+ rb_define_method(cToken, "pos_inc", frb_token_get_pos_inc, 0);
1747
+ rb_define_method(cToken, "pos_inc=", frb_token_set_pos_inc, 1);
1748
+ rb_define_method(cToken, "to_s", frb_token_to_s, 0);
1662
1749
  }
1663
1750
 
1664
1751
  /*
1665
1752
  * Document-class: Ferret::Analysis::TokenStream
1666
1753
  *
1754
+ * == Summary
1755
+ *
1667
1756
  * A TokenStream enumerates the sequence of tokens, either from
1668
1757
  * fields of a document or from query text.
1669
1758
  *
@@ -1675,15 +1764,17 @@ static void Init_Token(void)
1675
1764
  static void Init_TokenStream(void)
1676
1765
  {
1677
1766
  cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
1678
- frt_mark_cclass(cTokenStream);
1679
- rb_define_method(cTokenStream, "next", frt_ts_next, 0);
1680
- rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
1681
- rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
1767
+ frb_mark_cclass(cTokenStream);
1768
+ rb_define_method(cTokenStream, "next", frb_ts_next, 0);
1769
+ rb_define_method(cTokenStream, "text=", frb_ts_set_text, 1);
1770
+ rb_define_method(cTokenStream, "text", frb_ts_get_text, 0);
1682
1771
  }
1683
1772
 
1684
1773
  /*
1685
1774
  * Document-class: Ferret::Analysis::AsciiLetterTokenizer
1686
1775
  *
1776
+ * == Summary
1777
+ *
1687
1778
  * A LetterTokenizer is a tokenizer that divides text at non-ASCII letters.
1688
1779
  * That is to say, it defines tokens as maximal strings of adjacent letters,
1689
1780
  * as defined by the regular expression _/[A-Za-z]+/_.
@@ -1697,15 +1788,17 @@ static void Init_AsciiLetterTokenizer(void)
1697
1788
  {
1698
1789
  cAsciiLetterTokenizer =
1699
1790
  rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
1700
- frt_mark_cclass(cAsciiLetterTokenizer);
1701
- rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
1791
+ frb_mark_cclass(cAsciiLetterTokenizer);
1792
+ rb_define_alloc_func(cAsciiLetterTokenizer, frb_data_alloc);
1702
1793
  rb_define_method(cAsciiLetterTokenizer, "initialize",
1703
- frt_a_letter_tokenizer_init, 1);
1794
+ frb_a_letter_tokenizer_init, 1);
1704
1795
  }
1705
1796
 
1706
1797
  /*
1707
1798
  * Document-class: Ferret::Analysis::LetterTokenizer
1708
1799
  *
1800
+ * == Summary
1801
+ *
1709
1802
  * A LetterTokenizer is a tokenizer that divides text at non-letters. That is
1710
1803
  * to say, it defines tokens as maximal strings of adjacent letters, as
1711
1804
  * defined by the regular expression _/[[:alpha:]]+/_ where [:alpha] matches
@@ -1720,15 +1813,17 @@ static void Init_LetterTokenizer(void)
1720
1813
  {
1721
1814
  cLetterTokenizer =
1722
1815
  rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
1723
- frt_mark_cclass(cLetterTokenizer);
1724
- rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
1816
+ frb_mark_cclass(cLetterTokenizer);
1817
+ rb_define_alloc_func(cLetterTokenizer, frb_data_alloc);
1725
1818
  rb_define_method(cLetterTokenizer, "initialize",
1726
- frt_letter_tokenizer_init, -1);
1819
+ frb_letter_tokenizer_init, -1);
1727
1820
  }
1728
1821
 
1729
1822
  /*
1730
1823
  * Document-class: Ferret::Analysis::AsciiWhiteSpaceTokenizer
1731
1824
  *
1825
+ * == Summary
1826
+ *
1732
1827
  * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
1733
1828
  * Adjacent sequences of non-WhiteSpace characters form tokens.
1734
1829
  *
@@ -1742,15 +1837,17 @@ static void Init_AsciiWhiteSpaceTokenizer(void)
1742
1837
  cAsciiWhiteSpaceTokenizer =
1743
1838
  rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
1744
1839
  cTokenStream);
1745
- frt_mark_cclass(cAsciiWhiteSpaceTokenizer);
1746
- rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
1840
+ frb_mark_cclass(cAsciiWhiteSpaceTokenizer);
1841
+ rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frb_data_alloc);
1747
1842
  rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
1748
- frt_a_whitespace_tokenizer_init, 1);
1843
+ frb_a_whitespace_tokenizer_init, 1);
1749
1844
  }
1750
1845
 
1751
1846
  /*
1752
1847
  * Document-class: Ferret::Analysis::WhiteSpaceTokenizer
1753
1848
  *
1849
+ * == Summary
1850
+ *
1754
1851
  * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
1755
1852
  * Adjacent sequences of non-WhiteSpace characters form tokens.
1756
1853
  *
@@ -1763,15 +1860,17 @@ static void Init_WhiteSpaceTokenizer(void)
1763
1860
  {
1764
1861
  cWhiteSpaceTokenizer =
1765
1862
  rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
1766
- frt_mark_cclass(cWhiteSpaceTokenizer);
1767
- rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
1863
+ frb_mark_cclass(cWhiteSpaceTokenizer);
1864
+ rb_define_alloc_func(cWhiteSpaceTokenizer, frb_data_alloc);
1768
1865
  rb_define_method(cWhiteSpaceTokenizer, "initialize",
1769
- frt_whitespace_tokenizer_init, -1);
1866
+ frb_whitespace_tokenizer_init, -1);
1770
1867
  }
1771
1868
 
1772
1869
  /*
1773
1870
  * Document-class: Ferret::Analysis::AsciiStandardTokenizer
1774
1871
  *
1872
+ * == Summary
1873
+ *
1775
1874
  * The standard tokenizer is an advanced tokenizer which tokenizes most
1776
1875
  * words correctly as well as tokenizing things like email addresses, web
1777
1876
  * addresses, phone numbers, etc.
@@ -1785,15 +1884,17 @@ static void Init_AsciiStandardTokenizer(void)
1785
1884
  {
1786
1885
  cAsciiStandardTokenizer =
1787
1886
  rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
1788
- frt_mark_cclass(cAsciiStandardTokenizer);
1789
- rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
1887
+ frb_mark_cclass(cAsciiStandardTokenizer);
1888
+ rb_define_alloc_func(cAsciiStandardTokenizer, frb_data_alloc);
1790
1889
  rb_define_method(cAsciiStandardTokenizer, "initialize",
1791
- frt_a_standard_tokenizer_init, 1);
1890
+ frb_a_standard_tokenizer_init, 1);
1792
1891
  }
1793
1892
 
1794
1893
  /*
1795
1894
  * Document-class: Ferret::Analysis::StandardTokenizer
1796
1895
  *
1896
+ * == Summary
1897
+ *
1797
1898
  * The standard tokenizer is an advanced tokenizer which tokenizes most
1798
1899
  * words correctly as well as tokenizing things like email addresses, web
1799
1900
  * addresses, phone numbers, etc.
@@ -1807,15 +1908,17 @@ static void Init_StandardTokenizer(void)
1807
1908
  {
1808
1909
  cStandardTokenizer =
1809
1910
  rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
1810
- frt_mark_cclass(cStandardTokenizer);
1811
- rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
1911
+ frb_mark_cclass(cStandardTokenizer);
1912
+ rb_define_alloc_func(cStandardTokenizer, frb_data_alloc);
1812
1913
  rb_define_method(cStandardTokenizer, "initialize",
1813
- frt_standard_tokenizer_init, 1);
1914
+ frb_standard_tokenizer_init, 1);
1814
1915
  }
1815
1916
 
1816
1917
  /*
1817
1918
  * Document-class: Ferret::Analysis::RegExpTokenizer
1818
1919
  *
1920
+ * == Summary
1921
+ *
1819
1922
  * A tokenizer that recognizes tokens based on a regular expression passed to
1820
1923
  * the constructor. Most possible tokenizers can be created using this class.
1821
1924
  *
@@ -1835,14 +1938,14 @@ static void Init_RegExpTokenizer(void)
1835
1938
  {
1836
1939
  cRegExpTokenizer =
1837
1940
  rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
1838
- frt_mark_cclass(cRegExpTokenizer);
1941
+ frb_mark_cclass(cRegExpTokenizer);
1839
1942
  rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
1840
1943
  rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
1841
- rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
1944
+ rb_define_alloc_func(cRegExpTokenizer, frb_data_alloc);
1842
1945
  rb_define_method(cRegExpTokenizer, "initialize",
1843
- frt_rets_init, -1);
1844
- rb_define_method(cRegExpTokenizer, "text=", frt_rets_set_text, 1);
1845
- rb_define_method(cRegExpTokenizer, "text", frt_rets_get_text, 0);
1946
+ frb_rets_init, -1);
1947
+ rb_define_method(cRegExpTokenizer, "text=", frb_rets_set_text, 1);
1948
+ rb_define_method(cRegExpTokenizer, "text", frb_rets_get_text, 0);
1846
1949
  }
1847
1950
 
1848
1951
  /***************/
@@ -1852,6 +1955,8 @@ static void Init_RegExpTokenizer(void)
1852
1955
  /*
1853
1956
  * Document-class: Ferret::Analysis::AsciiLowerCaseFilter
1854
1957
  *
1958
+ * == Summary
1959
+ *
1855
1960
  * AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
1856
1961
  * ASCII characters. For other characters use LowerCaseFilter.
1857
1962
  *
@@ -1864,15 +1969,17 @@ static void Init_AsciiLowerCaseFilter(void)
1864
1969
  {
1865
1970
  cAsciiLowerCaseFilter =
1866
1971
  rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
1867
- frt_mark_cclass(cAsciiLowerCaseFilter);
1868
- rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
1972
+ frb_mark_cclass(cAsciiLowerCaseFilter);
1973
+ rb_define_alloc_func(cAsciiLowerCaseFilter, frb_data_alloc);
1869
1974
  rb_define_method(cAsciiLowerCaseFilter, "initialize",
1870
- frt_a_lowercase_filter_init, 1);
1975
+ frb_a_lowercase_filter_init, 1);
1871
1976
  }
1872
1977
 
1873
1978
  /*
1874
1979
  * Document-class: Ferret::Analysis::LowerCaseFilter
1875
1980
  *
1981
+ * == Summary
1982
+ *
1876
1983
  * LowerCaseFilter normalizes a token's text to lowercase based on the
1877
1984
  * current locale.
1878
1985
  *
@@ -1885,15 +1992,17 @@ static void Init_LowerCaseFilter(void)
1885
1992
  {
1886
1993
  cLowerCaseFilter =
1887
1994
  rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
1888
- frt_mark_cclass(cLowerCaseFilter);
1889
- rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
1995
+ frb_mark_cclass(cLowerCaseFilter);
1996
+ rb_define_alloc_func(cLowerCaseFilter, frb_data_alloc);
1890
1997
  rb_define_method(cLowerCaseFilter, "initialize",
1891
- frt_lowercase_filter_init, 1);
1998
+ frb_lowercase_filter_init, 1);
1892
1999
  }
1893
2000
 
1894
2001
  /*
1895
2002
  * Document-class: Ferret::Analysis::HyphenFilter
1896
2003
  *
2004
+ * == Summary
2005
+ *
1897
2006
  * HyphenFilter filters hyphenated words by adding both the word concatenated
1898
2007
  * into a single word and split into multiple words. ie "e-mail" becomes
1899
2008
  * "email" and "e mail". This way a search for "e-mail", "email" and "mail"
@@ -1908,14 +2017,16 @@ static void Init_HyphenFilter(void)
1908
2017
  {
1909
2018
  cHyphenFilter =
1910
2019
  rb_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
1911
- frt_mark_cclass(cHyphenFilter);
1912
- rb_define_alloc_func(cHyphenFilter, frt_data_alloc);
1913
- rb_define_method(cHyphenFilter, "initialize", frt_hyphen_filter_init, 1);
2020
+ frb_mark_cclass(cHyphenFilter);
2021
+ rb_define_alloc_func(cHyphenFilter, frb_data_alloc);
2022
+ rb_define_method(cHyphenFilter, "initialize", frb_hyphen_filter_init, 1);
1914
2023
  }
1915
2024
 
1916
2025
  /*
1917
2026
  * Document-class: Ferret::Analysis::MappingFilter
1918
2027
  *
2028
+ * == Summary
2029
+ *
1919
2030
  * A MappingFilter maps strings in tokens. This is usually used to map UTF-8
1920
2031
  * characters to ASCII characters for easier searching and better search
1921
2032
  * recall. The mapping is compiled into a Deterministic Finite Automaton so it
@@ -1956,15 +2067,17 @@ static void Init_MappingFilter(void)
1956
2067
  {
1957
2068
  cMappingFilter =
1958
2069
  rb_define_class_under(mAnalysis, "MappingFilter", cTokenStream);
1959
- frt_mark_cclass(cMappingFilter);
1960
- rb_define_alloc_func(cMappingFilter, frt_data_alloc);
2070
+ frb_mark_cclass(cMappingFilter);
2071
+ rb_define_alloc_func(cMappingFilter, frb_data_alloc);
1961
2072
  rb_define_method(cMappingFilter, "initialize",
1962
- frt_mapping_filter_init, 2);
2073
+ frb_mapping_filter_init, 2);
1963
2074
  }
1964
2075
 
1965
2076
  /*
1966
2077
  * Document-class: Ferret::Analysis::StopFilter
1967
2078
  *
2079
+ * == Summary
2080
+ *
1968
2081
  * A StopFilter filters *stop-words* from a TokenStream. Stop-words are words
1969
2082
  * that you don't wish to be indexed. Usually they will be common words like
1970
2083
  * "the" and "and" although you can specify whichever words you want.
@@ -1977,10 +2090,10 @@ static void Init_StopFilter(void)
1977
2090
  {
1978
2091
  cStopFilter =
1979
2092
  rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
1980
- frt_mark_cclass(cStopFilter);
1981
- rb_define_alloc_func(cStopFilter, frt_data_alloc);
2093
+ frb_mark_cclass(cStopFilter);
2094
+ rb_define_alloc_func(cStopFilter, frb_data_alloc);
1982
2095
  rb_define_method(cStopFilter, "initialize",
1983
- frt_stop_filter_init, -1);
2096
+ frb_stop_filter_init, -1);
1984
2097
  }
1985
2098
 
1986
2099
  /*
@@ -2004,13 +2117,25 @@ static void Init_StopFilter(void)
2004
2117
  * "finnish", | "fi", "fin" | "ISO_8859_1", "UTF_8"
2005
2118
  * "french", | "fr", "fra", "fre" | "ISO_8859_1", "UTF_8"
2006
2119
  * "german", | "de", "deu", "ge", "ger" | "ISO_8859_1", "UTF_8"
2120
+ * "hungarian", | "hu", "hun" | "ISO_8859_1", "UTF_8"
2007
2121
  * "italian", | "it", "ita" | "ISO_8859_1", "UTF_8"
2008
2122
  * "norwegian", | "nl", "no" | "ISO_8859_1", "UTF_8"
2009
2123
  * "porter", | | "ISO_8859_1", "UTF_8"
2010
2124
  * "portuguese", | "por", "pt" | "ISO_8859_1", "UTF_8"
2125
+ * "romanian", | "ro", "ron", "rum" | "ISO_8859_2", "UTF_8"
2011
2126
  * "russian", | "ru", "rus" | "KOI8_R", "UTF_8"
2012
2127
  * "spanish", | "es", "esl" | "ISO_8859_1", "UTF_8"
2013
2128
  * "swedish", | "sv", "swe" | "ISO_8859_1", "UTF_8"
2129
+ * "turkish", | "tr", "tur" | "UTF_8"
2130
+ *
2131
+ *
2132
+ * === New Stemmers
2133
+ *
2134
+ * The following stemmers have recently been added. Please try them out:
2135
+ *
2136
+ * * Hungarian
2137
+ * * Romanian
2138
+ * * Turkish
2014
2139
  *
2015
2140
  * === Example
2016
2141
  *
@@ -2037,10 +2162,10 @@ static void Init_StemFilter(void)
2037
2162
  {
2038
2163
  cStemFilter =
2039
2164
  rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
2040
- frt_mark_cclass(cStemFilter);
2041
- rb_define_alloc_func(cStemFilter, frt_data_alloc);
2165
+ frb_mark_cclass(cStemFilter);
2166
+ rb_define_alloc_func(cStemFilter, frb_data_alloc);
2042
2167
  rb_define_method(cStemFilter, "initialize",
2043
- frt_stem_filter_init, -1);
2168
+ frb_stem_filter_init, -1);
2044
2169
  }
2045
2170
 
2046
2171
  /*************************/
@@ -2081,10 +2206,10 @@ static void Init_Analyzer(void)
2081
2206
  {
2082
2207
  cAnalyzer =
2083
2208
  rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
2084
- frt_mark_cclass(cAnalyzer);
2085
- rb_define_alloc_func(cAnalyzer, frt_data_alloc);
2086
- rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
2087
- rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
2209
+ frb_mark_cclass(cAnalyzer);
2210
+ rb_define_alloc_func(cAnalyzer, frb_data_alloc);
2211
+ rb_define_method(cAnalyzer, "initialize", frb_letter_analyzer_init, -1);
2212
+ rb_define_method(cAnalyzer, "token_stream", frb_analyzer_token_stream, 2);
2088
2213
  }
2089
2214
 
2090
2215
  /*
@@ -2119,10 +2244,10 @@ static void Init_AsciiLetterAnalyzer(void)
2119
2244
  {
2120
2245
  cAsciiLetterAnalyzer =
2121
2246
  rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
2122
- frt_mark_cclass(cAsciiLetterAnalyzer);
2123
- rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
2247
+ frb_mark_cclass(cAsciiLetterAnalyzer);
2248
+ rb_define_alloc_func(cAsciiLetterAnalyzer, frb_data_alloc);
2124
2249
  rb_define_method(cAsciiLetterAnalyzer, "initialize",
2125
- frt_a_letter_analyzer_init, -1);
2250
+ frb_a_letter_analyzer_init, -1);
2126
2251
  }
2127
2252
 
2128
2253
  /*
@@ -2150,10 +2275,10 @@ static void Init_LetterAnalyzer(void)
2150
2275
  {
2151
2276
  cLetterAnalyzer =
2152
2277
  rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
2153
- frt_mark_cclass(cLetterAnalyzer);
2154
- rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
2278
+ frb_mark_cclass(cLetterAnalyzer);
2279
+ rb_define_alloc_func(cLetterAnalyzer, frb_data_alloc);
2155
2280
  rb_define_method(cLetterAnalyzer, "initialize",
2156
- frt_letter_analyzer_init, -1);
2281
+ frb_letter_analyzer_init, -1);
2157
2282
  }
2158
2283
 
2159
2284
  /*
@@ -2187,10 +2312,10 @@ static void Init_AsciiWhiteSpaceAnalyzer(void)
2187
2312
  {
2188
2313
  cAsciiWhiteSpaceAnalyzer =
2189
2314
  rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
2190
- frt_mark_cclass(cAsciiWhiteSpaceAnalyzer);
2191
- rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
2315
+ frb_mark_cclass(cAsciiWhiteSpaceAnalyzer);
2316
+ rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frb_data_alloc);
2192
2317
  rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
2193
- frt_a_white_space_analyzer_init, -1);
2318
+ frb_a_white_space_analyzer_init, -1);
2194
2319
  }
2195
2320
 
2196
2321
  /*
@@ -2218,10 +2343,10 @@ static void Init_WhiteSpaceAnalyzer(void)
2218
2343
  {
2219
2344
  cWhiteSpaceAnalyzer =
2220
2345
  rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
2221
- frt_mark_cclass(cWhiteSpaceAnalyzer);
2222
- rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
2346
+ frb_mark_cclass(cWhiteSpaceAnalyzer);
2347
+ rb_define_alloc_func(cWhiteSpaceAnalyzer, frb_data_alloc);
2223
2348
  rb_define_method(cWhiteSpaceAnalyzer, "initialize",
2224
- frt_white_space_analyzer_init, -1);
2349
+ frb_white_space_analyzer_init, -1);
2225
2350
  }
2226
2351
 
2227
2352
  /*
@@ -2255,10 +2380,10 @@ static void Init_AsciiStandardAnalyzer(void)
2255
2380
  {
2256
2381
  cAsciiStandardAnalyzer =
2257
2382
  rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
2258
- frt_mark_cclass(cAsciiStandardAnalyzer);
2259
- rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
2383
+ frb_mark_cclass(cAsciiStandardAnalyzer);
2384
+ rb_define_alloc_func(cAsciiStandardAnalyzer, frb_data_alloc);
2260
2385
  rb_define_method(cAsciiStandardAnalyzer, "initialize",
2261
- frt_a_standard_analyzer_init, -1);
2386
+ frb_a_standard_analyzer_init, -1);
2262
2387
  }
2263
2388
 
2264
2389
  /*
@@ -2290,10 +2415,10 @@ static void Init_StandardAnalyzer(void)
2290
2415
  {
2291
2416
  cStandardAnalyzer =
2292
2417
  rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
2293
- frt_mark_cclass(cStandardAnalyzer);
2294
- rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
2418
+ frb_mark_cclass(cStandardAnalyzer);
2419
+ rb_define_alloc_func(cStandardAnalyzer, frb_data_alloc);
2295
2420
  rb_define_method(cStandardAnalyzer, "initialize",
2296
- frt_standard_analyzer_init, -1);
2421
+ frb_standard_analyzer_init, -1);
2297
2422
  }
2298
2423
 
2299
2424
  /*
@@ -2320,16 +2445,16 @@ static void Init_PerFieldAnalyzer(void)
2320
2445
  {
2321
2446
  cPerFieldAnalyzer =
2322
2447
  rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
2323
- frt_mark_cclass(cPerFieldAnalyzer);
2324
- rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
2448
+ frb_mark_cclass(cPerFieldAnalyzer);
2449
+ rb_define_alloc_func(cPerFieldAnalyzer, frb_data_alloc);
2325
2450
  rb_define_method(cPerFieldAnalyzer, "initialize",
2326
- frt_per_field_analyzer_init, 1);
2451
+ frb_per_field_analyzer_init, 1);
2327
2452
  rb_define_method(cPerFieldAnalyzer, "add_field",
2328
- frt_per_field_analyzer_add_field, 2);
2453
+ frb_per_field_analyzer_add_field, 2);
2329
2454
  rb_define_method(cPerFieldAnalyzer, "[]=",
2330
- frt_per_field_analyzer_add_field, 2);
2455
+ frb_per_field_analyzer_add_field, 2);
2331
2456
  rb_define_method(cPerFieldAnalyzer, "token_stream",
2332
- frt_pfa_analyzer_token_stream, 2);
2457
+ frb_pfa_analyzer_token_stream, 2);
2333
2458
  }
2334
2459
 
2335
2460
  /*
@@ -2363,12 +2488,12 @@ static void Init_RegExpAnalyzer(void)
2363
2488
  {
2364
2489
  cRegExpAnalyzer =
2365
2490
  rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
2366
- frt_mark_cclass(cRegExpAnalyzer);
2367
- rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
2491
+ frb_mark_cclass(cRegExpAnalyzer);
2492
+ rb_define_alloc_func(cRegExpAnalyzer, frb_data_alloc);
2368
2493
  rb_define_method(cRegExpAnalyzer, "initialize",
2369
- frt_re_analyzer_init, -1);
2494
+ frb_re_analyzer_init, -1);
2370
2495
  rb_define_method(cRegExpAnalyzer, "token_stream",
2371
- frt_re_analyzer_token_stream, 2);
2496
+ frb_re_analyzer_token_stream, 2);
2372
2497
  }
2373
2498
 
2374
2499
  /* rdoc hack
@@ -2433,8 +2558,8 @@ Init_Analysis(void)
2433
2558
  rb_define_const(mFerret, "OBJECT_SPACE", object_space);
2434
2559
 
2435
2560
  /*** * * Locale stuff * * ***/
2436
- rb_define_singleton_method(mFerret, "locale=", frt_set_locale, 1);
2437
- rb_define_singleton_method(mFerret, "locale", frt_get_locale, 0);
2561
+ rb_define_singleton_method(mFerret, "locale=", frb_set_locale, 1);
2562
+ rb_define_singleton_method(mFerret, "locale", frb_get_locale, 0);
2438
2563
 
2439
2564
  rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
2440
2565
  get_rstopwords(ENGLISH_STOP_WORDS));
@@ -2464,6 +2589,8 @@ Init_Analysis(void)
2464
2589
  get_rstopwords(FULL_RUSSIAN_STOP_WORDS));
2465
2590
  rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
2466
2591
  get_rstopwords(FULL_FINNISH_STOP_WORDS));
2592
+ rb_define_const(mAnalysis, "FULL_HUNGARIAN_STOP_WORDS",
2593
+ get_rstopwords(FULL_HUNGARIAN_STOP_WORDS));
2467
2594
 
2468
2595
  Init_Token();
2469
2596
  Init_TokenStream();