ferret 0.9.1 → 0.9.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (105)
  1. data/README +6 -5
  2. data/Rakefile +34 -13
  3. data/TODO +1 -0
  4. data/TUTORIAL +1 -1
  5. data/ext/analysis.c +87 -70
  6. data/ext/analysis.h +18 -6
  7. data/ext/array.c +1 -2
  8. data/ext/array.h +1 -1
  9. data/ext/bitvector.c +10 -6
  10. data/ext/bitvector.h +2 -2
  11. data/ext/compound_io.c +30 -27
  12. data/ext/document.c +15 -15
  13. data/ext/document.h +5 -5
  14. data/ext/except.c +2 -0
  15. data/ext/except.h +25 -23
  16. data/ext/extconf.rb +1 -0
  17. data/ext/ferret.c +10 -8
  18. data/ext/ferret.h +9 -8
  19. data/ext/field.c +29 -25
  20. data/ext/filter.c +52 -14
  21. data/ext/frtio.h +13 -0
  22. data/ext/fs_store.c +115 -170
  23. data/ext/global.c +9 -8
  24. data/ext/global.h +17 -13
  25. data/ext/hash.c +13 -19
  26. data/ext/hash.h +11 -11
  27. data/ext/hashset.c +5 -7
  28. data/ext/hashset.h +9 -8
  29. data/ext/helper.c +1 -1
  30. data/ext/helper.h +2 -1
  31. data/ext/inc/except.h +25 -23
  32. data/ext/inc/lang.h +11 -1
  33. data/ext/ind.c +33 -21
  34. data/ext/index.h +44 -39
  35. data/ext/index_io.c +61 -57
  36. data/ext/index_rw.c +418 -361
  37. data/ext/lang.c +10 -0
  38. data/ext/lang.h +11 -1
  39. data/ext/nix_io.c +135 -0
  40. data/ext/priorityqueue.c +16 -16
  41. data/ext/priorityqueue.h +9 -6
  42. data/ext/q_boolean.c +128 -76
  43. data/ext/q_const_score.c +20 -20
  44. data/ext/q_filtered_query.c +20 -20
  45. data/ext/q_fuzzy.c +37 -23
  46. data/ext/q_match_all.c +15 -19
  47. data/ext/q_multi_phrase.c +87 -46
  48. data/ext/q_parser.c +247 -119
  49. data/ext/q_phrase.c +86 -52
  50. data/ext/q_prefix.c +25 -14
  51. data/ext/q_range.c +59 -14
  52. data/ext/q_span.c +263 -172
  53. data/ext/q_term.c +62 -51
  54. data/ext/q_wildcard.c +24 -13
  55. data/ext/r_analysis.c +328 -80
  56. data/ext/r_doc.c +11 -6
  57. data/ext/r_index_io.c +40 -32
  58. data/ext/r_qparser.c +15 -14
  59. data/ext/r_search.c +270 -152
  60. data/ext/r_store.c +32 -17
  61. data/ext/ram_store.c +38 -22
  62. data/ext/search.c +617 -87
  63. data/ext/search.h +227 -163
  64. data/ext/similarity.c +54 -45
  65. data/ext/similarity.h +3 -3
  66. data/ext/sort.c +132 -53
  67. data/ext/store.c +21 -2
  68. data/ext/store.h +14 -14
  69. data/ext/tags +4322 -232
  70. data/ext/term.c +140 -109
  71. data/ext/termdocs.c +74 -60
  72. data/ext/vector.c +181 -152
  73. data/ext/w32_io.c +150 -0
  74. data/lib/ferret.rb +1 -1
  75. data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
  76. data/lib/ferret/document/field.rb +1 -1
  77. data/lib/ferret/index/field_infos.rb +1 -1
  78. data/lib/ferret/index/term.rb +1 -1
  79. data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
  80. data/lib/ferret/search.rb +1 -0
  81. data/lib/ferret/search/boolean_query.rb +0 -4
  82. data/lib/ferret/search/index_searcher.rb +21 -8
  83. data/lib/ferret/search/multi_phrase_query.rb +7 -0
  84. data/lib/ferret/search/multi_searcher.rb +261 -0
  85. data/lib/ferret/search/phrase_query.rb +1 -1
  86. data/lib/ferret/search/query.rb +34 -5
  87. data/lib/ferret/search/sort.rb +7 -3
  88. data/lib/ferret/search/sort_field.rb +8 -4
  89. data/lib/ferret/store/fs_store.rb +13 -6
  90. data/lib/ferret/store/index_io.rb +0 -14
  91. data/lib/ferret/store/ram_store.rb +3 -2
  92. data/lib/rferret.rb +1 -1
  93. data/test/unit/analysis/ctc_analyzer.rb +131 -0
  94. data/test/unit/analysis/ctc_tokenstream.rb +98 -9
  95. data/test/unit/index/tc_index.rb +40 -1
  96. data/test/unit/index/tc_term.rb +7 -0
  97. data/test/unit/index/th_doc.rb +8 -0
  98. data/test/unit/query_parser/tc_query_parser.rb +6 -4
  99. data/test/unit/search/rtc_sort_field.rb +6 -6
  100. data/test/unit/search/tc_index_searcher.rb +8 -0
  101. data/test/unit/search/tc_multi_searcher.rb +275 -0
  102. data/test/unit/search/tc_multi_searcher2.rb +126 -0
  103. data/test/unit/search/tc_search_and_sort.rb +66 -0
  104. metadata +31 -26
  105. data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/q_term.c CHANGED
@@ -18,6 +18,14 @@ Scorer *tw_scorer(Weight *self, IndexReader *ir)
18
18
 
19
19
  Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
20
20
  {
21
+ Explanation *qnorm_expl;
22
+ Explanation *field_expl;
23
+ Scorer *scorer;
24
+ Explanation *tf_expl;
25
+ uchar *field_norms;
26
+ float field_norm;
27
+ Explanation *field_norm_expl;
28
+
21
29
  char *query_str = self->query->to_s(self->query, "");
22
30
  TermQuery *tq = (TermQuery *)self->query->data;
23
31
  Term *term = tq->term;
@@ -26,14 +34,14 @@ Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
26
34
  Explanation *expl = expl_create(0.0,
27
35
  strfmt("weight(%s in %d), product of:", query_str, doc_num));
28
36
 
29
- // We need two of these as it's included in both the query explanation
30
- // and the field explanation
37
+ /* We need two of these as it's included in both the query explanation
38
+ * and the field explanation */
31
39
  Explanation *idf_expl1 = expl_create(self->idf,
32
40
  strfmt("idf(doc_freq=%d)", ir->doc_freq(ir, tq->term)));
33
41
  Explanation *idf_expl2 = expl_create(self->idf,
34
42
  strfmt("idf(doc_freq=%d)", ir->doc_freq(ir, tq->term)));
35
43
 
36
- // explain query weight
44
+ /* explain query weight */
37
45
  Explanation *query_expl = expl_create(0.0,
38
46
  strfmt("query_weight(%s), product of:", query_str));
39
47
  free(query_str);
@@ -44,33 +52,35 @@ Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
44
52
 
45
53
  expl_add_detail(query_expl, idf_expl1);
46
54
 
47
- Explanation *qnorm_expl = expl_create(self->qnorm, estrdup("query_norm"));
55
+ qnorm_expl = expl_create(self->qnorm, estrdup("query_norm"));
48
56
  expl_add_detail(query_expl, qnorm_expl);
49
57
 
50
58
  query_expl->value = self->query->boost * idf_expl1->value * qnorm_expl->value;
51
59
 
52
60
  expl_add_detail(expl, query_expl);
53
61
 
54
- // explain field weight
55
- Explanation *field_expl = expl_create(0.0,
62
+ /* explain field weight */
63
+ field_expl = expl_create(0.0,
56
64
  strfmt("field_weight(%s:%s in %d), product of:",
57
65
  field_name, term->text, doc_num));
58
66
 
59
- Scorer *scorer = self->scorer(self, ir);
60
- Explanation *tf_expl = scorer->explain(scorer, doc_num);
67
+ scorer = self->scorer(self, ir);
68
+ tf_expl = scorer->explain(scorer, doc_num);
61
69
  scorer->destroy(scorer);
62
70
  expl_add_detail(field_expl, tf_expl);
63
71
  expl_add_detail(field_expl, idf_expl2);
64
72
 
65
- uchar *field_norms = ir->get_norms(ir, field_name);
66
- float field_norm = (field_norms ? sim_decode_norm(self->similarity, field_norms[doc_num]) : 0.0);
67
- Explanation *field_norm_expl = expl_create(field_norm,
73
+ field_norms = ir->get_norms(ir, field_name);
74
+ field_norm = (field_norms
75
+ ? sim_decode_norm(self->similarity, field_norms[doc_num])
76
+ : (float)0.0);
77
+ field_norm_expl = expl_create(field_norm,
68
78
  strfmt("field_norm(field=%s, doc=%d)", field_name, doc_num));
69
79
  expl_add_detail(field_expl, field_norm_expl);
70
80
 
71
81
  field_expl->value = tf_expl->value * idf_expl2->value * field_norm_expl->value;
72
82
 
73
- // combine them
83
+ /* combine them */
74
84
  if (query_expl->value == 1.0) {
75
85
  expl_destoy(expl);
76
86
  return field_expl;
@@ -86,30 +96,18 @@ char *tw_to_s(Weight *self)
86
96
  return strfmt("TermWeight(%f)", self->value);
87
97
  }
88
98
 
89
- void tw_destroy(void *p)
90
- {
91
- free(p);
92
- }
93
-
94
99
  Weight *tw_create(Query *query, Searcher *searcher)
95
100
  {
96
- Weight *self = ALLOC(Weight);
97
- ZEROSET(self, Weight, 1);
98
- self->get_query = &w_get_query;
99
- self->get_value = &w_get_value;
100
- self->normalize = &w_normalize;
101
+ Weight *self = w_create(query);
101
102
  self->scorer = &tw_scorer;
102
103
  self->explain = &tw_explain;
103
104
  self->to_s = &tw_to_s;
104
- self->destroy = &tw_destroy;
105
105
  self->sum_of_squared_weights = &w_sum_of_squared_weights;
106
106
 
107
107
  self->similarity = query->get_similarity(query, searcher);
108
108
  self->idf = sim_idf(self->similarity,
109
109
  searcher->doc_freq(searcher, ((TermQuery *)query->data)->term),
110
110
  searcher->max_doc(searcher)); // compute idf
111
- self->query = query;
112
- self->value = 0.0;
113
111
 
114
112
  return self;
115
113
  }
@@ -120,20 +118,19 @@ Weight *tw_create(Query *query, Searcher *searcher)
120
118
  *
121
119
  ***************************************************************************/
122
120
 
123
- void tq_destroy(void *p)
121
+ void tq_destroy(Query *self)
124
122
  {
125
- Query *q = (Query *)p;
126
- TermQuery *tq = q->data;
123
+ TermQuery *tq = self->data;
127
124
  term_destroy(tq->term);
128
125
  free(tq);
129
- q_destroy(q);
126
+ q_destroy_i(self);
130
127
  }
131
128
 
132
129
  char *tq_to_s(Query *self, char *field)
133
130
  {
134
131
  Term *term = ((TermQuery *)self->data)->term;
135
- int flen = strlen(term->field);
136
- int tlen = strlen(term->text);
132
+ size_t flen = strlen(term->field);
133
+ size_t tlen = strlen(term->text);
137
134
  char *buffer = ALLOC_N(char, 34 + flen + tlen);
138
135
  char *b = buffer;
139
136
  if (strcmp(field, term->field) != 0) {
@@ -151,10 +148,21 @@ char *tq_to_s(Query *self, char *field)
151
148
  return buffer;
152
149
  }
153
150
 
154
- void tq_extract_terms(Query *self, Array *terms)
151
+ static void tq_extract_terms(Query *self, HashSet *terms)
155
152
  {
156
153
  Term *term = ((TermQuery *)self->data)->term;
157
- ary_append(terms, term);
154
+ hs_add(terms, term_clone(term));
155
+ }
156
+
157
+ static uint tq_hash(Query *self)
158
+ {
159
+ return term_hash(((TermQuery *)self->data)->term);
160
+ }
161
+
162
+ static int tq_eq(Query *self, Query *o)
163
+ {
164
+ return term_eq(((TermQuery *)self->data)->term,
165
+ ((TermQuery *)o->data)->term);
158
166
  }
159
167
 
160
168
  Query *tq_create(Term *term)
@@ -164,14 +172,18 @@ Query *tq_create(Term *term)
164
172
  tq->term = term;
165
173
  self->type = TERM_QUERY;
166
174
  self->data = tq;
167
- self->create_weight = &tw_create;
168
175
  self->extract_terms = &tq_extract_terms;
169
176
  self->to_s = &tq_to_s;
170
- self->destroy = &tq_destroy;
177
+ self->hash = &tq_hash;
178
+ self->eq = &tq_eq;
179
+
180
+ self->destroy_i = &tq_destroy;
181
+ self->create_weight_i = &tw_create;
171
182
 
172
183
  return self;
173
184
  }
174
185
 
186
+
175
187
  /***************************************************************************
176
188
  *
177
189
  * TermScorer
@@ -183,13 +195,13 @@ float tsc_score(Scorer *self)
183
195
  TermScorer *ts = (TermScorer *)self->data;
184
196
  int freq = ts->freqs[ts->pointer];
185
197
  float score;
186
- // compute tf(f)*weight
187
- if (freq < SCORE_CACHE_SIZE) { // check cache
188
- score = ts->score_cache[freq]; // cache hit
198
+ /* compute tf(f)*weight */
199
+ if (freq < SCORE_CACHE_SIZE) { /* check cache */
200
+ score = ts->score_cache[freq]; /* cache hit */
189
201
  } else {
190
- score = sim_tf(self->similarity, freq) * ts->weight_value; // cache miss
202
+ score = sim_tf(self->similarity, (float)freq) * ts->weight_value; /* cache miss */
191
203
  }
192
- // normalize for field
204
+ /* normalize for field */
193
205
  score *= sim_decode_norm(self->similarity, ts->norms[self->doc]);
194
206
  return score;
195
207
  }
@@ -217,8 +229,9 @@ bool tsc_next(Scorer *self)
217
229
  bool tsc_skip_to(Scorer *self, int doc_num)
218
230
  {
219
231
  TermScorer *ts = (TermScorer *)self->data;
220
-
221
- // first scan in cache
232
+ TermDocEnum *tde = ts->tde;
233
+
234
+ /* first scan in cache */
222
235
  while (++(ts->pointer) < ts->pointer_max) {
223
236
  if (ts->docs[ts->pointer] >= doc_num) {
224
237
  self->doc = ts->docs[ts->pointer];
@@ -226,10 +239,8 @@ bool tsc_skip_to(Scorer *self, int doc_num)
226
239
  }
227
240
  }
228
241
 
229
- // not found in cache, seek underlying stream
230
- TermDocEnum *tde = ts->tde;
231
- bool result = tde->skip_to(tde, doc_num);
232
- if (result) {
242
+ /* not found in cache, seek underlying stream */
243
+ if (tde->skip_to(tde, doc_num)) {
233
244
  ts->pointer_max = 1;
234
245
  ts->pointer = 0;
235
246
  ts->docs[0] = self->doc = tde->doc_num(tde);
@@ -242,6 +253,7 @@ bool tsc_skip_to(Scorer *self, int doc_num)
242
253
 
243
254
  Explanation *tsc_explain(Scorer *self, int doc_num)
244
255
  {
256
+ Explanation *tf_explanation;
245
257
  TermScorer *ts = (TermScorer *)self->data;
246
258
  Query *query = ts->weight->get_query(ts->weight);
247
259
  Term *term = ((TermQuery *)query->data)->term;
@@ -260,18 +272,17 @@ Explanation *tsc_explain(Scorer *self, int doc_num)
260
272
  }
261
273
  tde->close(tde);
262
274
  ts->tde = NULL;
263
- Explanation *tf_explanation = expl_create(sim_tf(self->similarity, tf),
275
+ tf_explanation = expl_create(sim_tf(self->similarity, (float)tf),
264
276
  strfmt("tf(term_freq(%s:%s)=%d)", term->field, term->text, tf));
265
277
 
266
278
  return tf_explanation;
267
279
  }
268
280
 
269
- void tsc_destroy(void *p)
281
+ void tsc_destroy(Scorer *self)
270
282
  {
271
- Scorer *self = (Scorer *)p;
272
283
  TermScorer *ts = (TermScorer *)self->data;
273
284
  if (ts->tde) ts->tde->close(ts->tde);
274
- scorer_destroy(p);
285
+ scorer_destroy_i(self);
275
286
  }
276
287
 
277
288
  Scorer *tsc_create(Weight *weight, TermDocEnum *tde, uchar *norms)
@@ -287,7 +298,7 @@ Scorer *tsc_create(Weight *weight, TermDocEnum *tde, uchar *norms)
287
298
  ts->weight_value = weight->value;
288
299
 
289
300
  for (i = 0; i < SCORE_CACHE_SIZE; i++) {
290
- ts->score_cache[i] = sim_tf(self->similarity, i) * ts->weight_value;
301
+ ts->score_cache[i] = sim_tf(self->similarity, (float)i) * ts->weight_value;
291
302
  }
292
303
 
293
304
  self->score = &tsc_score;
data/ext/q_wildcard.c CHANGED
@@ -11,8 +11,8 @@ char *wcq_to_s(Query *self, char *field)
11
11
  {
12
12
  char *buffer, *bptr;
13
13
  Term *term = (Term *)self->data;
14
- int tlen = strlen(term->text);
15
- int flen = strlen(term->field);
14
+ size_t tlen = strlen(term->text);
15
+ size_t flen = strlen(term->field);
16
16
  bptr = buffer = ALLOC_N(char, tlen + flen + 35);
17
17
 
18
18
  if (strcmp(term->field, field) != 0) {
@@ -77,8 +77,8 @@ Query *wcq_rewrite(Query *self, IndexReader *ir)
77
77
  Term *term = (Term *)self->data;
78
78
  char *text = term->text;
79
79
  char *field = term->field;
80
- char *first_star = index(text, WILD_STRING);
81
- char *first_ques = index(text, WILD_CHAR);
80
+ char *first_star = strrchr(text, WILD_STRING);
81
+ char *first_ques = strrchr(text, WILD_CHAR);
82
82
  if (!first_star && !first_ques) {
83
83
  q = tq_create(term_clone(term));
84
84
  } else {
@@ -89,7 +89,7 @@ Query *wcq_rewrite(Query *self, IndexReader *ir)
89
89
  char *pattern = (first_ques && first_star > first_ques)
90
90
  ? first_ques : first_star;
91
91
 
92
- int prefix_len = pattern - text;
92
+ int prefix_len = (int)(pattern - text);
93
93
 
94
94
  prefix_term.field = field;
95
95
  prefix_term.text = (char *)EMPTY_STRING;
@@ -120,15 +120,23 @@ Query *wcq_rewrite(Query *self, IndexReader *ir)
120
120
  free(prefix);
121
121
  }
122
122
 
123
- if (self->rewritten) self->rewritten->destroy(self->rewritten);
124
- return self->rewritten = q;
123
+ return q;
125
124
  }
126
125
 
127
- void wcq_destroy(void *p)
126
+ static void wcq_destroy(Query *self)
128
127
  {
129
- Query *self = (Query *)p;
130
128
  if (self->destroy_all) term_destroy((Term *)self->data);
131
- q_destroy(self);
129
+ q_destroy_i(self);
130
+ }
131
+
132
+ static uint wcq_hash(Query *self)
133
+ {
134
+ return term_hash((Term *)self->data);
135
+ }
136
+
137
+ static int wcq_eq(Query *self, Query *o)
138
+ {
139
+ return term_eq((Term *)self->data, (Term *)o->data);
132
140
  }
133
141
 
134
142
  Query *wcq_create(Term *term)
@@ -136,11 +144,14 @@ Query *wcq_create(Term *term)
136
144
  Query *self = q_create();
137
145
 
138
146
  self->data = term;
147
+
139
148
  self->type = WILD_CARD_QUERY;
140
- self->create_weight = NULL;
141
- self->to_s = &wcq_to_s;
142
149
  self->rewrite = &wcq_rewrite;
143
- self->destroy = &wcq_destroy;
150
+ self->to_s = &wcq_to_s;
151
+ self->hash = &wcq_hash;
152
+ self->eq = &wcq_eq;
153
+ self->destroy_i = &wcq_destroy;
154
+ self->create_weight_i = &q_create_weight_unsup;
144
155
 
145
156
  return self;
146
157
  }
data/ext/r_analysis.c CHANGED
@@ -1,3 +1,4 @@
1
+ #include <regex.h>
1
2
  #include "ferret.h"
2
3
  #include "analysis.h"
3
4
  #include "locale.h"
@@ -9,6 +10,7 @@ static VALUE cAsciiWhiteSpaceTokenizer;
9
10
  static VALUE cWhiteSpaceTokenizer;
10
11
  static VALUE cAsciiStandardTokenizer;
11
12
  static VALUE cStandardTokenizer;
13
+ static VALUE cRegExpTokenizer;
12
14
 
13
15
  static VALUE cAsciiLowerCaseFilter;
14
16
  static VALUE cLowerCaseFilter;
@@ -23,14 +25,25 @@ static VALUE cWhiteSpaceAnalyzer;
23
25
  static VALUE cAsciiStandardAnalyzer;
24
26
  static VALUE cStandardAnalyzer;
25
27
  static VALUE cPerFieldAnalyzer;
28
+ static VALUE cRegExpAnalyzer;
26
29
 
27
30
  //static VALUE cRegexAnalyzer;
28
31
  static VALUE cTokenStream;
29
32
 
33
+ /* TokenStream Methods */
30
34
  static ID id_next;
31
35
  static ID id_reset;
32
36
  static ID id_clone;
33
37
 
38
+ /* Analyzer Methods */
39
+ static ID id_token_stream;
40
+
41
+ static VALUE object_space;
42
+
43
+ extern TokenStream *ts_create();
44
+ extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int, int,
45
+ struct re_registers *);
46
+
34
47
  /****************************************************************************
35
48
  *
36
49
  * Utility Methods
@@ -111,7 +124,7 @@ frt_set_token(Token *tk, VALUE rt)
111
124
  return tk;
112
125
  }
113
126
 
114
- #define GET_TK RToken *token; Data_Get_Struct(self, RToken, token);
127
+ #define GET_TK RToken *token = (RToken *)DATA_PTR(self)
115
128
  static VALUE
116
129
  frt_token_init(int argc, VALUE *argv, VALUE self)
117
130
  {
@@ -212,13 +225,12 @@ frt_ts_mark(void *p)
212
225
  }
213
226
 
214
227
  static void
215
- frt_ts_free(void *p)
228
+ frt_ts_free(TokenStream *ts)
216
229
  {
217
- TokenStream *ts = (TokenStream *)p;
218
230
  if (object_get(&ts->text) != Qnil) object_del(&ts->text);
219
231
  if (ts->sub_ts && (object_get(&ts->sub_ts) != Qnil)) object_del(&ts->sub_ts);
220
232
  object_del(ts);
221
- ts->destroy(ts);
233
+ ts_deref(ts);
222
234
  }
223
235
 
224
236
  static VALUE
@@ -273,8 +285,7 @@ frt_ts_get_text(VALUE self)
273
285
  static VALUE
274
286
  frt_ts_next(VALUE self)
275
287
  {
276
- TokenStream *ts;
277
- Data_Get_Struct(self, TokenStream, ts);
288
+ TokenStream *ts = (TokenStream *)DATA_PTR(self);
278
289
  Token *next = ts->next(ts);
279
290
  if (next == NULL) {
280
291
  return Qnil;
@@ -287,41 +298,45 @@ frt_ts_next(VALUE self)
287
298
  * CWrappedTokenStream
288
299
  ****************************************************************************/
289
300
 
290
- void cwrts_destroy(void *p)
301
+ static void
302
+ cwrts_destroy(TokenStream *ts)
291
303
  {
292
- TokenStream *ts = (TokenStream *)p;
304
+ rb_hash_delete(object_space, LONG2NUM((long)ts->data));
293
305
  free(ts->token);
294
306
  free(ts);
295
307
  }
296
308
 
297
- Token *cwrts_next(TokenStream *ts)
309
+ static Token *
310
+ cwrts_next(TokenStream *ts)
298
311
  {
299
312
  VALUE rts = (VALUE)ts->data;
300
313
  VALUE rtoken = rb_funcall(rts, id_next, 0);
301
314
  return frt_set_token(ts->token, rtoken);
302
315
  }
303
316
 
304
- void cwrts_reset(TokenStream *ts, char *text)
317
+ static void
318
+ cwrts_reset(TokenStream *ts, char *text)
305
319
  {
306
320
  VALUE rts = (VALUE)ts->data;
307
321
  ts->t = ts->text = text;
308
322
  rb_funcall(rts, id_reset, 1, rb_str_new2(text));
309
323
  }
310
324
 
311
- void cwrts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
325
+ static void
326
+ cwrts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
312
327
  {
313
328
  VALUE rorig_ts = (VALUE)orig_ts->data;
314
329
  new_ts->data = (void *)rb_funcall(rorig_ts, id_clone, 0);
315
330
  }
316
331
 
317
332
  static TokenStream *
318
- get_cwrapped_rts(VALUE rts, bool *self_destroy)
333
+ frt_get_cwrapped_rts(VALUE rts)
319
334
  {
320
335
  TokenStream *ts;
321
336
  switch (TYPE(rts)) {
322
337
  case T_DATA:
323
338
  Data_Get_Struct(rts, TokenStream, ts);
324
- *self_destroy = true;
339
+ ref(ts);
325
340
  break;
326
341
  default:
327
342
  ts = ALLOC(TokenStream);
@@ -332,12 +347,184 @@ get_cwrapped_rts(VALUE rts, bool *self_destroy)
332
347
  ts->clone_i = &cwrts_clone_i;
333
348
  ts->destroy = &cwrts_destroy;
334
349
  ts->sub_ts = NULL;
335
- *self_destroy = false;
350
+ // prevent from being garbage collected
351
+ rb_hash_aset(object_space, LONG2NUM(rts), rts);
352
+ ts->ref_cnt = 1;
336
353
  break;
337
354
  }
338
355
  return ts;
339
356
  }
340
357
 
358
+ /****************************************************************************
359
+ * RegExpTokenStream
360
+ ****************************************************************************/
361
+
362
+ #define P "[_\\/.,-]"
363
+ #define HASDIGIT "\\w*\\d\\w*"
364
+ #define ALPHA "[-_[:alpha:]]"
365
+ #define ALNUM "[-_[:alnum:]]"
366
+
367
+ static char *token_re =
368
+ ALPHA "+(('" ALPHA "+)+|\\.(" ALPHA "\\.)+|"
369
+ "(@|\\&)\\w+([-.]\\w+)*|:\\/\\/" ALNUM "+([-.\\/]" ALNUM "+)*)?"
370
+ "|\\w+(([-._]\\w+)*\\@\\w+([-.]\\w+)+"
371
+ "|" P HASDIGIT "(" P "\\w+" P HASDIGIT ")*(" P "\\w+)?"
372
+ "|(\\.\\w+)+"
373
+ "|"
374
+ ")";
375
+ static VALUE rtoken_re;
376
+
377
+ typedef struct RegExpTokenStream {
378
+ VALUE rtext;
379
+ VALUE regex;
380
+ VALUE proc;
381
+ int curr_ind;
382
+ } RegExpTokenStream;
383
+
384
+ static void
385
+ rets_destroy(TokenStream *ts)
386
+ {
387
+ rb_hash_delete(object_space, LONG2NUM((long)object_get(ts)));
388
+ free(ts->data);
389
+ free(ts->token);
390
+ free(ts);
391
+ }
392
+
393
+ static void
394
+ frt_rets_free(TokenStream *ts)
395
+ {
396
+ object_del(ts);
397
+ ts_deref(ts);
398
+ }
399
+
400
+ static void
401
+ frt_rets_mark(TokenStream *ts)
402
+ {
403
+ RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
404
+ rb_gc_mark(rets->rtext);
405
+ rb_gc_mark(rets->regex);
406
+ rb_gc_mark(rets->proc);
407
+ }
408
+
409
+ static VALUE
410
+ frt_rets_set_text(VALUE self, VALUE rtext)
411
+ {
412
+ TokenStream *ts;
413
+ RegExpTokenStream *rets;
414
+ Data_Get_Struct(self, TokenStream, ts);
415
+
416
+ StringValue(rtext);
417
+ rets = (RegExpTokenStream *)ts->data;
418
+ rets->rtext = rtext;
419
+ rets->curr_ind = 0;
420
+
421
+ return rtext;
422
+ }
423
+
424
+ static VALUE
425
+ frt_rets_get_text(VALUE self)
426
+ {
427
+ TokenStream *ts;
428
+ RegExpTokenStream *rets;
429
+ Data_Get_Struct(self, TokenStream, ts);
430
+ rets = (RegExpTokenStream *)ts->data;
431
+ return rets->rtext;
432
+ }
433
+
434
+ static Token *
435
+ rets_next(TokenStream *ts)
436
+ {
437
+ static struct re_registers regs;
438
+ int ret, beg, end;
439
+ RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
440
+ struct RString *rtext = RSTRING(rets->rtext);
441
+ Check_Type(rets->regex, T_REGEXP);
442
+ ret = ruby_re_search(RREGEXP(rets->regex)->ptr,
443
+ rtext->ptr, rtext->len,
444
+ rets->curr_ind, rtext->len - rets->curr_ind,
445
+ &regs);
446
+
447
+ if (ret == -2) rb_raise(rb_eStandardError, "regexp buffer overflow");
448
+ if (ret < 0) return NULL; /* not matched */
449
+
450
+ beg = regs.beg[0];
451
+ rets->curr_ind = end = regs.end[0];
452
+ if (NIL_P(rets->proc)) {
453
+ return tk_set(ts->token, rtext->ptr + beg, end - beg, beg, end, 1);
454
+ } else {
455
+ VALUE rtok = rb_str_new(rtext->ptr + beg, end - beg);
456
+ rtok = rb_funcall(rets->proc, id_call, 1, rtok);
457
+ return tk_set(ts->token, RSTRING(rtok)->ptr, RSTRING(rtok)->len, beg, end, 1);
458
+ }
459
+ }
460
+
461
+ static void
462
+ rets_reset(TokenStream *ts, char *text)
463
+ {
464
+ RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
465
+ rets->rtext = rb_str_new2(text);
466
+ rets->curr_ind = 0;
467
+ }
468
+
469
+ void
470
+ rets_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
471
+ {
472
+ RegExpTokenStream *new_rets = ALLOC(RegExpTokenStream);
473
+ RegExpTokenStream *orig_rets = (RegExpTokenStream *)orig_ts->data;
474
+ memcpy(new_rets, orig_rets, sizeof(RegExpTokenStream));
475
+ new_ts->data = new_rets;
476
+ }
477
+
478
+ static TokenStream *
479
+ rets_create(VALUE rtext, VALUE regex, VALUE proc)
480
+ {
481
+ RegExpTokenStream *rets;
482
+ TokenStream *ts;
483
+
484
+ if (rtext != Qnil) {
485
+ rtext = StringValue(rtext);
486
+ }
487
+ ts = ts_create();
488
+ ts->reset = &rets_reset;
489
+ ts->next = &rets_next;
490
+ ts->clone_i = &rets_clone_i;
491
+ ts->destroy = &rets_destroy;
492
+ ts->ref_cnt = 1;
493
+
494
+ rets = ALLOC(RegExpTokenStream);
495
+ rets->curr_ind = 0;
496
+ rets->rtext = rtext;
497
+ rets->proc = proc;
498
+ if (NIL_P(regex)) {
499
+ rets->regex = rtoken_re;
500
+ } else {
501
+ Check_Type(regex, T_REGEXP);
502
+ rets->regex = regex;
503
+ }
504
+
505
+ ts->data = rets;
506
+
507
+ return ts;
508
+ }
509
+
510
+ static VALUE
511
+ frt_rets_init(int argc, VALUE *argv, VALUE self)
512
+ {
513
+ VALUE rtext, regex, proc;
514
+ TokenStream *ts;
515
+
516
+ rb_scan_args(argc, argv, "11&", &rtext, &regex, &proc);
517
+
518
+ ts = rets_create(rtext, regex, proc);
519
+
520
+ Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
521
+ object_add(ts, self);
522
+ /* no need to add to object space as it is going to ruby space
523
+ * rb_hash_aset(object_space, LONG2NUM((long)self), self);
524
+ */
525
+ return self;
526
+ }
527
+
341
528
  /****************************************************************************
342
529
  * Tokenizers
343
530
  ****************************************************************************/
@@ -394,10 +581,8 @@ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
394
581
  static VALUE
395
582
  frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
396
583
  {
397
- bool self_destroy;
398
- TokenStream *ts = lowercase_filter_create(
399
- get_cwrapped_rts(rsub_ts, &self_destroy));
400
- ts->destroy_sub = !self_destroy;
584
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
585
+ ts = lowercase_filter_create(ts);
401
586
  object_add(&ts->sub_ts, rsub_ts);
402
587
 
403
588
  Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
@@ -408,10 +593,8 @@ frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
408
593
  static VALUE
409
594
  frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
410
595
  {
411
- bool self_destroy;
412
- TokenStream *ts = mb_lowercase_filter_create(
413
- get_cwrapped_rts(rsub_ts, &self_destroy));
414
- ts->destroy_sub = !self_destroy;
596
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
597
+ ts = mb_lowercase_filter_create(ts);
415
598
  object_add(&ts->sub_ts, rsub_ts);
416
599
 
417
600
  Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
@@ -423,19 +606,17 @@ static VALUE
423
606
  frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
424
607
  {
425
608
  VALUE rsub_ts, rstop_words;
426
- bool self_destroy;
427
609
  TokenStream *ts;
428
610
  rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
611
+ ts = frt_get_cwrapped_rts(rsub_ts);
429
612
  if (rstop_words != Qnil) {
430
613
  char **stop_words = get_stopwords(rstop_words);
431
- ts = stop_filter_create_with_words(
432
- get_cwrapped_rts(rsub_ts, &self_destroy), (const char **)stop_words);
614
+ ts = stop_filter_create_with_words(ts, (const char **)stop_words);
615
+
433
616
  free(stop_words);
434
617
  } else {
435
- ts = stop_filter_create(
436
- get_cwrapped_rts(rsub_ts, &self_destroy));
618
+ ts = stop_filter_create(ts);
437
619
  }
438
- ts->destroy_sub = !self_destroy;
439
620
  object_add(&ts->sub_ts, rsub_ts);
440
621
 
441
622
  Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
@@ -449,16 +630,14 @@ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
449
630
  VALUE rsub_ts, ralgorithm, rcharenc;
450
631
  char *algorithm = "english";
451
632
  char *charenc = NULL;
452
- bool self_destroy;
453
633
  TokenStream *ts;
454
634
  rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
635
+ ts = frt_get_cwrapped_rts(rsub_ts);
455
636
  switch (argc) {
456
637
  case 3: charenc = RSTRING(rb_obj_as_string(rcharenc))->ptr;
457
638
  case 2: algorithm = RSTRING(rb_obj_as_string(ralgorithm))->ptr;
458
639
  }
459
- ts = stem_filter_create(
460
- get_cwrapped_rts(rsub_ts, &self_destroy), algorithm, charenc);
461
- ts->destroy_sub = !self_destroy;
640
+ ts = stem_filter_create(ts, algorithm, charenc);
462
641
  object_add(&ts->sub_ts, rsub_ts);
463
642
 
464
643
  Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
@@ -472,34 +651,49 @@ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
472
651
  *
473
652
  ****************************************************************************/
474
653
 
475
- Analyzer *get_cwrapped_analyzer(ranalyzer)
654
+ /****************************************************************************
655
+ * CWrappedAnalyzer Methods
656
+ ****************************************************************************/
657
+
658
+ static void
659
+ cwa_destroy(Analyzer *a)
660
+ {
661
+ rb_hash_delete(object_space, LONG2NUM((long)a->data));
662
+ a_standard_destroy(a);
663
+ }
664
+
665
+ static TokenStream *
666
+ cwa_get_ts(Analyzer *a, char *field, char *text)
667
+ {
668
+ VALUE ranalyzer = (VALUE)a->data;
669
+ VALUE rts = rb_funcall(ranalyzer, id_token_stream, 2,
670
+ rb_str_new2(field), rb_str_new2(text));
671
+ return frt_get_cwrapped_rts(rts);
672
+ }
673
+
674
+ Analyzer *
675
+ frt_get_cwrapped_analyzer(ranalyzer)
476
676
  {
477
677
  Analyzer *a = NULL;
478
678
  switch (TYPE(ranalyzer)) {
479
679
  case T_DATA:
480
680
  Data_Get_Struct(ranalyzer, Analyzer, a);
681
+ ref(a);
481
682
  break;
482
683
  default:
483
- printf("Oh RFuck\n");
484
- //ts = ALLOC(TokenStream);
485
- //ts->token = ALLOC(Token);
486
- //ts->data = (void *)rts;
487
- //ts->next = &cwrts_next;
488
- //ts->reset = &cwrts_reset;
489
- //ts->clone_i = &cwrts_clone_i;
490
- //ts->destroy = &cwrts_destroy;
491
- //ts->sub_ts = NULL;
684
+ a = analyzer_create((void *)ranalyzer, NULL, &cwa_destroy, &cwa_get_ts);
685
+ // prevent from being garbage collected
686
+ rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
492
687
  break;
493
688
  }
494
689
  return a;
495
690
  }
496
691
 
497
692
  static void
498
- frt_analyzer_free(void *p)
693
+ frt_analyzer_free(Analyzer *a)
499
694
  {
500
- Analyzer *a = (Analyzer *)p;
501
695
  object_del(a);
502
- a->destroy(a);
696
+ a_deref(a);
503
697
  }
504
698
 
505
699
  VALUE
@@ -513,13 +707,16 @@ frt_get_analyzer(Analyzer *a)
513
707
  static VALUE
514
708
  frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
515
709
  {
516
- Analyzer *a = ((struct RData *)(self))->data;
710
+ TokenStream *ts;
711
+ Analyzer *a = (Analyzer *)DATA_PTR(self);
712
+
517
713
  rfield = rb_obj_as_string(rfield);
518
714
  rstring = rb_obj_as_string(rstring);
519
715
 
520
- TokenStream *ts = a_get_new_ts(a, RSTRING(rfield)->ptr, RSTRING(rstring)->ptr);
716
+ ts = a_get_new_ts(a, RSTRING(rfield)->ptr, RSTRING(rstring)->ptr);
521
717
 
522
- object_set(&ts->text, rstring); // Make sure that there is no entry already
718
+ /* Make sure that there is no entry already */
719
+ object_set(&ts->text, rstring);
523
720
  return get_token_stream(ts);
524
721
  }
525
722
 
@@ -533,8 +730,9 @@ frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
533
730
  static VALUE
534
731
  frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
535
732
  {
733
+ Analyzer *a;
536
734
  GET_LOWER(false);
537
- Analyzer *a = whitespace_analyzer_create(lower);
735
+ a = whitespace_analyzer_create(lower);
538
736
  Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
539
737
  object_add(a, self);
540
738
  return self;
@@ -544,8 +742,9 @@ frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
544
742
  static VALUE
545
743
  frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
546
744
  {
745
+ Analyzer *a;
547
746
  GET_LOWER(false);
548
- Analyzer *a = mb_whitespace_analyzer_create(lower);
747
+ a = mb_whitespace_analyzer_create(lower);
549
748
  Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
550
749
  object_add(a, self);
551
750
  return self;
@@ -555,8 +754,9 @@ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
555
754
  static VALUE
556
755
  frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
557
756
  {
757
+ Analyzer *a;
558
758
  GET_LOWER(true);
559
- Analyzer *a = letter_analyzer_create(lower);
759
+ a = letter_analyzer_create(lower);
560
760
  Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
561
761
  object_add(a, self);
562
762
  return self;
@@ -566,8 +766,9 @@ frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
566
766
  static VALUE
567
767
  frt_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
568
768
  {
769
+ Analyzer *a;
569
770
  GET_LOWER(true);
570
- Analyzer *a = mb_letter_analyzer_create(lower);
771
+ a = mb_letter_analyzer_create(lower);
571
772
  Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
572
773
  object_add(a, self);
573
774
  return self;
@@ -628,13 +829,29 @@ frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
628
829
  return self;
629
830
  }
630
831
 
832
+ void
833
+ frt_h_mark_values_i(void *key, void *value, void *arg)
834
+ {
835
+ frt_gc_mark(value);
836
+ }
837
+
838
+ void
839
+ frt_pfa_mark(void *p)
840
+ {
841
+ Analyzer *a = (Analyzer *)p;
842
+ PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)a->data;
843
+ frt_gc_mark(pfa->def);
844
+ h_each(pfa->dict, &frt_h_mark_values_i, NULL);
845
+ }
846
+
631
847
  /*** PerFieldAnalyzer ***/
848
+
632
849
  static VALUE
633
850
  frt_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
634
851
  {
635
- Analyzer *def = get_cwrapped_analyzer(ranalyzer);
636
- Analyzer *a = per_field_analyzer_create(def, false);
637
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
852
+ Analyzer *def = frt_get_cwrapped_analyzer(ranalyzer);
853
+ Analyzer *a = per_field_analyzer_create(def);
854
+ Frt_Wrap_Struct(self, &frt_pfa_mark, &frt_analyzer_free, a);
638
855
  object_add(a, self);
639
856
  return self;
640
857
  }
@@ -644,42 +861,48 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
644
861
  {
645
862
  Analyzer *pfa, *a;
646
863
  Data_Get_Struct(self, Analyzer, pfa);
647
- Data_Get_Struct(ranalyzer, Analyzer, a);
864
+ a = frt_get_cwrapped_analyzer(ranalyzer);
648
865
 
649
866
  pfa_add_field(pfa, StringValuePtr(rfield), a);
650
867
  return self;
651
868
  }
652
869
 
870
+ /*** RegExpAnalyzer ***/
653
871
 
654
- /** RegexAnalyzer **/
655
- /*
656
- static VALUE
657
- frt_regex_analyzer_init(VALUE self)
872
+ static void
873
+ frt_re_analyzer_mark(Analyzer *a)
658
874
  {
659
- Analyzer *a = regex_analyzer_create();
660
- // keine Ahnung warum hier das Makro und nicht Data_Wrap_Struct:
661
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
662
- // wofuer?:
663
- object_add(a, self);
664
- return self;
875
+ frt_gc_mark(a->current_ts);
665
876
  }
666
877
 
667
- // convenience method
668
- // XXX this sets the locale for the entire program
669
- static VALUE
670
- frt_regex_analyzer_token_stream(VALUE self, VALUE field, VALUE string)
878
+ static void
879
+ re_analyzer_destroy(Analyzer *a)
671
880
  {
672
- Analyzer *a =((struct RData *)(self))->data;
673
- TokenStream *ts = a->get_ts( a, StringValuePtr(field), StringValuePtr(string) );
674
- // already freed via analyzer's free()
675
- VALUE token_stream = Data_Wrap_Struct(cTokenStream, NULL, NULL, ts);
676
- return token_stream;
881
+ free(a->data);
882
+ a_standard_destroy(a);
677
883
  }
678
- */
679
- /** /RegexAnalyzer **/
680
884
 
681
- /** TokenStream **/
682
- /** /TokenStream **/
885
+ static VALUE
886
+ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
887
+ {
888
+ VALUE lower, rets, regex, proc;
889
+ Analyzer *a;
890
+ TokenStream *ts;
891
+ rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
892
+
893
+ ts = rets_create(Qnil, regex, proc);
894
+ rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
895
+ ref(ts);
896
+ rb_hash_aset(object_space, LONG2NUM((long)rets), rets);
897
+ object_add(ts, rets);
898
+
899
+ if (lower != Qfalse) ts = mb_lowercase_filter_create(ts);
900
+
901
+ a = analyzer_create(NULL, ts, &re_analyzer_destroy, NULL);
902
+ Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
903
+ object_add(a, self);
904
+ return self;
905
+ }
683
906
 
684
907
  /****************************************************************************
685
908
  *
@@ -710,10 +933,17 @@ static VALUE frt_setlocale(VALUE self, VALUE locale)
710
933
  void
711
934
  Init_analysis(void)
712
935
  {
936
+ /* TokenStream Methods */
713
937
  id_next = rb_intern("next");
714
938
  id_reset = rb_intern("text=");
715
939
  id_clone = rb_intern("clone");
716
940
 
941
+ /* Analyzer Methods */
942
+ id_token_stream = rb_intern("token_stream");
943
+
944
+ object_space = rb_hash_new();
945
+ rb_define_const(mFerret, "OBJECT_SPACE", object_space);
946
+
717
947
  /*** * * Locale stuff * * ***/
718
948
  frt_locale = setlocale(LC_ALL, "");
719
949
  rb_define_singleton_method(mFerret, "locale=", frt_setlocale, 1);
@@ -790,6 +1020,18 @@ Init_analysis(void)
790
1020
  rb_define_method(cStandardTokenizer, "initialize",
791
1021
  frt_standard_tokenizer_init, 1);
792
1022
 
1023
+ /*** * * RegExpTokenizer * * ***/
1024
+ cRegExpTokenizer =
1025
+ rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
1026
+ rtoken_re = rb_reg_new(token_re, strlen(token_re), 0);
1027
+ rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
1028
+ rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
1029
+ rb_define_method(cRegExpTokenizer, "initialize",
1030
+ frt_rets_init, -1);
1031
+ rb_define_method(cRegExpTokenizer, "next", frt_ts_next, 0);
1032
+ rb_define_method(cRegExpTokenizer, "text=", frt_rets_set_text, 1);
1033
+ rb_define_method(cRegExpTokenizer, "text", frt_rets_get_text, 0);
1034
+
793
1035
  /***************/
794
1036
  /*** Filters ***/
795
1037
  /***************/
@@ -911,7 +1153,13 @@ Init_analysis(void)
911
1153
  rb_define_method(cPerFieldAnalyzer, "[]=",
912
1154
  frt_per_field_analyzer_add_field, 2);
913
1155
 
914
- /** RegexAnalyzer **/
1156
+ /*** * * RegexAnalyzer * * ***/
1157
+ cRegExpAnalyzer =
1158
+ rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
1159
+ rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
1160
+ rb_define_method(cRegExpAnalyzer, "initialize",
1161
+ frt_re_analyzer_init, -1);
1162
+
915
1163
  /*
916
1164
  cRegexAnalyzer =
917
1165
  rb_define_class_under(mAnalysis, "RegexAnalyzer", cAnalyzer);