ferret 0.9.1 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. data/README +6 -5
  2. data/Rakefile +34 -13
  3. data/TODO +1 -0
  4. data/TUTORIAL +1 -1
  5. data/ext/analysis.c +87 -70
  6. data/ext/analysis.h +18 -6
  7. data/ext/array.c +1 -2
  8. data/ext/array.h +1 -1
  9. data/ext/bitvector.c +10 -6
  10. data/ext/bitvector.h +2 -2
  11. data/ext/compound_io.c +30 -27
  12. data/ext/document.c +15 -15
  13. data/ext/document.h +5 -5
  14. data/ext/except.c +2 -0
  15. data/ext/except.h +25 -23
  16. data/ext/extconf.rb +1 -0
  17. data/ext/ferret.c +10 -8
  18. data/ext/ferret.h +9 -8
  19. data/ext/field.c +29 -25
  20. data/ext/filter.c +52 -14
  21. data/ext/frtio.h +13 -0
  22. data/ext/fs_store.c +115 -170
  23. data/ext/global.c +9 -8
  24. data/ext/global.h +17 -13
  25. data/ext/hash.c +13 -19
  26. data/ext/hash.h +11 -11
  27. data/ext/hashset.c +5 -7
  28. data/ext/hashset.h +9 -8
  29. data/ext/helper.c +1 -1
  30. data/ext/helper.h +2 -1
  31. data/ext/inc/except.h +25 -23
  32. data/ext/inc/lang.h +11 -1
  33. data/ext/ind.c +33 -21
  34. data/ext/index.h +44 -39
  35. data/ext/index_io.c +61 -57
  36. data/ext/index_rw.c +418 -361
  37. data/ext/lang.c +10 -0
  38. data/ext/lang.h +11 -1
  39. data/ext/nix_io.c +135 -0
  40. data/ext/priorityqueue.c +16 -16
  41. data/ext/priorityqueue.h +9 -6
  42. data/ext/q_boolean.c +128 -76
  43. data/ext/q_const_score.c +20 -20
  44. data/ext/q_filtered_query.c +20 -20
  45. data/ext/q_fuzzy.c +37 -23
  46. data/ext/q_match_all.c +15 -19
  47. data/ext/q_multi_phrase.c +87 -46
  48. data/ext/q_parser.c +247 -119
  49. data/ext/q_phrase.c +86 -52
  50. data/ext/q_prefix.c +25 -14
  51. data/ext/q_range.c +59 -14
  52. data/ext/q_span.c +263 -172
  53. data/ext/q_term.c +62 -51
  54. data/ext/q_wildcard.c +24 -13
  55. data/ext/r_analysis.c +328 -80
  56. data/ext/r_doc.c +11 -6
  57. data/ext/r_index_io.c +40 -32
  58. data/ext/r_qparser.c +15 -14
  59. data/ext/r_search.c +270 -152
  60. data/ext/r_store.c +32 -17
  61. data/ext/ram_store.c +38 -22
  62. data/ext/search.c +617 -87
  63. data/ext/search.h +227 -163
  64. data/ext/similarity.c +54 -45
  65. data/ext/similarity.h +3 -3
  66. data/ext/sort.c +132 -53
  67. data/ext/store.c +21 -2
  68. data/ext/store.h +14 -14
  69. data/ext/tags +4322 -232
  70. data/ext/term.c +140 -109
  71. data/ext/termdocs.c +74 -60
  72. data/ext/vector.c +181 -152
  73. data/ext/w32_io.c +150 -0
  74. data/lib/ferret.rb +1 -1
  75. data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
  76. data/lib/ferret/document/field.rb +1 -1
  77. data/lib/ferret/index/field_infos.rb +1 -1
  78. data/lib/ferret/index/term.rb +1 -1
  79. data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
  80. data/lib/ferret/search.rb +1 -0
  81. data/lib/ferret/search/boolean_query.rb +0 -4
  82. data/lib/ferret/search/index_searcher.rb +21 -8
  83. data/lib/ferret/search/multi_phrase_query.rb +7 -0
  84. data/lib/ferret/search/multi_searcher.rb +261 -0
  85. data/lib/ferret/search/phrase_query.rb +1 -1
  86. data/lib/ferret/search/query.rb +34 -5
  87. data/lib/ferret/search/sort.rb +7 -3
  88. data/lib/ferret/search/sort_field.rb +8 -4
  89. data/lib/ferret/store/fs_store.rb +13 -6
  90. data/lib/ferret/store/index_io.rb +0 -14
  91. data/lib/ferret/store/ram_store.rb +3 -2
  92. data/lib/rferret.rb +1 -1
  93. data/test/unit/analysis/ctc_analyzer.rb +131 -0
  94. data/test/unit/analysis/ctc_tokenstream.rb +98 -9
  95. data/test/unit/index/tc_index.rb +40 -1
  96. data/test/unit/index/tc_term.rb +7 -0
  97. data/test/unit/index/th_doc.rb +8 -0
  98. data/test/unit/query_parser/tc_query_parser.rb +6 -4
  99. data/test/unit/search/rtc_sort_field.rb +6 -6
  100. data/test/unit/search/tc_index_searcher.rb +8 -0
  101. data/test/unit/search/tc_multi_searcher.rb +275 -0
  102. data/test/unit/search/tc_multi_searcher2.rb +126 -0
  103. data/test/unit/search/tc_search_and_sort.rb +66 -0
  104. metadata +31 -26
  105. data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/q_term.c CHANGED
@@ -18,6 +18,14 @@ Scorer *tw_scorer(Weight *self, IndexReader *ir)
 
  Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
  {
+ Explanation *qnorm_expl;
+ Explanation *field_expl;
+ Scorer *scorer;
+ Explanation *tf_expl;
+ uchar *field_norms;
+ float field_norm;
+ Explanation *field_norm_expl;
+
  char *query_str = self->query->to_s(self->query, "");
  TermQuery *tq = (TermQuery *)self->query->data;
  Term *term = tq->term;
@@ -26,14 +34,14 @@ Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
  Explanation *expl = expl_create(0.0,
  strfmt("weight(%s in %d), product of:", query_str, doc_num));
 
- // We need two of these as it's included in both the query explanation
- // and the field explanation
+ /* We need two of these as it's included in both the query explanation
+ * and the field explanation */
  Explanation *idf_expl1 = expl_create(self->idf,
  strfmt("idf(doc_freq=%d)", ir->doc_freq(ir, tq->term)));
  Explanation *idf_expl2 = expl_create(self->idf,
  strfmt("idf(doc_freq=%d)", ir->doc_freq(ir, tq->term)));
 
- // explain query weight
+ /* explain query weight */
  Explanation *query_expl = expl_create(0.0,
  strfmt("query_weight(%s), product of:", query_str));
  free(query_str);
@@ -44,33 +52,35 @@ Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
 
  expl_add_detail(query_expl, idf_expl1);
 
- Explanation *qnorm_expl = expl_create(self->qnorm, estrdup("query_norm"));
+ qnorm_expl = expl_create(self->qnorm, estrdup("query_norm"));
  expl_add_detail(query_expl, qnorm_expl);
 
  query_expl->value = self->query->boost * idf_expl1->value * qnorm_expl->value;
 
  expl_add_detail(expl, query_expl);
 
- // explain field weight
- Explanation *field_expl = expl_create(0.0,
+ /* explain field weight */
+ field_expl = expl_create(0.0,
  strfmt("field_weight(%s:%s in %d), product of:",
  field_name, term->text, doc_num));
 
- Scorer *scorer = self->scorer(self, ir);
- Explanation *tf_expl = scorer->explain(scorer, doc_num);
+ scorer = self->scorer(self, ir);
+ tf_expl = scorer->explain(scorer, doc_num);
  scorer->destroy(scorer);
  expl_add_detail(field_expl, tf_expl);
  expl_add_detail(field_expl, idf_expl2);
 
- uchar *field_norms = ir->get_norms(ir, field_name);
- float field_norm = (field_norms ? sim_decode_norm(self->similarity, field_norms[doc_num]) : 0.0);
- Explanation *field_norm_expl = expl_create(field_norm,
+ field_norms = ir->get_norms(ir, field_name);
+ field_norm = (field_norms
+ ? sim_decode_norm(self->similarity, field_norms[doc_num])
+ : (float)0.0);
+ field_norm_expl = expl_create(field_norm,
  strfmt("field_norm(field=%s, doc=%d)", field_name, doc_num));
  expl_add_detail(field_expl, field_norm_expl);
 
  field_expl->value = tf_expl->value * idf_expl2->value * field_norm_expl->value;
 
- // combine them
+ /* combine them */
  if (query_expl->value == 1.0) {
  expl_destoy(expl);
  return field_expl;
@@ -86,30 +96,18 @@ char *tw_to_s(Weight *self)
  return strfmt("TermWeight(%f)", self->value);
  }
 
- void tw_destroy(void *p)
- {
- free(p);
- }
-
  Weight *tw_create(Query *query, Searcher *searcher)
  {
- Weight *self = ALLOC(Weight);
- ZEROSET(self, Weight, 1);
- self->get_query = &w_get_query;
- self->get_value = &w_get_value;
- self->normalize = &w_normalize;
+ Weight *self = w_create(query);
  self->scorer = &tw_scorer;
  self->explain = &tw_explain;
  self->to_s = &tw_to_s;
- self->destroy = &tw_destroy;
  self->sum_of_squared_weights = &w_sum_of_squared_weights;
 
  self->similarity = query->get_similarity(query, searcher);
  self->idf = sim_idf(self->similarity,
  searcher->doc_freq(searcher, ((TermQuery *)query->data)->term),
  searcher->max_doc(searcher)); // compute idf
- self->query = query;
- self->value = 0.0;
 
  return self;
  }
@@ -120,20 +118,19 @@ Weight *tw_create(Query *query, Searcher *searcher)
  *
  ***************************************************************************/
 
- void tq_destroy(void *p)
+ void tq_destroy(Query *self)
  {
- Query *q = (Query *)p;
- TermQuery *tq = q->data;
+ TermQuery *tq = self->data;
  term_destroy(tq->term);
  free(tq);
- q_destroy(q);
+ q_destroy_i(self);
  }
 
  char *tq_to_s(Query *self, char *field)
  {
  Term *term = ((TermQuery *)self->data)->term;
- int flen = strlen(term->field);
- int tlen = strlen(term->text);
+ size_t flen = strlen(term->field);
+ size_t tlen = strlen(term->text);
  char *buffer = ALLOC_N(char, 34 + flen + tlen);
  char *b = buffer;
  if (strcmp(field, term->field) != 0) {
@@ -151,10 +148,21 @@ char *tq_to_s(Query *self, char *field)
  return buffer;
  }
 
- void tq_extract_terms(Query *self, Array *terms)
+ static void tq_extract_terms(Query *self, HashSet *terms)
  {
  Term *term = ((TermQuery *)self->data)->term;
- ary_append(terms, term);
+ hs_add(terms, term_clone(term));
+ }
+
+ static uint tq_hash(Query *self)
+ {
+ return term_hash(((TermQuery *)self->data)->term);
+ }
+
+ static int tq_eq(Query *self, Query *o)
+ {
+ return term_eq(((TermQuery *)self->data)->term,
+ ((TermQuery *)o->data)->term);
  }
 
  Query *tq_create(Term *term)
@@ -164,14 +172,18 @@ Query *tq_create(Term *term)
  tq->term = term;
  self->type = TERM_QUERY;
  self->data = tq;
- self->create_weight = &tw_create;
  self->extract_terms = &tq_extract_terms;
  self->to_s = &tq_to_s;
- self->destroy = &tq_destroy;
+ self->hash = &tq_hash;
+ self->eq = &tq_eq;
+
+ self->destroy_i = &tq_destroy;
+ self->create_weight_i = &tw_create;
 
  return self;
  }
 
+
  /***************************************************************************
  *
  * TermScorer
@@ -183,13 +195,13 @@ float tsc_score(Scorer *self)
  TermScorer *ts = (TermScorer *)self->data;
  int freq = ts->freqs[ts->pointer];
  float score;
- // compute tf(f)*weight
- if (freq < SCORE_CACHE_SIZE) { // check cache
- score = ts->score_cache[freq]; // cache hit
+ /* compute tf(f)*weight */
+ if (freq < SCORE_CACHE_SIZE) { /* check cache */
+ score = ts->score_cache[freq]; /* cache hit */
  } else {
- score = sim_tf(self->similarity, freq) * ts->weight_value; // cache miss
+ score = sim_tf(self->similarity, (float)freq) * ts->weight_value; /* cache miss */
  }
- // normalize for field
+ /* normalize for field */
  score *= sim_decode_norm(self->similarity, ts->norms[self->doc]);
  return score;
  }
@@ -217,8 +229,9 @@ bool tsc_next(Scorer *self)
  bool tsc_skip_to(Scorer *self, int doc_num)
  {
  TermScorer *ts = (TermScorer *)self->data;
-
- // first scan in cache
+ TermDocEnum *tde = ts->tde;
+
+ /* first scan in cache */
  while (++(ts->pointer) < ts->pointer_max) {
  if (ts->docs[ts->pointer] >= doc_num) {
  self->doc = ts->docs[ts->pointer];
@@ -226,10 +239,8 @@ bool tsc_skip_to(Scorer *self, int doc_num)
  }
  }
 
- // not found in cache, seek underlying stream
- TermDocEnum *tde = ts->tde;
- bool result = tde->skip_to(tde, doc_num);
- if (result) {
+ /* not found in cache, seek underlying stream */
+ if (tde->skip_to(tde, doc_num)) {
  ts->pointer_max = 1;
  ts->pointer = 0;
  ts->docs[0] = self->doc = tde->doc_num(tde);
@@ -242,6 +253,7 @@ bool tsc_skip_to(Scorer *self, int doc_num)
 
  Explanation *tsc_explain(Scorer *self, int doc_num)
  {
+ Explanation *tf_explanation;
  TermScorer *ts = (TermScorer *)self->data;
  Query *query = ts->weight->get_query(ts->weight);
  Term *term = ((TermQuery *)query->data)->term;
@@ -260,18 +272,17 @@ Explanation *tsc_explain(Scorer *self, int doc_num)
  }
  tde->close(tde);
  ts->tde = NULL;
- Explanation *tf_explanation = expl_create(sim_tf(self->similarity, tf),
+ tf_explanation = expl_create(sim_tf(self->similarity, (float)tf),
  strfmt("tf(term_freq(%s:%s)=%d)", term->field, term->text, tf));
 
  return tf_explanation;
  }
 
- void tsc_destroy(void *p)
+ void tsc_destroy(Scorer *self)
  {
- Scorer *self = (Scorer *)p;
  TermScorer *ts = (TermScorer *)self->data;
  if (ts->tde) ts->tde->close(ts->tde);
- scorer_destroy(p);
+ scorer_destroy_i(self);
  }
 
  Scorer *tsc_create(Weight *weight, TermDocEnum *tde, uchar *norms)
@@ -287,7 +298,7 @@ Scorer *tsc_create(Weight *weight, TermDocEnum *tde, uchar *norms)
  ts->weight_value = weight->value;
 
  for (i = 0; i < SCORE_CACHE_SIZE; i++) {
- ts->score_cache[i] = sim_tf(self->similarity, i) * ts->weight_value;
+ ts->score_cache[i] = sim_tf(self->similarity, (float)i) * ts->weight_value;
  }
 
  self->score = &tsc_score;
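
Note on the tsc_score and tsc_create hunks above: the term scorer precomputes tf(freq) * weight for every frequency below SCORE_CACHE_SIZE and only calls sim_tf() on a cache miss; the 0.9.2 change merely makes the int-to-float conversions explicit. A minimal, self-contained sketch of that caching pattern follows (toy_tf, the weight value, and the cache size used here are stand-ins, not ferret's actual sim_tf or Weight internals):

#include <stdio.h>
#include <math.h>

#define SCORE_CACHE_SIZE 32          /* same constant name as in q_term.c; value assumed */

/* toy stand-in for sim_tf(); assume a sqrt-style term-frequency curve */
static float toy_tf(float freq) { return (float)sqrt(freq); }

int main(void)
{
    float weight_value = 2.5f;       /* hypothetical weight->value */
    float score_cache[SCORE_CACHE_SIZE];
    int i, freq;

    /* precompute tf(i) * weight for small frequencies, as tsc_create does */
    for (i = 0; i < SCORE_CACHE_SIZE; i++) {
        score_cache[i] = toy_tf((float)i) * weight_value;
    }

    for (freq = 1; freq <= 64; freq *= 4) {
        float score;
        if (freq < SCORE_CACHE_SIZE) {   /* cache hit  */
            score = score_cache[freq];
        } else {                         /* cache miss */
            score = toy_tf((float)freq) * weight_value;
        }
        printf("freq=%2d  score=%.3f\n", freq, score);
    }
    return 0;
}

Build with `cc -lm`; the point is simply that per-document scoring avoids a sim_tf() call in the common low-frequency case.
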
data/ext/q_wildcard.c CHANGED
@@ -11,8 +11,8 @@ char *wcq_to_s(Query *self, char *field)
  {
  char *buffer, *bptr;
  Term *term = (Term *)self->data;
- int tlen = strlen(term->text);
- int flen = strlen(term->field);
+ size_t tlen = strlen(term->text);
+ size_t flen = strlen(term->field);
  bptr = buffer = ALLOC_N(char, tlen + flen + 35);
 
  if (strcmp(term->field, field) != 0) {
@@ -77,8 +77,8 @@ Query *wcq_rewrite(Query *self, IndexReader *ir)
  Term *term = (Term *)self->data;
  char *text = term->text;
  char *field = term->field;
- char *first_star = index(text, WILD_STRING);
- char *first_ques = index(text, WILD_CHAR);
+ char *first_star = strrchr(text, WILD_STRING);
+ char *first_ques = strrchr(text, WILD_CHAR);
  if (!first_star && !first_ques) {
  q = tq_create(term_clone(term));
  } else {
@@ -89,7 +89,7 @@ Query *wcq_rewrite(Query *self, IndexReader *ir)
  char *pattern = (first_ques && first_star > first_ques)
  ? first_ques : first_star;
 
- int prefix_len = pattern - text;
+ int prefix_len = (int)(pattern - text);
 
  prefix_term.field = field;
  prefix_term.text = (char *)EMPTY_STRING;
@@ -120,15 +120,23 @@ Query *wcq_rewrite(Query *self, IndexReader *ir)
  free(prefix);
  }
 
- if (self->rewritten) self->rewritten->destroy(self->rewritten);
- return self->rewritten = q;
+ return q;
  }
 
- void wcq_destroy(void *p)
+ static void wcq_destroy(Query *self)
  {
- Query *self = (Query *)p;
  if (self->destroy_all) term_destroy((Term *)self->data);
- q_destroy(self);
+ q_destroy_i(self);
+ }
+
+ static uint wcq_hash(Query *self)
+ {
+ return term_hash((Term *)self->data);
+ }
+
+ static int wcq_eq(Query *self, Query *o)
+ {
+ return term_eq((Term *)self->data, (Term *)o->data);
  }
 
  Query *wcq_create(Term *term)
@@ -136,11 +144,14 @@ Query *wcq_create(Term *term)
  Query *self = q_create();
 
  self->data = term;
+
  self->type = WILD_CARD_QUERY;
- self->create_weight = NULL;
- self->to_s = &wcq_to_s;
  self->rewrite = &wcq_rewrite;
- self->destroy = &wcq_destroy;
+ self->to_s = &wcq_to_s;
+ self->hash = &wcq_hash;
+ self->eq = &wcq_eq;
+ self->destroy_i = &wcq_destroy;
+ self->create_weight_i = &q_create_weight_unsup;
 
  return self;
  }
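
The wcq_rewrite hunk above swaps the BSD index() calls for strrchr() and casts the pointer difference when computing the prefix the rewrite then uses to seed its term enumeration. A standalone sketch of that prefix split for a term text with a single '*' (the WILD_CHAR/WILD_STRING defines below are assumed to mirror the '?' and '*' markers ferret's parser uses; "dav*" is an invented term):

#include <stdio.h>
#include <string.h>

#define WILD_CHAR   '?'   /* assumed single-character wildcard */
#define WILD_STRING '*'   /* assumed multi-character wildcard  */

int main(void)
{
    const char *text = "dav*";
    const char *first_star = strrchr(text, WILD_STRING);
    const char *first_ques = strrchr(text, WILD_CHAR);

    if (!first_star && !first_ques) {
        printf("no wildcard: rewrite to a plain TermQuery\n");
    } else {
        /* pick the wildcard that starts the pattern, as wcq_rewrite does */
        const char *pattern = (first_ques && first_star > first_ques)
            ? first_ques : first_star;
        int prefix_len = (int)(pattern - text);
        printf("prefix=\"%.*s\" pattern=\"%s\"\n", prefix_len, text, pattern);
    }
    return 0;
}

This prints prefix="dav" pattern="*".
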
data/ext/r_analysis.c CHANGED
@@ -1,3 +1,4 @@
+ #include <regex.h>
  #include "ferret.h"
  #include "analysis.h"
  #include "locale.h"
@@ -9,6 +10,7 @@ static VALUE cAsciiWhiteSpaceTokenizer;
  static VALUE cWhiteSpaceTokenizer;
  static VALUE cAsciiStandardTokenizer;
  static VALUE cStandardTokenizer;
+ static VALUE cRegExpTokenizer;
 
  static VALUE cAsciiLowerCaseFilter;
  static VALUE cLowerCaseFilter;
@@ -23,14 +25,25 @@ static VALUE cWhiteSpaceAnalyzer;
  static VALUE cAsciiStandardAnalyzer;
  static VALUE cStandardAnalyzer;
  static VALUE cPerFieldAnalyzer;
+ static VALUE cRegExpAnalyzer;
 
  //static VALUE cRegexAnalyzer;
  static VALUE cTokenStream;
 
+ /* TokenStream Methods */
  static ID id_next;
  static ID id_reset;
  static ID id_clone;
 
+ /* Analyzer Methods */
+ static ID id_token_stream;
+
+ static VALUE object_space;
+
+ extern TokenStream *ts_create();
+ extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int, int,
+ struct re_registers *);
+
  /****************************************************************************
  *
  * Utility Methods
@@ -111,7 +124,7 @@ frt_set_token(Token *tk, VALUE rt)
  return tk;
  }
 
- #define GET_TK RToken *token; Data_Get_Struct(self, RToken, token);
+ #define GET_TK RToken *token = (RToken *)DATA_PTR(self)
  static VALUE
  frt_token_init(int argc, VALUE *argv, VALUE self)
  {
@@ -212,13 +225,12 @@ frt_ts_mark(void *p)
  }
 
  static void
- frt_ts_free(void *p)
+ frt_ts_free(TokenStream *ts)
  {
- TokenStream *ts = (TokenStream *)p;
  if (object_get(&ts->text) != Qnil) object_del(&ts->text);
  if (ts->sub_ts && (object_get(&ts->sub_ts) != Qnil)) object_del(&ts->sub_ts);
  object_del(ts);
- ts->destroy(ts);
+ ts_deref(ts);
  }
 
  static VALUE
@@ -273,8 +285,7 @@ frt_ts_get_text(VALUE self)
  static VALUE
  frt_ts_next(VALUE self)
  {
- TokenStream *ts;
- Data_Get_Struct(self, TokenStream, ts);
+ TokenStream *ts = (TokenStream *)DATA_PTR(self);
  Token *next = ts->next(ts);
  if (next == NULL) {
  return Qnil;
@@ -287,41 +298,45 @@ frt_ts_next(VALUE self)
  * CWrappedTokenStream
  ****************************************************************************/
 
- void cwrts_destroy(void *p)
+ static void
+ cwrts_destroy(TokenStream *ts)
  {
- TokenStream *ts = (TokenStream *)p;
+ rb_hash_delete(object_space, LONG2NUM((long)ts->data));
  free(ts->token);
  free(ts);
  }
 
- Token *cwrts_next(TokenStream *ts)
+ static Token *
+ cwrts_next(TokenStream *ts)
  {
  VALUE rts = (VALUE)ts->data;
  VALUE rtoken = rb_funcall(rts, id_next, 0);
  return frt_set_token(ts->token, rtoken);
  }
 
- void cwrts_reset(TokenStream *ts, char *text)
+ static void
+ cwrts_reset(TokenStream *ts, char *text)
  {
  VALUE rts = (VALUE)ts->data;
  ts->t = ts->text = text;
  rb_funcall(rts, id_reset, 1, rb_str_new2(text));
  }
 
- void cwrts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
+ static void
+ cwrts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
  {
  VALUE rorig_ts = (VALUE)orig_ts->data;
  new_ts->data = (void *)rb_funcall(rorig_ts, id_clone, 0);
  }
 
  static TokenStream *
- get_cwrapped_rts(VALUE rts, bool *self_destroy)
+ frt_get_cwrapped_rts(VALUE rts)
  {
  TokenStream *ts;
  switch (TYPE(rts)) {
  case T_DATA:
  Data_Get_Struct(rts, TokenStream, ts);
- *self_destroy = true;
+ ref(ts);
  break;
  default:
  ts = ALLOC(TokenStream);
@@ -332,12 +347,184 @@ get_cwrapped_rts(VALUE rts, bool *self_destroy)
  ts->clone_i = &cwrts_clone_i;
  ts->destroy = &cwrts_destroy;
  ts->sub_ts = NULL;
- *self_destroy = false;
+ // prevent from being garbage collected
+ rb_hash_aset(object_space, LONG2NUM(rts), rts);
+ ts->ref_cnt = 1;
  break;
  }
  return ts;
  }
 
+ /****************************************************************************
+ * RegExpTokenStream
+ ****************************************************************************/
+
+ #define P "[_\\/.,-]"
+ #define HASDIGIT "\\w*\\d\\w*"
+ #define ALPHA "[-_[:alpha:]]"
+ #define ALNUM "[-_[:alnum:]]"
+
+ static char *token_re =
+ ALPHA "+(('" ALPHA "+)+|\\.(" ALPHA "\\.)+|"
+ "(@|\\&)\\w+([-.]\\w+)*|:\\/\\/" ALNUM "+([-.\\/]" ALNUM "+)*)?"
+ "|\\w+(([-._]\\w+)*\\@\\w+([-.]\\w+)+"
+ "|" P HASDIGIT "(" P "\\w+" P HASDIGIT ")*(" P "\\w+)?"
+ "|(\\.\\w+)+"
+ "|"
+ ")";
+ static VALUE rtoken_re;
+
+ typedef struct RegExpTokenStream {
+ VALUE rtext;
+ VALUE regex;
+ VALUE proc;
+ int curr_ind;
+ } RegExpTokenStream;
+
+ static void
+ rets_destroy(TokenStream *ts)
+ {
+ rb_hash_delete(object_space, LONG2NUM((long)object_get(ts)));
+ free(ts->data);
+ free(ts->token);
+ free(ts);
+ }
+
+ static void
+ frt_rets_free(TokenStream *ts)
+ {
+ object_del(ts);
+ ts_deref(ts);
+ }
+
+ static void
+ frt_rets_mark(TokenStream *ts)
+ {
+ RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
+ rb_gc_mark(rets->rtext);
+ rb_gc_mark(rets->regex);
+ rb_gc_mark(rets->proc);
+ }
+
+ static VALUE
+ frt_rets_set_text(VALUE self, VALUE rtext)
+ {
+ TokenStream *ts;
+ RegExpTokenStream *rets;
+ Data_Get_Struct(self, TokenStream, ts);
+
+ StringValue(rtext);
+ rets = (RegExpTokenStream *)ts->data;
+ rets->rtext = rtext;
+ rets->curr_ind = 0;
+
+ return rtext;
+ }
+
+ static VALUE
+ frt_rets_get_text(VALUE self)
+ {
+ TokenStream *ts;
+ RegExpTokenStream *rets;
+ Data_Get_Struct(self, TokenStream, ts);
+ rets = (RegExpTokenStream *)ts->data;
+ return rets->rtext;
+ }
+
+ static Token *
+ rets_next(TokenStream *ts)
+ {
+ static struct re_registers regs;
+ int ret, beg, end;
+ RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
+ struct RString *rtext = RSTRING(rets->rtext);
+ Check_Type(rets->regex, T_REGEXP);
+ ret = ruby_re_search(RREGEXP(rets->regex)->ptr,
+ rtext->ptr, rtext->len,
+ rets->curr_ind, rtext->len - rets->curr_ind,
+ &regs);
+
+ if (ret == -2) rb_raise(rb_eStandardError, "regexp buffer overflow");
+ if (ret < 0) return NULL; /* not matched */
+
+ beg = regs.beg[0];
+ rets->curr_ind = end = regs.end[0];
+ if (NIL_P(rets->proc)) {
+ return tk_set(ts->token, rtext->ptr + beg, end - beg, beg, end, 1);
+ } else {
+ VALUE rtok = rb_str_new(rtext->ptr + beg, end - beg);
+ rtok = rb_funcall(rets->proc, id_call, 1, rtok);
+ return tk_set(ts->token, RSTRING(rtok)->ptr, RSTRING(rtok)->len, beg, end, 1);
+ }
+ }
+
+ static void
+ rets_reset(TokenStream *ts, char *text)
+ {
+ RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
+ rets->rtext = rb_str_new2(text);
+ rets->curr_ind = 0;
+ }
+
+ void
+ rets_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
+ {
+ RegExpTokenStream *new_rets = ALLOC(RegExpTokenStream);
+ RegExpTokenStream *orig_rets = (RegExpTokenStream *)orig_ts->data;
+ memcpy(new_rets, orig_rets, sizeof(RegExpTokenStream));
+ new_ts->data = new_rets;
+ }
+
+ static TokenStream *
+ rets_create(VALUE rtext, VALUE regex, VALUE proc)
+ {
+ RegExpTokenStream *rets;
+ TokenStream *ts;
+
+ if (rtext != Qnil) {
+ rtext = StringValue(rtext);
+ }
+ ts = ts_create();
+ ts->reset = &rets_reset;
+ ts->next = &rets_next;
+ ts->clone_i = &rets_clone_i;
+ ts->destroy = &rets_destroy;
+ ts->ref_cnt = 1;
+
+ rets = ALLOC(RegExpTokenStream);
+ rets->curr_ind = 0;
+ rets->rtext = rtext;
+ rets->proc = proc;
+ if (NIL_P(regex)) {
+ rets->regex = rtoken_re;
+ } else {
+ Check_Type(regex, T_REGEXP);
+ rets->regex = regex;
+ }
+
+ ts->data = rets;
+
+ return ts;
+ }
+
+ static VALUE
+ frt_rets_init(int argc, VALUE *argv, VALUE self)
+ {
+ VALUE rtext, regex, proc;
+ TokenStream *ts;
+
+ rb_scan_args(argc, argv, "11&", &rtext, &regex, &proc);
+
+ ts = rets_create(rtext, regex, proc);
+
+ Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
+ object_add(ts, self);
+ /* no need to add to object space as it is going to ruby space
+ * rb_hash_aset(object_space, LONG2NUM((long)self), self);
+ */
+ return self;
+ }
+
  /****************************************************************************
  * Tokenizers
  ****************************************************************************/
@@ -394,10 +581,8 @@ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
  static VALUE
  frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
  {
- bool self_destroy;
- TokenStream *ts = lowercase_filter_create(
- get_cwrapped_rts(rsub_ts, &self_destroy));
- ts->destroy_sub = !self_destroy;
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
+ ts = lowercase_filter_create(ts);
  object_add(&ts->sub_ts, rsub_ts);
 
  Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
@@ -408,10 +593,8 @@ frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
  static VALUE
  frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
  {
- bool self_destroy;
- TokenStream *ts = mb_lowercase_filter_create(
- get_cwrapped_rts(rsub_ts, &self_destroy));
- ts->destroy_sub = !self_destroy;
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
+ ts = mb_lowercase_filter_create(ts);
  object_add(&ts->sub_ts, rsub_ts);
 
  Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
@@ -423,19 +606,17 @@ static VALUE
  frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
  {
  VALUE rsub_ts, rstop_words;
- bool self_destroy;
  TokenStream *ts;
  rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
+ ts = frt_get_cwrapped_rts(rsub_ts);
  if (rstop_words != Qnil) {
  char **stop_words = get_stopwords(rstop_words);
- ts = stop_filter_create_with_words(
- get_cwrapped_rts(rsub_ts, &self_destroy), (const char **)stop_words);
+ ts = stop_filter_create_with_words(ts, (const char **)stop_words);
+
  free(stop_words);
  } else {
- ts = stop_filter_create(
- get_cwrapped_rts(rsub_ts, &self_destroy));
+ ts = stop_filter_create(ts);
  }
- ts->destroy_sub = !self_destroy;
  object_add(&ts->sub_ts, rsub_ts);
 
  Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
@@ -449,16 +630,14 @@ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
  VALUE rsub_ts, ralgorithm, rcharenc;
  char *algorithm = "english";
  char *charenc = NULL;
- bool self_destroy;
  TokenStream *ts;
  rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
+ ts = frt_get_cwrapped_rts(rsub_ts);
  switch (argc) {
  case 3: charenc = RSTRING(rb_obj_as_string(rcharenc))->ptr;
  case 2: algorithm = RSTRING(rb_obj_as_string(ralgorithm))->ptr;
  }
- ts = stem_filter_create(
- get_cwrapped_rts(rsub_ts, &self_destroy), algorithm, charenc);
- ts->destroy_sub = !self_destroy;
+ ts = stem_filter_create(ts, algorithm, charenc);
  object_add(&ts->sub_ts, rsub_ts);
 
  Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
@@ -472,34 +651,49 @@ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
  *
  ****************************************************************************/
 
- Analyzer *get_cwrapped_analyzer(ranalyzer)
+ /****************************************************************************
+ * CWrappedAnalyzer Methods
+ ****************************************************************************/
+
+ static void
+ cwa_destroy(Analyzer *a)
+ {
+ rb_hash_delete(object_space, LONG2NUM((long)a->data));
+ a_standard_destroy(a);
+ }
+
+ static TokenStream *
+ cwa_get_ts(Analyzer *a, char *field, char *text)
+ {
+ VALUE ranalyzer = (VALUE)a->data;
+ VALUE rts = rb_funcall(ranalyzer, id_token_stream, 2,
+ rb_str_new2(field), rb_str_new2(text));
+ return frt_get_cwrapped_rts(rts);
+ }
+
+ Analyzer *
+ frt_get_cwrapped_analyzer(ranalyzer)
  {
  Analyzer *a = NULL;
  switch (TYPE(ranalyzer)) {
  case T_DATA:
  Data_Get_Struct(ranalyzer, Analyzer, a);
+ ref(a);
  break;
  default:
- printf("Oh RFuck\n");
- //ts = ALLOC(TokenStream);
- //ts->token = ALLOC(Token);
- //ts->data = (void *)rts;
- //ts->next = &cwrts_next;
- //ts->reset = &cwrts_reset;
- //ts->clone_i = &cwrts_clone_i;
- //ts->destroy = &cwrts_destroy;
- //ts->sub_ts = NULL;
+ a = analyzer_create((void *)ranalyzer, NULL, &cwa_destroy, &cwa_get_ts);
+ // prevent from being garbage collected
+ rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
  break;
  }
  return a;
  }
 
  static void
- frt_analyzer_free(void *p)
+ frt_analyzer_free(Analyzer *a)
  {
- Analyzer *a = (Analyzer *)p;
  object_del(a);
- a->destroy(a);
+ a_deref(a);
  }
 
  VALUE
@@ -513,13 +707,16 @@ frt_get_analyzer(Analyzer *a)
  static VALUE
  frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
  {
- Analyzer *a = ((struct RData *)(self))->data;
+ TokenStream *ts;
+ Analyzer *a = (Analyzer *)DATA_PTR(self);
+
  rfield = rb_obj_as_string(rfield);
  rstring = rb_obj_as_string(rstring);
 
- TokenStream *ts = a_get_new_ts(a, RSTRING(rfield)->ptr, RSTRING(rstring)->ptr);
+ ts = a_get_new_ts(a, RSTRING(rfield)->ptr, RSTRING(rstring)->ptr);
 
- object_set(&ts->text, rstring); // Make sure that there is no entry already
+ /* Make sure that there is no entry already */
+ object_set(&ts->text, rstring);
  return get_token_stream(ts);
  }
 
@@ -533,8 +730,9 @@ frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
  static VALUE
  frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
  {
+ Analyzer *a;
  GET_LOWER(false);
- Analyzer *a = whitespace_analyzer_create(lower);
+ a = whitespace_analyzer_create(lower);
  Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
  object_add(a, self);
  return self;
@@ -544,8 +742,9 @@ frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
  static VALUE
  frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
  {
+ Analyzer *a;
  GET_LOWER(false);
- Analyzer *a = mb_whitespace_analyzer_create(lower);
+ a = mb_whitespace_analyzer_create(lower);
  Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
  object_add(a, self);
  return self;
@@ -555,8 +754,9 @@ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
  static VALUE
  frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
  {
+ Analyzer *a;
  GET_LOWER(true);
- Analyzer *a = letter_analyzer_create(lower);
+ a = letter_analyzer_create(lower);
  Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
  object_add(a, self);
  return self;
@@ -566,8 +766,9 @@ frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
  static VALUE
  frt_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
  {
+ Analyzer *a;
  GET_LOWER(true);
- Analyzer *a = mb_letter_analyzer_create(lower);
+ a = mb_letter_analyzer_create(lower);
  Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
  object_add(a, self);
  return self;
@@ -628,13 +829,29 @@ frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
  return self;
  }
 
+ void
+ frt_h_mark_values_i(void *key, void *value, void *arg)
+ {
+ frt_gc_mark(value);
+ }
+
+ void
+ frt_pfa_mark(void *p)
+ {
+ Analyzer *a = (Analyzer *)p;
+ PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)a->data;
+ frt_gc_mark(pfa->def);
+ h_each(pfa->dict, &frt_h_mark_values_i, NULL);
+ }
+
  /*** PerFieldAnalyzer ***/
+
  static VALUE
  frt_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
  {
- Analyzer *def = get_cwrapped_analyzer(ranalyzer);
- Analyzer *a = per_field_analyzer_create(def, false);
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
+ Analyzer *def = frt_get_cwrapped_analyzer(ranalyzer);
+ Analyzer *a = per_field_analyzer_create(def);
+ Frt_Wrap_Struct(self, &frt_pfa_mark, &frt_analyzer_free, a);
  object_add(a, self);
  return self;
  }
@@ -644,42 +861,48 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
  {
  Analyzer *pfa, *a;
  Data_Get_Struct(self, Analyzer, pfa);
- Data_Get_Struct(ranalyzer, Analyzer, a);
+ a = frt_get_cwrapped_analyzer(ranalyzer);
 
  pfa_add_field(pfa, StringValuePtr(rfield), a);
  return self;
  }
 
+ /*** RegExpAnalyzer ***/
 
- /** RegexAnalyzer **/
- /*
- static VALUE
- frt_regex_analyzer_init(VALUE self)
+ static void
+ frt_re_analyzer_mark(Analyzer *a)
  {
- Analyzer *a = regex_analyzer_create();
- // keine Ahnung warum hier das Makro und nicht Data_Wrap_Struct:
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
- // wofuer?:
- object_add(a, self);
- return self;
+ frt_gc_mark(a->current_ts);
  }
 
- // convenience method
- // XXX this sets the locale for the entire program
- static VALUE
- frt_regex_analyzer_token_stream(VALUE self, VALUE field, VALUE string)
+ static void
+ re_analyzer_destroy(Analyzer *a)
  {
- Analyzer *a =((struct RData *)(self))->data;
- TokenStream *ts = a->get_ts( a, StringValuePtr(field), StringValuePtr(string) );
- // already freed via analyzer's free()
- VALUE token_stream = Data_Wrap_Struct(cTokenStream, NULL, NULL, ts);
- return token_stream;
+ free(a->data);
+ a_standard_destroy(a);
  }
- */
- /** /RegexAnalyzer **/
 
- /** TokenStream **/
- /** /TokenStream **/
+ static VALUE
+ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
+ {
+ VALUE lower, rets, regex, proc;
+ Analyzer *a;
+ TokenStream *ts;
+ rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
+
+ ts = rets_create(Qnil, regex, proc);
+ rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
+ ref(ts);
+ rb_hash_aset(object_space, LONG2NUM((long)rets), rets);
+ object_add(ts, rets);
+
+ if (lower != Qfalse) ts = mb_lowercase_filter_create(ts);
+
+ a = analyzer_create(NULL, ts, &re_analyzer_destroy, NULL);
+ Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
+ object_add(a, self);
+ return self;
+ }
 
  /****************************************************************************
  *
@@ -710,10 +933,17 @@ static VALUE frt_setlocale(VALUE self, VALUE locale)
  void
  Init_analysis(void)
  {
+ /* TokenStream Methods */
  id_next = rb_intern("next");
  id_reset = rb_intern("text=");
  id_clone = rb_intern("clone");
 
+ /* Analyzer Methods */
+ id_token_stream = rb_intern("token_stream");
+
+ object_space = rb_hash_new();
+ rb_define_const(mFerret, "OBJECT_SPACE", object_space);
+
  /*** * * Locale stuff * * ***/
  frt_locale = setlocale(LC_ALL, "");
  rb_define_singleton_method(mFerret, "locale=", frt_setlocale, 1);
@@ -790,6 +1020,18 @@ Init_analysis(void)
  rb_define_method(cStandardTokenizer, "initialize",
  frt_standard_tokenizer_init, 1);
 
+ /*** * * RegExpTokenizer * * ***/
+ cRegExpTokenizer =
+ rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
+ rtoken_re = rb_reg_new(token_re, strlen(token_re), 0);
+ rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
+ rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
+ rb_define_method(cRegExpTokenizer, "initialize",
+ frt_rets_init, -1);
+ rb_define_method(cRegExpTokenizer, "next", frt_ts_next, 0);
+ rb_define_method(cRegExpTokenizer, "text=", frt_rets_set_text, 1);
+ rb_define_method(cRegExpTokenizer, "text", frt_rets_get_text, 0);
+
  /***************/
  /*** Filters ***/
  /***************/
@@ -911,7 +1153,13 @@ Init_analysis(void)
  rb_define_method(cPerFieldAnalyzer, "[]=",
  frt_per_field_analyzer_add_field, 2);
 
- /** RegexAnalyzer **/
+ /*** * * RegexAnalyzer * * ***/
+ cRegExpAnalyzer =
+ rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
+ rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
+ rb_define_method(cRegExpAnalyzer, "initialize",
+ frt_re_analyzer_init, -1);
+
  /*
  cRegexAnalyzer =
  rb_define_class_under(mAnalysis, "RegexAnalyzer", cAnalyzer);
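
A pattern that recurs through the r_analysis.c changes above: when a C struct (a wrapped TokenStream or Analyzer) holds the only reference to a Ruby object, the binding now stores that VALUE in the module-level OBJECT_SPACE hash so the garbage collector cannot reclaim it, and deletes the entry again in the matching destroy callback (cwrts_destroy, rets_destroy, cwa_destroy). A stripped-down sketch of that pinning technique for a generic extension; PinExample, frt_pin and frt_unpin are hypothetical names, not part of ferret:

#include <ruby.h>

static VALUE object_space;   /* Ruby hash used as a set of pinned objects */

/* keep obj alive while only C structs reference it */
static void frt_pin(VALUE obj)
{
    rb_hash_aset(object_space, LONG2NUM((long)obj), obj);
}

/* drop the pin once the owning C struct is destroyed */
static void frt_unpin(VALUE obj)
{
    rb_hash_delete(object_space, LONG2NUM((long)obj));
}

void Init_pin_example(void)
{
    VALUE mPinExample = rb_define_module("PinExample");
    VALUE tmp;

    /* exposing the hash as a constant keeps the hash itself reachable,
     * just as Ferret::OBJECT_SPACE does in the diff above */
    object_space = rb_hash_new();
    rb_define_const(mPinExample, "OBJECT_SPACE", object_space);

    /* example round trip: pin while a C wrapper would own it, unpin on destroy */
    tmp = rb_str_new2("pinned while wrapped");
    frt_pin(tmp);
    frt_unpin(tmp);
}

An alternative would be rb_gc_register_address() on each C-held VALUE; a single hash keyed by the object's address keeps add/remove cheap and leaves the pinned set inspectable from Ruby.
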