ferret 0.9.1 → 0.9.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README +6 -5
- data/Rakefile +34 -13
- data/TODO +1 -0
- data/TUTORIAL +1 -1
- data/ext/analysis.c +87 -70
- data/ext/analysis.h +18 -6
- data/ext/array.c +1 -2
- data/ext/array.h +1 -1
- data/ext/bitvector.c +10 -6
- data/ext/bitvector.h +2 -2
- data/ext/compound_io.c +30 -27
- data/ext/document.c +15 -15
- data/ext/document.h +5 -5
- data/ext/except.c +2 -0
- data/ext/except.h +25 -23
- data/ext/extconf.rb +1 -0
- data/ext/ferret.c +10 -8
- data/ext/ferret.h +9 -8
- data/ext/field.c +29 -25
- data/ext/filter.c +52 -14
- data/ext/frtio.h +13 -0
- data/ext/fs_store.c +115 -170
- data/ext/global.c +9 -8
- data/ext/global.h +17 -13
- data/ext/hash.c +13 -19
- data/ext/hash.h +11 -11
- data/ext/hashset.c +5 -7
- data/ext/hashset.h +9 -8
- data/ext/helper.c +1 -1
- data/ext/helper.h +2 -1
- data/ext/inc/except.h +25 -23
- data/ext/inc/lang.h +11 -1
- data/ext/ind.c +33 -21
- data/ext/index.h +44 -39
- data/ext/index_io.c +61 -57
- data/ext/index_rw.c +418 -361
- data/ext/lang.c +10 -0
- data/ext/lang.h +11 -1
- data/ext/nix_io.c +135 -0
- data/ext/priorityqueue.c +16 -16
- data/ext/priorityqueue.h +9 -6
- data/ext/q_boolean.c +128 -76
- data/ext/q_const_score.c +20 -20
- data/ext/q_filtered_query.c +20 -20
- data/ext/q_fuzzy.c +37 -23
- data/ext/q_match_all.c +15 -19
- data/ext/q_multi_phrase.c +87 -46
- data/ext/q_parser.c +247 -119
- data/ext/q_phrase.c +86 -52
- data/ext/q_prefix.c +25 -14
- data/ext/q_range.c +59 -14
- data/ext/q_span.c +263 -172
- data/ext/q_term.c +62 -51
- data/ext/q_wildcard.c +24 -13
- data/ext/r_analysis.c +328 -80
- data/ext/r_doc.c +11 -6
- data/ext/r_index_io.c +40 -32
- data/ext/r_qparser.c +15 -14
- data/ext/r_search.c +270 -152
- data/ext/r_store.c +32 -17
- data/ext/ram_store.c +38 -22
- data/ext/search.c +617 -87
- data/ext/search.h +227 -163
- data/ext/similarity.c +54 -45
- data/ext/similarity.h +3 -3
- data/ext/sort.c +132 -53
- data/ext/store.c +21 -2
- data/ext/store.h +14 -14
- data/ext/tags +4322 -232
- data/ext/term.c +140 -109
- data/ext/termdocs.c +74 -60
- data/ext/vector.c +181 -152
- data/ext/w32_io.c +150 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
- data/lib/ferret/document/field.rb +1 -1
- data/lib/ferret/index/field_infos.rb +1 -1
- data/lib/ferret/index/term.rb +1 -1
- data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
- data/lib/ferret/search.rb +1 -0
- data/lib/ferret/search/boolean_query.rb +0 -4
- data/lib/ferret/search/index_searcher.rb +21 -8
- data/lib/ferret/search/multi_phrase_query.rb +7 -0
- data/lib/ferret/search/multi_searcher.rb +261 -0
- data/lib/ferret/search/phrase_query.rb +1 -1
- data/lib/ferret/search/query.rb +34 -5
- data/lib/ferret/search/sort.rb +7 -3
- data/lib/ferret/search/sort_field.rb +8 -4
- data/lib/ferret/store/fs_store.rb +13 -6
- data/lib/ferret/store/index_io.rb +0 -14
- data/lib/ferret/store/ram_store.rb +3 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/analysis/ctc_analyzer.rb +131 -0
- data/test/unit/analysis/ctc_tokenstream.rb +98 -9
- data/test/unit/index/tc_index.rb +40 -1
- data/test/unit/index/tc_term.rb +7 -0
- data/test/unit/index/th_doc.rb +8 -0
- data/test/unit/query_parser/tc_query_parser.rb +6 -4
- data/test/unit/search/rtc_sort_field.rb +6 -6
- data/test/unit/search/tc_index_searcher.rb +8 -0
- data/test/unit/search/tc_multi_searcher.rb +275 -0
- data/test/unit/search/tc_multi_searcher2.rb +126 -0
- data/test/unit/search/tc_search_and_sort.rb +66 -0
- metadata +31 -26
- data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/q_term.c
CHANGED
@@ -18,6 +18,14 @@ Scorer *tw_scorer(Weight *self, IndexReader *ir)
|
|
18
18
|
|
19
19
|
Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
20
20
|
{
|
21
|
+
Explanation *qnorm_expl;
|
22
|
+
Explanation *field_expl;
|
23
|
+
Scorer *scorer;
|
24
|
+
Explanation *tf_expl;
|
25
|
+
uchar *field_norms;
|
26
|
+
float field_norm;
|
27
|
+
Explanation *field_norm_expl;
|
28
|
+
|
21
29
|
char *query_str = self->query->to_s(self->query, "");
|
22
30
|
TermQuery *tq = (TermQuery *)self->query->data;
|
23
31
|
Term *term = tq->term;
|
@@ -26,14 +34,14 @@ Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
26
34
|
Explanation *expl = expl_create(0.0,
|
27
35
|
strfmt("weight(%s in %d), product of:", query_str, doc_num));
|
28
36
|
|
29
|
-
|
30
|
-
|
37
|
+
/* We need two of these as it's included in both the query explanation
|
38
|
+
* and the field explanation */
|
31
39
|
Explanation *idf_expl1 = expl_create(self->idf,
|
32
40
|
strfmt("idf(doc_freq=%d)", ir->doc_freq(ir, tq->term)));
|
33
41
|
Explanation *idf_expl2 = expl_create(self->idf,
|
34
42
|
strfmt("idf(doc_freq=%d)", ir->doc_freq(ir, tq->term)));
|
35
43
|
|
36
|
-
|
44
|
+
/* explain query weight */
|
37
45
|
Explanation *query_expl = expl_create(0.0,
|
38
46
|
strfmt("query_weight(%s), product of:", query_str));
|
39
47
|
free(query_str);
|
@@ -44,33 +52,35 @@ Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
44
52
|
|
45
53
|
expl_add_detail(query_expl, idf_expl1);
|
46
54
|
|
47
|
-
|
55
|
+
qnorm_expl = expl_create(self->qnorm, estrdup("query_norm"));
|
48
56
|
expl_add_detail(query_expl, qnorm_expl);
|
49
57
|
|
50
58
|
query_expl->value = self->query->boost * idf_expl1->value * qnorm_expl->value;
|
51
59
|
|
52
60
|
expl_add_detail(expl, query_expl);
|
53
61
|
|
54
|
-
|
55
|
-
|
62
|
+
/* explain field weight */
|
63
|
+
field_expl = expl_create(0.0,
|
56
64
|
strfmt("field_weight(%s:%s in %d), product of:",
|
57
65
|
field_name, term->text, doc_num));
|
58
66
|
|
59
|
-
|
60
|
-
|
67
|
+
scorer = self->scorer(self, ir);
|
68
|
+
tf_expl = scorer->explain(scorer, doc_num);
|
61
69
|
scorer->destroy(scorer);
|
62
70
|
expl_add_detail(field_expl, tf_expl);
|
63
71
|
expl_add_detail(field_expl, idf_expl2);
|
64
72
|
|
65
|
-
|
66
|
-
|
67
|
-
|
73
|
+
field_norms = ir->get_norms(ir, field_name);
|
74
|
+
field_norm = (field_norms
|
75
|
+
? sim_decode_norm(self->similarity, field_norms[doc_num])
|
76
|
+
: (float)0.0);
|
77
|
+
field_norm_expl = expl_create(field_norm,
|
68
78
|
strfmt("field_norm(field=%s, doc=%d)", field_name, doc_num));
|
69
79
|
expl_add_detail(field_expl, field_norm_expl);
|
70
80
|
|
71
81
|
field_expl->value = tf_expl->value * idf_expl2->value * field_norm_expl->value;
|
72
82
|
|
73
|
-
|
83
|
+
/* combine them */
|
74
84
|
if (query_expl->value == 1.0) {
|
75
85
|
expl_destoy(expl);
|
76
86
|
return field_expl;
|
@@ -86,30 +96,18 @@ char *tw_to_s(Weight *self)
|
|
86
96
|
return strfmt("TermWeight(%f)", self->value);
|
87
97
|
}
|
88
98
|
|
89
|
-
void tw_destroy(void *p)
|
90
|
-
{
|
91
|
-
free(p);
|
92
|
-
}
|
93
|
-
|
94
99
|
Weight *tw_create(Query *query, Searcher *searcher)
|
95
100
|
{
|
96
|
-
Weight *self =
|
97
|
-
ZEROSET(self, Weight, 1);
|
98
|
-
self->get_query = &w_get_query;
|
99
|
-
self->get_value = &w_get_value;
|
100
|
-
self->normalize = &w_normalize;
|
101
|
+
Weight *self = w_create(query);
|
101
102
|
self->scorer = &tw_scorer;
|
102
103
|
self->explain = &tw_explain;
|
103
104
|
self->to_s = &tw_to_s;
|
104
|
-
self->destroy = &tw_destroy;
|
105
105
|
self->sum_of_squared_weights = &w_sum_of_squared_weights;
|
106
106
|
|
107
107
|
self->similarity = query->get_similarity(query, searcher);
|
108
108
|
self->idf = sim_idf(self->similarity,
|
109
109
|
searcher->doc_freq(searcher, ((TermQuery *)query->data)->term),
|
110
110
|
searcher->max_doc(searcher)); // compute idf
|
111
|
-
self->query = query;
|
112
|
-
self->value = 0.0;
|
113
111
|
|
114
112
|
return self;
|
115
113
|
}
|
@@ -120,20 +118,19 @@ Weight *tw_create(Query *query, Searcher *searcher)
|
|
120
118
|
*
|
121
119
|
***************************************************************************/
|
122
120
|
|
123
|
-
void tq_destroy(
|
121
|
+
void tq_destroy(Query *self)
|
124
122
|
{
|
125
|
-
|
126
|
-
TermQuery *tq = q->data;
|
123
|
+
TermQuery *tq = self->data;
|
127
124
|
term_destroy(tq->term);
|
128
125
|
free(tq);
|
129
|
-
|
126
|
+
q_destroy_i(self);
|
130
127
|
}
|
131
128
|
|
132
129
|
char *tq_to_s(Query *self, char *field)
|
133
130
|
{
|
134
131
|
Term *term = ((TermQuery *)self->data)->term;
|
135
|
-
|
136
|
-
|
132
|
+
size_t flen = strlen(term->field);
|
133
|
+
size_t tlen = strlen(term->text);
|
137
134
|
char *buffer = ALLOC_N(char, 34 + flen + tlen);
|
138
135
|
char *b = buffer;
|
139
136
|
if (strcmp(field, term->field) != 0) {
|
@@ -151,10 +148,21 @@ char *tq_to_s(Query *self, char *field)
|
|
151
148
|
return buffer;
|
152
149
|
}
|
153
150
|
|
154
|
-
void tq_extract_terms(Query *self,
|
151
|
+
static void tq_extract_terms(Query *self, HashSet *terms)
|
155
152
|
{
|
156
153
|
Term *term = ((TermQuery *)self->data)->term;
|
157
|
-
|
154
|
+
hs_add(terms, term_clone(term));
|
155
|
+
}
|
156
|
+
|
157
|
+
static uint tq_hash(Query *self)
|
158
|
+
{
|
159
|
+
return term_hash(((TermQuery *)self->data)->term);
|
160
|
+
}
|
161
|
+
|
162
|
+
static int tq_eq(Query *self, Query *o)
|
163
|
+
{
|
164
|
+
return term_eq(((TermQuery *)self->data)->term,
|
165
|
+
((TermQuery *)o->data)->term);
|
158
166
|
}
|
159
167
|
|
160
168
|
Query *tq_create(Term *term)
|
@@ -164,14 +172,18 @@ Query *tq_create(Term *term)
|
|
164
172
|
tq->term = term;
|
165
173
|
self->type = TERM_QUERY;
|
166
174
|
self->data = tq;
|
167
|
-
self->create_weight = &tw_create;
|
168
175
|
self->extract_terms = &tq_extract_terms;
|
169
176
|
self->to_s = &tq_to_s;
|
170
|
-
self->
|
177
|
+
self->hash = &tq_hash;
|
178
|
+
self->eq = &tq_eq;
|
179
|
+
|
180
|
+
self->destroy_i = &tq_destroy;
|
181
|
+
self->create_weight_i = &tw_create;
|
171
182
|
|
172
183
|
return self;
|
173
184
|
}
|
174
185
|
|
186
|
+
|
175
187
|
/***************************************************************************
|
176
188
|
*
|
177
189
|
* TermScorer
|
@@ -183,13 +195,13 @@ float tsc_score(Scorer *self)
|
|
183
195
|
TermScorer *ts = (TermScorer *)self->data;
|
184
196
|
int freq = ts->freqs[ts->pointer];
|
185
197
|
float score;
|
186
|
-
|
187
|
-
if (freq < SCORE_CACHE_SIZE) {
|
188
|
-
score = ts->score_cache[freq];
|
198
|
+
/* compute tf(f)*weight */
|
199
|
+
if (freq < SCORE_CACHE_SIZE) { /* check cache */
|
200
|
+
score = ts->score_cache[freq]; /* cache hit */
|
189
201
|
} else {
|
190
|
-
score = sim_tf(self->similarity, freq) * ts->weight_value;
|
202
|
+
score = sim_tf(self->similarity, (float)freq) * ts->weight_value; /* cache miss */
|
191
203
|
}
|
192
|
-
|
204
|
+
/* normalize for field */
|
193
205
|
score *= sim_decode_norm(self->similarity, ts->norms[self->doc]);
|
194
206
|
return score;
|
195
207
|
}
|
@@ -217,8 +229,9 @@ bool tsc_next(Scorer *self)
|
|
217
229
|
bool tsc_skip_to(Scorer *self, int doc_num)
|
218
230
|
{
|
219
231
|
TermScorer *ts = (TermScorer *)self->data;
|
220
|
-
|
221
|
-
|
232
|
+
TermDocEnum *tde = ts->tde;
|
233
|
+
|
234
|
+
/* first scan in cache */
|
222
235
|
while (++(ts->pointer) < ts->pointer_max) {
|
223
236
|
if (ts->docs[ts->pointer] >= doc_num) {
|
224
237
|
self->doc = ts->docs[ts->pointer];
|
@@ -226,10 +239,8 @@ bool tsc_skip_to(Scorer *self, int doc_num)
|
|
226
239
|
}
|
227
240
|
}
|
228
241
|
|
229
|
-
|
230
|
-
|
231
|
-
bool result = tde->skip_to(tde, doc_num);
|
232
|
-
if (result) {
|
242
|
+
/* not found in cache, seek underlying stream */
|
243
|
+
if (tde->skip_to(tde, doc_num)) {
|
233
244
|
ts->pointer_max = 1;
|
234
245
|
ts->pointer = 0;
|
235
246
|
ts->docs[0] = self->doc = tde->doc_num(tde);
|
@@ -242,6 +253,7 @@ bool tsc_skip_to(Scorer *self, int doc_num)
|
|
242
253
|
|
243
254
|
Explanation *tsc_explain(Scorer *self, int doc_num)
|
244
255
|
{
|
256
|
+
Explanation *tf_explanation;
|
245
257
|
TermScorer *ts = (TermScorer *)self->data;
|
246
258
|
Query *query = ts->weight->get_query(ts->weight);
|
247
259
|
Term *term = ((TermQuery *)query->data)->term;
|
@@ -260,18 +272,17 @@ Explanation *tsc_explain(Scorer *self, int doc_num)
|
|
260
272
|
}
|
261
273
|
tde->close(tde);
|
262
274
|
ts->tde = NULL;
|
263
|
-
|
275
|
+
tf_explanation = expl_create(sim_tf(self->similarity, (float)tf),
|
264
276
|
strfmt("tf(term_freq(%s:%s)=%d)", term->field, term->text, tf));
|
265
277
|
|
266
278
|
return tf_explanation;
|
267
279
|
}
|
268
280
|
|
269
|
-
void tsc_destroy(
|
281
|
+
void tsc_destroy(Scorer *self)
|
270
282
|
{
|
271
|
-
Scorer *self = (Scorer *)p;
|
272
283
|
TermScorer *ts = (TermScorer *)self->data;
|
273
284
|
if (ts->tde) ts->tde->close(ts->tde);
|
274
|
-
|
285
|
+
scorer_destroy_i(self);
|
275
286
|
}
|
276
287
|
|
277
288
|
Scorer *tsc_create(Weight *weight, TermDocEnum *tde, uchar *norms)
|
@@ -287,7 +298,7 @@ Scorer *tsc_create(Weight *weight, TermDocEnum *tde, uchar *norms)
|
|
287
298
|
ts->weight_value = weight->value;
|
288
299
|
|
289
300
|
for (i = 0; i < SCORE_CACHE_SIZE; i++) {
|
290
|
-
ts->score_cache[i] = sim_tf(self->similarity, i) * ts->weight_value;
|
301
|
+
ts->score_cache[i] = sim_tf(self->similarity, (float)i) * ts->weight_value;
|
291
302
|
}
|
292
303
|
|
293
304
|
self->score = &tsc_score;
|
data/ext/q_wildcard.c
CHANGED
@@ -11,8 +11,8 @@ char *wcq_to_s(Query *self, char *field)
|
|
11
11
|
{
|
12
12
|
char *buffer, *bptr;
|
13
13
|
Term *term = (Term *)self->data;
|
14
|
-
|
15
|
-
|
14
|
+
size_t tlen = strlen(term->text);
|
15
|
+
size_t flen = strlen(term->field);
|
16
16
|
bptr = buffer = ALLOC_N(char, tlen + flen + 35);
|
17
17
|
|
18
18
|
if (strcmp(term->field, field) != 0) {
|
@@ -77,8 +77,8 @@ Query *wcq_rewrite(Query *self, IndexReader *ir)
|
|
77
77
|
Term *term = (Term *)self->data;
|
78
78
|
char *text = term->text;
|
79
79
|
char *field = term->field;
|
80
|
-
char *first_star =
|
81
|
-
char *first_ques =
|
80
|
+
char *first_star = strrchr(text, WILD_STRING);
|
81
|
+
char *first_ques = strrchr(text, WILD_CHAR);
|
82
82
|
if (!first_star && !first_ques) {
|
83
83
|
q = tq_create(term_clone(term));
|
84
84
|
} else {
|
@@ -89,7 +89,7 @@ Query *wcq_rewrite(Query *self, IndexReader *ir)
|
|
89
89
|
char *pattern = (first_ques && first_star > first_ques)
|
90
90
|
? first_ques : first_star;
|
91
91
|
|
92
|
-
int prefix_len = pattern - text;
|
92
|
+
int prefix_len = (int)(pattern - text);
|
93
93
|
|
94
94
|
prefix_term.field = field;
|
95
95
|
prefix_term.text = (char *)EMPTY_STRING;
|
@@ -120,15 +120,23 @@ Query *wcq_rewrite(Query *self, IndexReader *ir)
|
|
120
120
|
free(prefix);
|
121
121
|
}
|
122
122
|
|
123
|
-
|
124
|
-
return self->rewritten = q;
|
123
|
+
return q;
|
125
124
|
}
|
126
125
|
|
127
|
-
void wcq_destroy(
|
126
|
+
static void wcq_destroy(Query *self)
|
128
127
|
{
|
129
|
-
Query *self = (Query *)p;
|
130
128
|
if (self->destroy_all) term_destroy((Term *)self->data);
|
131
|
-
|
129
|
+
q_destroy_i(self);
|
130
|
+
}
|
131
|
+
|
132
|
+
static uint wcq_hash(Query *self)
|
133
|
+
{
|
134
|
+
return term_hash((Term *)self->data);
|
135
|
+
}
|
136
|
+
|
137
|
+
static int wcq_eq(Query *self, Query *o)
|
138
|
+
{
|
139
|
+
return term_eq((Term *)self->data, (Term *)o->data);
|
132
140
|
}
|
133
141
|
|
134
142
|
Query *wcq_create(Term *term)
|
@@ -136,11 +144,14 @@ Query *wcq_create(Term *term)
|
|
136
144
|
Query *self = q_create();
|
137
145
|
|
138
146
|
self->data = term;
|
147
|
+
|
139
148
|
self->type = WILD_CARD_QUERY;
|
140
|
-
self->create_weight = NULL;
|
141
|
-
self->to_s = &wcq_to_s;
|
142
149
|
self->rewrite = &wcq_rewrite;
|
143
|
-
self->
|
150
|
+
self->to_s = &wcq_to_s;
|
151
|
+
self->hash = &wcq_hash;
|
152
|
+
self->eq = &wcq_eq;
|
153
|
+
self->destroy_i = &wcq_destroy;
|
154
|
+
self->create_weight_i = &q_create_weight_unsup;
|
144
155
|
|
145
156
|
return self;
|
146
157
|
}
|
data/ext/r_analysis.c
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
#include <regex.h>
|
1
2
|
#include "ferret.h"
|
2
3
|
#include "analysis.h"
|
3
4
|
#include "locale.h"
|
@@ -9,6 +10,7 @@ static VALUE cAsciiWhiteSpaceTokenizer;
|
|
9
10
|
static VALUE cWhiteSpaceTokenizer;
|
10
11
|
static VALUE cAsciiStandardTokenizer;
|
11
12
|
static VALUE cStandardTokenizer;
|
13
|
+
static VALUE cRegExpTokenizer;
|
12
14
|
|
13
15
|
static VALUE cAsciiLowerCaseFilter;
|
14
16
|
static VALUE cLowerCaseFilter;
|
@@ -23,14 +25,25 @@ static VALUE cWhiteSpaceAnalyzer;
|
|
23
25
|
static VALUE cAsciiStandardAnalyzer;
|
24
26
|
static VALUE cStandardAnalyzer;
|
25
27
|
static VALUE cPerFieldAnalyzer;
|
28
|
+
static VALUE cRegExpAnalyzer;
|
26
29
|
|
27
30
|
//static VALUE cRegexAnalyzer;
|
28
31
|
static VALUE cTokenStream;
|
29
32
|
|
33
|
+
/* TokenStream Methods */
|
30
34
|
static ID id_next;
|
31
35
|
static ID id_reset;
|
32
36
|
static ID id_clone;
|
33
37
|
|
38
|
+
/* Analyzer Methods */
|
39
|
+
static ID id_token_stream;
|
40
|
+
|
41
|
+
static VALUE object_space;
|
42
|
+
|
43
|
+
extern TokenStream *ts_create();
|
44
|
+
extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int, int,
|
45
|
+
struct re_registers *);
|
46
|
+
|
34
47
|
/****************************************************************************
|
35
48
|
*
|
36
49
|
* Utility Methods
|
@@ -111,7 +124,7 @@ frt_set_token(Token *tk, VALUE rt)
|
|
111
124
|
return tk;
|
112
125
|
}
|
113
126
|
|
114
|
-
#define GET_TK RToken *token
|
127
|
+
#define GET_TK RToken *token = (RToken *)DATA_PTR(self)
|
115
128
|
static VALUE
|
116
129
|
frt_token_init(int argc, VALUE *argv, VALUE self)
|
117
130
|
{
|
@@ -212,13 +225,12 @@ frt_ts_mark(void *p)
|
|
212
225
|
}
|
213
226
|
|
214
227
|
static void
|
215
|
-
frt_ts_free(
|
228
|
+
frt_ts_free(TokenStream *ts)
|
216
229
|
{
|
217
|
-
TokenStream *ts = (TokenStream *)p;
|
218
230
|
if (object_get(&ts->text) != Qnil) object_del(&ts->text);
|
219
231
|
if (ts->sub_ts && (object_get(&ts->sub_ts) != Qnil)) object_del(&ts->sub_ts);
|
220
232
|
object_del(ts);
|
221
|
-
|
233
|
+
ts_deref(ts);
|
222
234
|
}
|
223
235
|
|
224
236
|
static VALUE
|
@@ -273,8 +285,7 @@ frt_ts_get_text(VALUE self)
|
|
273
285
|
static VALUE
|
274
286
|
frt_ts_next(VALUE self)
|
275
287
|
{
|
276
|
-
TokenStream *ts;
|
277
|
-
Data_Get_Struct(self, TokenStream, ts);
|
288
|
+
TokenStream *ts = (TokenStream *)DATA_PTR(self);
|
278
289
|
Token *next = ts->next(ts);
|
279
290
|
if (next == NULL) {
|
280
291
|
return Qnil;
|
@@ -287,41 +298,45 @@ frt_ts_next(VALUE self)
|
|
287
298
|
* CWrappedTokenStream
|
288
299
|
****************************************************************************/
|
289
300
|
|
290
|
-
|
301
|
+
static void
|
302
|
+
cwrts_destroy(TokenStream *ts)
|
291
303
|
{
|
292
|
-
|
304
|
+
rb_hash_delete(object_space, LONG2NUM((long)ts->data));
|
293
305
|
free(ts->token);
|
294
306
|
free(ts);
|
295
307
|
}
|
296
308
|
|
297
|
-
Token *
|
309
|
+
static Token *
|
310
|
+
cwrts_next(TokenStream *ts)
|
298
311
|
{
|
299
312
|
VALUE rts = (VALUE)ts->data;
|
300
313
|
VALUE rtoken = rb_funcall(rts, id_next, 0);
|
301
314
|
return frt_set_token(ts->token, rtoken);
|
302
315
|
}
|
303
316
|
|
304
|
-
void
|
317
|
+
static void
|
318
|
+
cwrts_reset(TokenStream *ts, char *text)
|
305
319
|
{
|
306
320
|
VALUE rts = (VALUE)ts->data;
|
307
321
|
ts->t = ts->text = text;
|
308
322
|
rb_funcall(rts, id_reset, 1, rb_str_new2(text));
|
309
323
|
}
|
310
324
|
|
311
|
-
void
|
325
|
+
static void
|
326
|
+
cwrts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
|
312
327
|
{
|
313
328
|
VALUE rorig_ts = (VALUE)orig_ts->data;
|
314
329
|
new_ts->data = (void *)rb_funcall(rorig_ts, id_clone, 0);
|
315
330
|
}
|
316
331
|
|
317
332
|
static TokenStream *
|
318
|
-
|
333
|
+
frt_get_cwrapped_rts(VALUE rts)
|
319
334
|
{
|
320
335
|
TokenStream *ts;
|
321
336
|
switch (TYPE(rts)) {
|
322
337
|
case T_DATA:
|
323
338
|
Data_Get_Struct(rts, TokenStream, ts);
|
324
|
-
|
339
|
+
ref(ts);
|
325
340
|
break;
|
326
341
|
default:
|
327
342
|
ts = ALLOC(TokenStream);
|
@@ -332,12 +347,184 @@ get_cwrapped_rts(VALUE rts, bool *self_destroy)
|
|
332
347
|
ts->clone_i = &cwrts_clone_i;
|
333
348
|
ts->destroy = &cwrts_destroy;
|
334
349
|
ts->sub_ts = NULL;
|
335
|
-
|
350
|
+
// prevent from being garbage collected
|
351
|
+
rb_hash_aset(object_space, LONG2NUM(rts), rts);
|
352
|
+
ts->ref_cnt = 1;
|
336
353
|
break;
|
337
354
|
}
|
338
355
|
return ts;
|
339
356
|
}
|
340
357
|
|
358
|
+
/****************************************************************************
|
359
|
+
* RegExpTokenStream
|
360
|
+
****************************************************************************/
|
361
|
+
|
362
|
+
#define P "[_\\/.,-]"
|
363
|
+
#define HASDIGIT "\\w*\\d\\w*"
|
364
|
+
#define ALPHA "[-_[:alpha:]]"
|
365
|
+
#define ALNUM "[-_[:alnum:]]"
|
366
|
+
|
367
|
+
static char *token_re =
|
368
|
+
ALPHA "+(('" ALPHA "+)+|\\.(" ALPHA "\\.)+|"
|
369
|
+
"(@|\\&)\\w+([-.]\\w+)*|:\\/\\/" ALNUM "+([-.\\/]" ALNUM "+)*)?"
|
370
|
+
"|\\w+(([-._]\\w+)*\\@\\w+([-.]\\w+)+"
|
371
|
+
"|" P HASDIGIT "(" P "\\w+" P HASDIGIT ")*(" P "\\w+)?"
|
372
|
+
"|(\\.\\w+)+"
|
373
|
+
"|"
|
374
|
+
")";
|
375
|
+
static VALUE rtoken_re;
|
376
|
+
|
377
|
+
typedef struct RegExpTokenStream {
|
378
|
+
VALUE rtext;
|
379
|
+
VALUE regex;
|
380
|
+
VALUE proc;
|
381
|
+
int curr_ind;
|
382
|
+
} RegExpTokenStream;
|
383
|
+
|
384
|
+
static void
|
385
|
+
rets_destroy(TokenStream *ts)
|
386
|
+
{
|
387
|
+
rb_hash_delete(object_space, LONG2NUM((long)object_get(ts)));
|
388
|
+
free(ts->data);
|
389
|
+
free(ts->token);
|
390
|
+
free(ts);
|
391
|
+
}
|
392
|
+
|
393
|
+
static void
|
394
|
+
frt_rets_free(TokenStream *ts)
|
395
|
+
{
|
396
|
+
object_del(ts);
|
397
|
+
ts_deref(ts);
|
398
|
+
}
|
399
|
+
|
400
|
+
static void
|
401
|
+
frt_rets_mark(TokenStream *ts)
|
402
|
+
{
|
403
|
+
RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
|
404
|
+
rb_gc_mark(rets->rtext);
|
405
|
+
rb_gc_mark(rets->regex);
|
406
|
+
rb_gc_mark(rets->proc);
|
407
|
+
}
|
408
|
+
|
409
|
+
static VALUE
|
410
|
+
frt_rets_set_text(VALUE self, VALUE rtext)
|
411
|
+
{
|
412
|
+
TokenStream *ts;
|
413
|
+
RegExpTokenStream *rets;
|
414
|
+
Data_Get_Struct(self, TokenStream, ts);
|
415
|
+
|
416
|
+
StringValue(rtext);
|
417
|
+
rets = (RegExpTokenStream *)ts->data;
|
418
|
+
rets->rtext = rtext;
|
419
|
+
rets->curr_ind = 0;
|
420
|
+
|
421
|
+
return rtext;
|
422
|
+
}
|
423
|
+
|
424
|
+
static VALUE
|
425
|
+
frt_rets_get_text(VALUE self)
|
426
|
+
{
|
427
|
+
TokenStream *ts;
|
428
|
+
RegExpTokenStream *rets;
|
429
|
+
Data_Get_Struct(self, TokenStream, ts);
|
430
|
+
rets = (RegExpTokenStream *)ts->data;
|
431
|
+
return rets->rtext;
|
432
|
+
}
|
433
|
+
|
434
|
+
static Token *
|
435
|
+
rets_next(TokenStream *ts)
|
436
|
+
{
|
437
|
+
static struct re_registers regs;
|
438
|
+
int ret, beg, end;
|
439
|
+
RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
|
440
|
+
struct RString *rtext = RSTRING(rets->rtext);
|
441
|
+
Check_Type(rets->regex, T_REGEXP);
|
442
|
+
ret = ruby_re_search(RREGEXP(rets->regex)->ptr,
|
443
|
+
rtext->ptr, rtext->len,
|
444
|
+
rets->curr_ind, rtext->len - rets->curr_ind,
|
445
|
+
&regs);
|
446
|
+
|
447
|
+
if (ret == -2) rb_raise(rb_eStandardError, "regexp buffer overflow");
|
448
|
+
if (ret < 0) return NULL; /* not matched */
|
449
|
+
|
450
|
+
beg = regs.beg[0];
|
451
|
+
rets->curr_ind = end = regs.end[0];
|
452
|
+
if (NIL_P(rets->proc)) {
|
453
|
+
return tk_set(ts->token, rtext->ptr + beg, end - beg, beg, end, 1);
|
454
|
+
} else {
|
455
|
+
VALUE rtok = rb_str_new(rtext->ptr + beg, end - beg);
|
456
|
+
rtok = rb_funcall(rets->proc, id_call, 1, rtok);
|
457
|
+
return tk_set(ts->token, RSTRING(rtok)->ptr, RSTRING(rtok)->len, beg, end, 1);
|
458
|
+
}
|
459
|
+
}
|
460
|
+
|
461
|
+
static void
|
462
|
+
rets_reset(TokenStream *ts, char *text)
|
463
|
+
{
|
464
|
+
RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
|
465
|
+
rets->rtext = rb_str_new2(text);
|
466
|
+
rets->curr_ind = 0;
|
467
|
+
}
|
468
|
+
|
469
|
+
void
|
470
|
+
rets_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
|
471
|
+
{
|
472
|
+
RegExpTokenStream *new_rets = ALLOC(RegExpTokenStream);
|
473
|
+
RegExpTokenStream *orig_rets = (RegExpTokenStream *)orig_ts->data;
|
474
|
+
memcpy(new_rets, orig_rets, sizeof(RegExpTokenStream));
|
475
|
+
new_ts->data = new_rets;
|
476
|
+
}
|
477
|
+
|
478
|
+
static TokenStream *
|
479
|
+
rets_create(VALUE rtext, VALUE regex, VALUE proc)
|
480
|
+
{
|
481
|
+
RegExpTokenStream *rets;
|
482
|
+
TokenStream *ts;
|
483
|
+
|
484
|
+
if (rtext != Qnil) {
|
485
|
+
rtext = StringValue(rtext);
|
486
|
+
}
|
487
|
+
ts = ts_create();
|
488
|
+
ts->reset = &rets_reset;
|
489
|
+
ts->next = &rets_next;
|
490
|
+
ts->clone_i = &rets_clone_i;
|
491
|
+
ts->destroy = &rets_destroy;
|
492
|
+
ts->ref_cnt = 1;
|
493
|
+
|
494
|
+
rets = ALLOC(RegExpTokenStream);
|
495
|
+
rets->curr_ind = 0;
|
496
|
+
rets->rtext = rtext;
|
497
|
+
rets->proc = proc;
|
498
|
+
if (NIL_P(regex)) {
|
499
|
+
rets->regex = rtoken_re;
|
500
|
+
} else {
|
501
|
+
Check_Type(regex, T_REGEXP);
|
502
|
+
rets->regex = regex;
|
503
|
+
}
|
504
|
+
|
505
|
+
ts->data = rets;
|
506
|
+
|
507
|
+
return ts;
|
508
|
+
}
|
509
|
+
|
510
|
+
static VALUE
|
511
|
+
frt_rets_init(int argc, VALUE *argv, VALUE self)
|
512
|
+
{
|
513
|
+
VALUE rtext, regex, proc;
|
514
|
+
TokenStream *ts;
|
515
|
+
|
516
|
+
rb_scan_args(argc, argv, "11&", &rtext, &regex, &proc);
|
517
|
+
|
518
|
+
ts = rets_create(rtext, regex, proc);
|
519
|
+
|
520
|
+
Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
|
521
|
+
object_add(ts, self);
|
522
|
+
/* no need to add to object space as it is going to ruby space
|
523
|
+
* rb_hash_aset(object_space, LONG2NUM((long)self), self);
|
524
|
+
*/
|
525
|
+
return self;
|
526
|
+
}
|
527
|
+
|
341
528
|
/****************************************************************************
|
342
529
|
* Tokenizers
|
343
530
|
****************************************************************************/
|
@@ -394,10 +581,8 @@ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
|
|
394
581
|
static VALUE
|
395
582
|
frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
|
396
583
|
{
|
397
|
-
|
398
|
-
|
399
|
-
get_cwrapped_rts(rsub_ts, &self_destroy));
|
400
|
-
ts->destroy_sub = !self_destroy;
|
584
|
+
TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
|
585
|
+
ts = lowercase_filter_create(ts);
|
401
586
|
object_add(&ts->sub_ts, rsub_ts);
|
402
587
|
|
403
588
|
Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
|
@@ -408,10 +593,8 @@ frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
|
|
408
593
|
static VALUE
|
409
594
|
frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
|
410
595
|
{
|
411
|
-
|
412
|
-
|
413
|
-
get_cwrapped_rts(rsub_ts, &self_destroy));
|
414
|
-
ts->destroy_sub = !self_destroy;
|
596
|
+
TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
|
597
|
+
ts = mb_lowercase_filter_create(ts);
|
415
598
|
object_add(&ts->sub_ts, rsub_ts);
|
416
599
|
|
417
600
|
Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
|
@@ -423,19 +606,17 @@ static VALUE
|
|
423
606
|
frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
|
424
607
|
{
|
425
608
|
VALUE rsub_ts, rstop_words;
|
426
|
-
bool self_destroy;
|
427
609
|
TokenStream *ts;
|
428
610
|
rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
|
611
|
+
ts = frt_get_cwrapped_rts(rsub_ts);
|
429
612
|
if (rstop_words != Qnil) {
|
430
613
|
char **stop_words = get_stopwords(rstop_words);
|
431
|
-
ts = stop_filter_create_with_words(
|
432
|
-
|
614
|
+
ts = stop_filter_create_with_words(ts, (const char **)stop_words);
|
615
|
+
|
433
616
|
free(stop_words);
|
434
617
|
} else {
|
435
|
-
ts = stop_filter_create(
|
436
|
-
get_cwrapped_rts(rsub_ts, &self_destroy));
|
618
|
+
ts = stop_filter_create(ts);
|
437
619
|
}
|
438
|
-
ts->destroy_sub = !self_destroy;
|
439
620
|
object_add(&ts->sub_ts, rsub_ts);
|
440
621
|
|
441
622
|
Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
|
@@ -449,16 +630,14 @@ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
|
|
449
630
|
VALUE rsub_ts, ralgorithm, rcharenc;
|
450
631
|
char *algorithm = "english";
|
451
632
|
char *charenc = NULL;
|
452
|
-
bool self_destroy;
|
453
633
|
TokenStream *ts;
|
454
634
|
rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
|
635
|
+
ts = frt_get_cwrapped_rts(rsub_ts);
|
455
636
|
switch (argc) {
|
456
637
|
case 3: charenc = RSTRING(rb_obj_as_string(rcharenc))->ptr;
|
457
638
|
case 2: algorithm = RSTRING(rb_obj_as_string(ralgorithm))->ptr;
|
458
639
|
}
|
459
|
-
ts = stem_filter_create(
|
460
|
-
get_cwrapped_rts(rsub_ts, &self_destroy), algorithm, charenc);
|
461
|
-
ts->destroy_sub = !self_destroy;
|
640
|
+
ts = stem_filter_create(ts, algorithm, charenc);
|
462
641
|
object_add(&ts->sub_ts, rsub_ts);
|
463
642
|
|
464
643
|
Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
|
@@ -472,34 +651,49 @@ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
|
|
472
651
|
*
|
473
652
|
****************************************************************************/
|
474
653
|
|
475
|
-
|
654
|
+
/****************************************************************************
|
655
|
+
* CWrappedAnalyzer Methods
|
656
|
+
****************************************************************************/
|
657
|
+
|
658
|
+
static void
|
659
|
+
cwa_destroy(Analyzer *a)
|
660
|
+
{
|
661
|
+
rb_hash_delete(object_space, LONG2NUM((long)a->data));
|
662
|
+
a_standard_destroy(a);
|
663
|
+
}
|
664
|
+
|
665
|
+
static TokenStream *
|
666
|
+
cwa_get_ts(Analyzer *a, char *field, char *text)
|
667
|
+
{
|
668
|
+
VALUE ranalyzer = (VALUE)a->data;
|
669
|
+
VALUE rts = rb_funcall(ranalyzer, id_token_stream, 2,
|
670
|
+
rb_str_new2(field), rb_str_new2(text));
|
671
|
+
return frt_get_cwrapped_rts(rts);
|
672
|
+
}
|
673
|
+
|
674
|
+
Analyzer *
|
675
|
+
frt_get_cwrapped_analyzer(ranalyzer)
|
476
676
|
{
|
477
677
|
Analyzer *a = NULL;
|
478
678
|
switch (TYPE(ranalyzer)) {
|
479
679
|
case T_DATA:
|
480
680
|
Data_Get_Struct(ranalyzer, Analyzer, a);
|
681
|
+
ref(a);
|
481
682
|
break;
|
482
683
|
default:
|
483
|
-
|
484
|
-
//
|
485
|
-
|
486
|
-
//ts->data = (void *)rts;
|
487
|
-
//ts->next = &cwrts_next;
|
488
|
-
//ts->reset = &cwrts_reset;
|
489
|
-
//ts->clone_i = &cwrts_clone_i;
|
490
|
-
//ts->destroy = &cwrts_destroy;
|
491
|
-
//ts->sub_ts = NULL;
|
684
|
+
a = analyzer_create((void *)ranalyzer, NULL, &cwa_destroy, &cwa_get_ts);
|
685
|
+
// prevent from being garbage collected
|
686
|
+
rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
|
492
687
|
break;
|
493
688
|
}
|
494
689
|
return a;
|
495
690
|
}
|
496
691
|
|
497
692
|
static void
|
498
|
-
frt_analyzer_free(
|
693
|
+
frt_analyzer_free(Analyzer *a)
|
499
694
|
{
|
500
|
-
Analyzer *a = (Analyzer *)p;
|
501
695
|
object_del(a);
|
502
|
-
|
696
|
+
a_deref(a);
|
503
697
|
}
|
504
698
|
|
505
699
|
VALUE
|
@@ -513,13 +707,16 @@ frt_get_analyzer(Analyzer *a)
|
|
513
707
|
static VALUE
|
514
708
|
frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
|
515
709
|
{
|
516
|
-
|
710
|
+
TokenStream *ts;
|
711
|
+
Analyzer *a = (Analyzer *)DATA_PTR(self);
|
712
|
+
|
517
713
|
rfield = rb_obj_as_string(rfield);
|
518
714
|
rstring = rb_obj_as_string(rstring);
|
519
715
|
|
520
|
-
|
716
|
+
ts = a_get_new_ts(a, RSTRING(rfield)->ptr, RSTRING(rstring)->ptr);
|
521
717
|
|
522
|
-
|
718
|
+
/* Make sure that there is no entry already */
|
719
|
+
object_set(&ts->text, rstring);
|
523
720
|
return get_token_stream(ts);
|
524
721
|
}
|
525
722
|
|
@@ -533,8 +730,9 @@ frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
|
|
533
730
|
static VALUE
|
534
731
|
frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
535
732
|
{
|
733
|
+
Analyzer *a;
|
536
734
|
GET_LOWER(false);
|
537
|
-
|
735
|
+
a = whitespace_analyzer_create(lower);
|
538
736
|
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
539
737
|
object_add(a, self);
|
540
738
|
return self;
|
@@ -544,8 +742,9 @@ frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
544
742
|
static VALUE
|
545
743
|
frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
546
744
|
{
|
745
|
+
Analyzer *a;
|
547
746
|
GET_LOWER(false);
|
548
|
-
|
747
|
+
a = mb_whitespace_analyzer_create(lower);
|
549
748
|
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
550
749
|
object_add(a, self);
|
551
750
|
return self;
|
@@ -555,8 +754,9 @@ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
555
754
|
static VALUE
|
556
755
|
frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
|
557
756
|
{
|
757
|
+
Analyzer *a;
|
558
758
|
GET_LOWER(true);
|
559
|
-
|
759
|
+
a = letter_analyzer_create(lower);
|
560
760
|
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
561
761
|
object_add(a, self);
|
562
762
|
return self;
|
@@ -566,8 +766,9 @@ frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
566
766
|
static VALUE
|
567
767
|
frt_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
|
568
768
|
{
|
769
|
+
Analyzer *a;
|
569
770
|
GET_LOWER(true);
|
570
|
-
|
771
|
+
a = mb_letter_analyzer_create(lower);
|
571
772
|
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
572
773
|
object_add(a, self);
|
573
774
|
return self;
|
@@ -628,13 +829,29 @@ frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
628
829
|
return self;
|
629
830
|
}
|
630
831
|
|
832
|
+
void
|
833
|
+
frt_h_mark_values_i(void *key, void *value, void *arg)
|
834
|
+
{
|
835
|
+
frt_gc_mark(value);
|
836
|
+
}
|
837
|
+
|
838
|
+
void
|
839
|
+
frt_pfa_mark(void *p)
|
840
|
+
{
|
841
|
+
Analyzer *a = (Analyzer *)p;
|
842
|
+
PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)a->data;
|
843
|
+
frt_gc_mark(pfa->def);
|
844
|
+
h_each(pfa->dict, &frt_h_mark_values_i, NULL);
|
845
|
+
}
|
846
|
+
|
631
847
|
/*** PerFieldAnalyzer ***/
|
848
|
+
|
632
849
|
static VALUE
|
633
850
|
frt_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
|
634
851
|
{
|
635
|
-
Analyzer *def =
|
636
|
-
Analyzer *a = per_field_analyzer_create(def
|
637
|
-
Frt_Wrap_Struct(self,
|
852
|
+
Analyzer *def = frt_get_cwrapped_analyzer(ranalyzer);
|
853
|
+
Analyzer *a = per_field_analyzer_create(def);
|
854
|
+
Frt_Wrap_Struct(self, &frt_pfa_mark, &frt_analyzer_free, a);
|
638
855
|
object_add(a, self);
|
639
856
|
return self;
|
640
857
|
}
|
@@ -644,42 +861,48 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
|
|
644
861
|
{
|
645
862
|
Analyzer *pfa, *a;
|
646
863
|
Data_Get_Struct(self, Analyzer, pfa);
|
647
|
-
|
864
|
+
a = frt_get_cwrapped_analyzer(ranalyzer);
|
648
865
|
|
649
866
|
pfa_add_field(pfa, StringValuePtr(rfield), a);
|
650
867
|
return self;
|
651
868
|
}
|
652
869
|
|
870
|
+
/*** RegExpAnalyzer ***/
|
653
871
|
|
654
|
-
|
655
|
-
|
656
|
-
static VALUE
|
657
|
-
frt_regex_analyzer_init(VALUE self)
|
872
|
+
static void
|
873
|
+
frt_re_analyzer_mark(Analyzer *a)
|
658
874
|
{
|
659
|
-
|
660
|
-
// keine Ahnung warum hier das Makro und nicht Data_Wrap_Struct:
|
661
|
-
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
662
|
-
// wofuer?:
|
663
|
-
object_add(a, self);
|
664
|
-
return self;
|
875
|
+
frt_gc_mark(a->current_ts);
|
665
876
|
}
|
666
877
|
|
667
|
-
|
668
|
-
|
669
|
-
static VALUE
|
670
|
-
frt_regex_analyzer_token_stream(VALUE self, VALUE field, VALUE string)
|
878
|
+
static void
|
879
|
+
re_analyzer_destroy(Analyzer *a)
|
671
880
|
{
|
672
|
-
|
673
|
-
|
674
|
-
// already freed via analyzer's free()
|
675
|
-
VALUE token_stream = Data_Wrap_Struct(cTokenStream, NULL, NULL, ts);
|
676
|
-
return token_stream;
|
881
|
+
free(a->data);
|
882
|
+
a_standard_destroy(a);
|
677
883
|
}
|
678
|
-
*/
|
679
|
-
/** /RegexAnalyzer **/
|
680
884
|
|
681
|
-
|
682
|
-
|
885
|
+
static VALUE
|
886
|
+
frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
|
887
|
+
{
|
888
|
+
VALUE lower, rets, regex, proc;
|
889
|
+
Analyzer *a;
|
890
|
+
TokenStream *ts;
|
891
|
+
rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
|
892
|
+
|
893
|
+
ts = rets_create(Qnil, regex, proc);
|
894
|
+
rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
|
895
|
+
ref(ts);
|
896
|
+
rb_hash_aset(object_space, LONG2NUM((long)rets), rets);
|
897
|
+
object_add(ts, rets);
|
898
|
+
|
899
|
+
if (lower != Qfalse) ts = mb_lowercase_filter_create(ts);
|
900
|
+
|
901
|
+
a = analyzer_create(NULL, ts, &re_analyzer_destroy, NULL);
|
902
|
+
Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
|
903
|
+
object_add(a, self);
|
904
|
+
return self;
|
905
|
+
}
|
683
906
|
|
684
907
|
/****************************************************************************
|
685
908
|
*
|
@@ -710,10 +933,17 @@ static VALUE frt_setlocale(VALUE self, VALUE locale)
|
|
710
933
|
void
|
711
934
|
Init_analysis(void)
|
712
935
|
{
|
936
|
+
/* TokenStream Methods */
|
713
937
|
id_next = rb_intern("next");
|
714
938
|
id_reset = rb_intern("text=");
|
715
939
|
id_clone = rb_intern("clone");
|
716
940
|
|
941
|
+
/* Analyzer Methods */
|
942
|
+
id_token_stream = rb_intern("token_stream");
|
943
|
+
|
944
|
+
object_space = rb_hash_new();
|
945
|
+
rb_define_const(mFerret, "OBJECT_SPACE", object_space);
|
946
|
+
|
717
947
|
/*** * * Locale stuff * * ***/
|
718
948
|
frt_locale = setlocale(LC_ALL, "");
|
719
949
|
rb_define_singleton_method(mFerret, "locale=", frt_setlocale, 1);
|
@@ -790,6 +1020,18 @@ Init_analysis(void)
|
|
790
1020
|
rb_define_method(cStandardTokenizer, "initialize",
|
791
1021
|
frt_standard_tokenizer_init, 1);
|
792
1022
|
|
1023
|
+
/*** * * RegExpTokenizer * * ***/
|
1024
|
+
cRegExpTokenizer =
|
1025
|
+
rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
|
1026
|
+
rtoken_re = rb_reg_new(token_re, strlen(token_re), 0);
|
1027
|
+
rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
|
1028
|
+
rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
|
1029
|
+
rb_define_method(cRegExpTokenizer, "initialize",
|
1030
|
+
frt_rets_init, -1);
|
1031
|
+
rb_define_method(cRegExpTokenizer, "next", frt_ts_next, 0);
|
1032
|
+
rb_define_method(cRegExpTokenizer, "text=", frt_rets_set_text, 1);
|
1033
|
+
rb_define_method(cRegExpTokenizer, "text", frt_rets_get_text, 0);
|
1034
|
+
|
793
1035
|
/***************/
|
794
1036
|
/*** Filters ***/
|
795
1037
|
/***************/
|
@@ -911,7 +1153,13 @@ Init_analysis(void)
|
|
911
1153
|
rb_define_method(cPerFieldAnalyzer, "[]=",
|
912
1154
|
frt_per_field_analyzer_add_field, 2);
|
913
1155
|
|
914
|
-
|
1156
|
+
/*** * * RegexAnalyzer * * ***/
|
1157
|
+
cRegExpAnalyzer =
|
1158
|
+
rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
|
1159
|
+
rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
|
1160
|
+
rb_define_method(cRegExpAnalyzer, "initialize",
|
1161
|
+
frt_re_analyzer_init, -1);
|
1162
|
+
|
915
1163
|
/*
|
916
1164
|
cRegexAnalyzer =
|
917
1165
|
rb_define_class_under(mAnalysis, "RegexAnalyzer", cAnalyzer);
|