ferret 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +6 -5
- data/Rakefile +34 -13
- data/TODO +1 -0
- data/TUTORIAL +1 -1
- data/ext/analysis.c +87 -70
- data/ext/analysis.h +18 -6
- data/ext/array.c +1 -2
- data/ext/array.h +1 -1
- data/ext/bitvector.c +10 -6
- data/ext/bitvector.h +2 -2
- data/ext/compound_io.c +30 -27
- data/ext/document.c +15 -15
- data/ext/document.h +5 -5
- data/ext/except.c +2 -0
- data/ext/except.h +25 -23
- data/ext/extconf.rb +1 -0
- data/ext/ferret.c +10 -8
- data/ext/ferret.h +9 -8
- data/ext/field.c +29 -25
- data/ext/filter.c +52 -14
- data/ext/frtio.h +13 -0
- data/ext/fs_store.c +115 -170
- data/ext/global.c +9 -8
- data/ext/global.h +17 -13
- data/ext/hash.c +13 -19
- data/ext/hash.h +11 -11
- data/ext/hashset.c +5 -7
- data/ext/hashset.h +9 -8
- data/ext/helper.c +1 -1
- data/ext/helper.h +2 -1
- data/ext/inc/except.h +25 -23
- data/ext/inc/lang.h +11 -1
- data/ext/ind.c +33 -21
- data/ext/index.h +44 -39
- data/ext/index_io.c +61 -57
- data/ext/index_rw.c +418 -361
- data/ext/lang.c +10 -0
- data/ext/lang.h +11 -1
- data/ext/nix_io.c +135 -0
- data/ext/priorityqueue.c +16 -16
- data/ext/priorityqueue.h +9 -6
- data/ext/q_boolean.c +128 -76
- data/ext/q_const_score.c +20 -20
- data/ext/q_filtered_query.c +20 -20
- data/ext/q_fuzzy.c +37 -23
- data/ext/q_match_all.c +15 -19
- data/ext/q_multi_phrase.c +87 -46
- data/ext/q_parser.c +247 -119
- data/ext/q_phrase.c +86 -52
- data/ext/q_prefix.c +25 -14
- data/ext/q_range.c +59 -14
- data/ext/q_span.c +263 -172
- data/ext/q_term.c +62 -51
- data/ext/q_wildcard.c +24 -13
- data/ext/r_analysis.c +328 -80
- data/ext/r_doc.c +11 -6
- data/ext/r_index_io.c +40 -32
- data/ext/r_qparser.c +15 -14
- data/ext/r_search.c +270 -152
- data/ext/r_store.c +32 -17
- data/ext/ram_store.c +38 -22
- data/ext/search.c +617 -87
- data/ext/search.h +227 -163
- data/ext/similarity.c +54 -45
- data/ext/similarity.h +3 -3
- data/ext/sort.c +132 -53
- data/ext/store.c +21 -2
- data/ext/store.h +14 -14
- data/ext/tags +4322 -232
- data/ext/term.c +140 -109
- data/ext/termdocs.c +74 -60
- data/ext/vector.c +181 -152
- data/ext/w32_io.c +150 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
- data/lib/ferret/document/field.rb +1 -1
- data/lib/ferret/index/field_infos.rb +1 -1
- data/lib/ferret/index/term.rb +1 -1
- data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
- data/lib/ferret/search.rb +1 -0
- data/lib/ferret/search/boolean_query.rb +0 -4
- data/lib/ferret/search/index_searcher.rb +21 -8
- data/lib/ferret/search/multi_phrase_query.rb +7 -0
- data/lib/ferret/search/multi_searcher.rb +261 -0
- data/lib/ferret/search/phrase_query.rb +1 -1
- data/lib/ferret/search/query.rb +34 -5
- data/lib/ferret/search/sort.rb +7 -3
- data/lib/ferret/search/sort_field.rb +8 -4
- data/lib/ferret/store/fs_store.rb +13 -6
- data/lib/ferret/store/index_io.rb +0 -14
- data/lib/ferret/store/ram_store.rb +3 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/analysis/ctc_analyzer.rb +131 -0
- data/test/unit/analysis/ctc_tokenstream.rb +98 -9
- data/test/unit/index/tc_index.rb +40 -1
- data/test/unit/index/tc_term.rb +7 -0
- data/test/unit/index/th_doc.rb +8 -0
- data/test/unit/query_parser/tc_query_parser.rb +6 -4
- data/test/unit/search/rtc_sort_field.rb +6 -6
- data/test/unit/search/tc_index_searcher.rb +8 -0
- data/test/unit/search/tc_multi_searcher.rb +275 -0
- data/test/unit/search/tc_multi_searcher2.rb +126 -0
- data/test/unit/search/tc_search_and_sort.rb +66 -0
- metadata +31 -26
- data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/q_term.c
CHANGED
@@ -18,6 +18,14 @@ Scorer *tw_scorer(Weight *self, IndexReader *ir)
|
|
18
18
|
|
19
19
|
Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
20
20
|
{
|
21
|
+
Explanation *qnorm_expl;
|
22
|
+
Explanation *field_expl;
|
23
|
+
Scorer *scorer;
|
24
|
+
Explanation *tf_expl;
|
25
|
+
uchar *field_norms;
|
26
|
+
float field_norm;
|
27
|
+
Explanation *field_norm_expl;
|
28
|
+
|
21
29
|
char *query_str = self->query->to_s(self->query, "");
|
22
30
|
TermQuery *tq = (TermQuery *)self->query->data;
|
23
31
|
Term *term = tq->term;
|
@@ -26,14 +34,14 @@ Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
26
34
|
Explanation *expl = expl_create(0.0,
|
27
35
|
strfmt("weight(%s in %d), product of:", query_str, doc_num));
|
28
36
|
|
29
|
-
|
30
|
-
|
37
|
+
/* We need two of these as it's included in both the query explanation
|
38
|
+
* and the field explanation */
|
31
39
|
Explanation *idf_expl1 = expl_create(self->idf,
|
32
40
|
strfmt("idf(doc_freq=%d)", ir->doc_freq(ir, tq->term)));
|
33
41
|
Explanation *idf_expl2 = expl_create(self->idf,
|
34
42
|
strfmt("idf(doc_freq=%d)", ir->doc_freq(ir, tq->term)));
|
35
43
|
|
36
|
-
|
44
|
+
/* explain query weight */
|
37
45
|
Explanation *query_expl = expl_create(0.0,
|
38
46
|
strfmt("query_weight(%s), product of:", query_str));
|
39
47
|
free(query_str);
|
@@ -44,33 +52,35 @@ Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
44
52
|
|
45
53
|
expl_add_detail(query_expl, idf_expl1);
|
46
54
|
|
47
|
-
|
55
|
+
qnorm_expl = expl_create(self->qnorm, estrdup("query_norm"));
|
48
56
|
expl_add_detail(query_expl, qnorm_expl);
|
49
57
|
|
50
58
|
query_expl->value = self->query->boost * idf_expl1->value * qnorm_expl->value;
|
51
59
|
|
52
60
|
expl_add_detail(expl, query_expl);
|
53
61
|
|
54
|
-
|
55
|
-
|
62
|
+
/* explain field weight */
|
63
|
+
field_expl = expl_create(0.0,
|
56
64
|
strfmt("field_weight(%s:%s in %d), product of:",
|
57
65
|
field_name, term->text, doc_num));
|
58
66
|
|
59
|
-
|
60
|
-
|
67
|
+
scorer = self->scorer(self, ir);
|
68
|
+
tf_expl = scorer->explain(scorer, doc_num);
|
61
69
|
scorer->destroy(scorer);
|
62
70
|
expl_add_detail(field_expl, tf_expl);
|
63
71
|
expl_add_detail(field_expl, idf_expl2);
|
64
72
|
|
65
|
-
|
66
|
-
|
67
|
-
|
73
|
+
field_norms = ir->get_norms(ir, field_name);
|
74
|
+
field_norm = (field_norms
|
75
|
+
? sim_decode_norm(self->similarity, field_norms[doc_num])
|
76
|
+
: (float)0.0);
|
77
|
+
field_norm_expl = expl_create(field_norm,
|
68
78
|
strfmt("field_norm(field=%s, doc=%d)", field_name, doc_num));
|
69
79
|
expl_add_detail(field_expl, field_norm_expl);
|
70
80
|
|
71
81
|
field_expl->value = tf_expl->value * idf_expl2->value * field_norm_expl->value;
|
72
82
|
|
73
|
-
|
83
|
+
/* combine them */
|
74
84
|
if (query_expl->value == 1.0) {
|
75
85
|
expl_destoy(expl);
|
76
86
|
return field_expl;
|
@@ -86,30 +96,18 @@ char *tw_to_s(Weight *self)
|
|
86
96
|
return strfmt("TermWeight(%f)", self->value);
|
87
97
|
}
|
88
98
|
|
89
|
-
void tw_destroy(void *p)
|
90
|
-
{
|
91
|
-
free(p);
|
92
|
-
}
|
93
|
-
|
94
99
|
Weight *tw_create(Query *query, Searcher *searcher)
|
95
100
|
{
|
96
|
-
Weight *self =
|
97
|
-
ZEROSET(self, Weight, 1);
|
98
|
-
self->get_query = &w_get_query;
|
99
|
-
self->get_value = &w_get_value;
|
100
|
-
self->normalize = &w_normalize;
|
101
|
+
Weight *self = w_create(query);
|
101
102
|
self->scorer = &tw_scorer;
|
102
103
|
self->explain = &tw_explain;
|
103
104
|
self->to_s = &tw_to_s;
|
104
|
-
self->destroy = &tw_destroy;
|
105
105
|
self->sum_of_squared_weights = &w_sum_of_squared_weights;
|
106
106
|
|
107
107
|
self->similarity = query->get_similarity(query, searcher);
|
108
108
|
self->idf = sim_idf(self->similarity,
|
109
109
|
searcher->doc_freq(searcher, ((TermQuery *)query->data)->term),
|
110
110
|
searcher->max_doc(searcher)); // compute idf
|
111
|
-
self->query = query;
|
112
|
-
self->value = 0.0;
|
113
111
|
|
114
112
|
return self;
|
115
113
|
}
|
@@ -120,20 +118,19 @@ Weight *tw_create(Query *query, Searcher *searcher)
|
|
120
118
|
*
|
121
119
|
***************************************************************************/
|
122
120
|
|
123
|
-
void tq_destroy(
|
121
|
+
void tq_destroy(Query *self)
|
124
122
|
{
|
125
|
-
|
126
|
-
TermQuery *tq = q->data;
|
123
|
+
TermQuery *tq = self->data;
|
127
124
|
term_destroy(tq->term);
|
128
125
|
free(tq);
|
129
|
-
|
126
|
+
q_destroy_i(self);
|
130
127
|
}
|
131
128
|
|
132
129
|
char *tq_to_s(Query *self, char *field)
|
133
130
|
{
|
134
131
|
Term *term = ((TermQuery *)self->data)->term;
|
135
|
-
|
136
|
-
|
132
|
+
size_t flen = strlen(term->field);
|
133
|
+
size_t tlen = strlen(term->text);
|
137
134
|
char *buffer = ALLOC_N(char, 34 + flen + tlen);
|
138
135
|
char *b = buffer;
|
139
136
|
if (strcmp(field, term->field) != 0) {
|
@@ -151,10 +148,21 @@ char *tq_to_s(Query *self, char *field)
|
|
151
148
|
return buffer;
|
152
149
|
}
|
153
150
|
|
154
|
-
void tq_extract_terms(Query *self,
|
151
|
+
static void tq_extract_terms(Query *self, HashSet *terms)
|
155
152
|
{
|
156
153
|
Term *term = ((TermQuery *)self->data)->term;
|
157
|
-
|
154
|
+
hs_add(terms, term_clone(term));
|
155
|
+
}
|
156
|
+
|
157
|
+
static uint tq_hash(Query *self)
|
158
|
+
{
|
159
|
+
return term_hash(((TermQuery *)self->data)->term);
|
160
|
+
}
|
161
|
+
|
162
|
+
static int tq_eq(Query *self, Query *o)
|
163
|
+
{
|
164
|
+
return term_eq(((TermQuery *)self->data)->term,
|
165
|
+
((TermQuery *)o->data)->term);
|
158
166
|
}
|
159
167
|
|
160
168
|
Query *tq_create(Term *term)
|
@@ -164,14 +172,18 @@ Query *tq_create(Term *term)
|
|
164
172
|
tq->term = term;
|
165
173
|
self->type = TERM_QUERY;
|
166
174
|
self->data = tq;
|
167
|
-
self->create_weight = &tw_create;
|
168
175
|
self->extract_terms = &tq_extract_terms;
|
169
176
|
self->to_s = &tq_to_s;
|
170
|
-
self->
|
177
|
+
self->hash = &tq_hash;
|
178
|
+
self->eq = &tq_eq;
|
179
|
+
|
180
|
+
self->destroy_i = &tq_destroy;
|
181
|
+
self->create_weight_i = &tw_create;
|
171
182
|
|
172
183
|
return self;
|
173
184
|
}
|
174
185
|
|
186
|
+
|
175
187
|
/***************************************************************************
|
176
188
|
*
|
177
189
|
* TermScorer
|
@@ -183,13 +195,13 @@ float tsc_score(Scorer *self)
|
|
183
195
|
TermScorer *ts = (TermScorer *)self->data;
|
184
196
|
int freq = ts->freqs[ts->pointer];
|
185
197
|
float score;
|
186
|
-
|
187
|
-
if (freq < SCORE_CACHE_SIZE) {
|
188
|
-
score = ts->score_cache[freq];
|
198
|
+
/* compute tf(f)*weight */
|
199
|
+
if (freq < SCORE_CACHE_SIZE) { /* check cache */
|
200
|
+
score = ts->score_cache[freq]; /* cache hit */
|
189
201
|
} else {
|
190
|
-
score = sim_tf(self->similarity, freq) * ts->weight_value;
|
202
|
+
score = sim_tf(self->similarity, (float)freq) * ts->weight_value; /* cache miss */
|
191
203
|
}
|
192
|
-
|
204
|
+
/* normalize for field */
|
193
205
|
score *= sim_decode_norm(self->similarity, ts->norms[self->doc]);
|
194
206
|
return score;
|
195
207
|
}
|
@@ -217,8 +229,9 @@ bool tsc_next(Scorer *self)
|
|
217
229
|
bool tsc_skip_to(Scorer *self, int doc_num)
|
218
230
|
{
|
219
231
|
TermScorer *ts = (TermScorer *)self->data;
|
220
|
-
|
221
|
-
|
232
|
+
TermDocEnum *tde = ts->tde;
|
233
|
+
|
234
|
+
/* first scan in cache */
|
222
235
|
while (++(ts->pointer) < ts->pointer_max) {
|
223
236
|
if (ts->docs[ts->pointer] >= doc_num) {
|
224
237
|
self->doc = ts->docs[ts->pointer];
|
@@ -226,10 +239,8 @@ bool tsc_skip_to(Scorer *self, int doc_num)
|
|
226
239
|
}
|
227
240
|
}
|
228
241
|
|
229
|
-
|
230
|
-
|
231
|
-
bool result = tde->skip_to(tde, doc_num);
|
232
|
-
if (result) {
|
242
|
+
/* not found in cache, seek underlying stream */
|
243
|
+
if (tde->skip_to(tde, doc_num)) {
|
233
244
|
ts->pointer_max = 1;
|
234
245
|
ts->pointer = 0;
|
235
246
|
ts->docs[0] = self->doc = tde->doc_num(tde);
|
@@ -242,6 +253,7 @@ bool tsc_skip_to(Scorer *self, int doc_num)
|
|
242
253
|
|
243
254
|
Explanation *tsc_explain(Scorer *self, int doc_num)
|
244
255
|
{
|
256
|
+
Explanation *tf_explanation;
|
245
257
|
TermScorer *ts = (TermScorer *)self->data;
|
246
258
|
Query *query = ts->weight->get_query(ts->weight);
|
247
259
|
Term *term = ((TermQuery *)query->data)->term;
|
@@ -260,18 +272,17 @@ Explanation *tsc_explain(Scorer *self, int doc_num)
|
|
260
272
|
}
|
261
273
|
tde->close(tde);
|
262
274
|
ts->tde = NULL;
|
263
|
-
|
275
|
+
tf_explanation = expl_create(sim_tf(self->similarity, (float)tf),
|
264
276
|
strfmt("tf(term_freq(%s:%s)=%d)", term->field, term->text, tf));
|
265
277
|
|
266
278
|
return tf_explanation;
|
267
279
|
}
|
268
280
|
|
269
|
-
void tsc_destroy(
|
281
|
+
void tsc_destroy(Scorer *self)
|
270
282
|
{
|
271
|
-
Scorer *self = (Scorer *)p;
|
272
283
|
TermScorer *ts = (TermScorer *)self->data;
|
273
284
|
if (ts->tde) ts->tde->close(ts->tde);
|
274
|
-
|
285
|
+
scorer_destroy_i(self);
|
275
286
|
}
|
276
287
|
|
277
288
|
Scorer *tsc_create(Weight *weight, TermDocEnum *tde, uchar *norms)
|
@@ -287,7 +298,7 @@ Scorer *tsc_create(Weight *weight, TermDocEnum *tde, uchar *norms)
|
|
287
298
|
ts->weight_value = weight->value;
|
288
299
|
|
289
300
|
for (i = 0; i < SCORE_CACHE_SIZE; i++) {
|
290
|
-
ts->score_cache[i] = sim_tf(self->similarity, i) * ts->weight_value;
|
301
|
+
ts->score_cache[i] = sim_tf(self->similarity, (float)i) * ts->weight_value;
|
291
302
|
}
|
292
303
|
|
293
304
|
self->score = &tsc_score;
|
data/ext/q_wildcard.c
CHANGED
@@ -11,8 +11,8 @@ char *wcq_to_s(Query *self, char *field)
|
|
11
11
|
{
|
12
12
|
char *buffer, *bptr;
|
13
13
|
Term *term = (Term *)self->data;
|
14
|
-
|
15
|
-
|
14
|
+
size_t tlen = strlen(term->text);
|
15
|
+
size_t flen = strlen(term->field);
|
16
16
|
bptr = buffer = ALLOC_N(char, tlen + flen + 35);
|
17
17
|
|
18
18
|
if (strcmp(term->field, field) != 0) {
|
@@ -77,8 +77,8 @@ Query *wcq_rewrite(Query *self, IndexReader *ir)
|
|
77
77
|
Term *term = (Term *)self->data;
|
78
78
|
char *text = term->text;
|
79
79
|
char *field = term->field;
|
80
|
-
char *first_star =
|
81
|
-
char *first_ques =
|
80
|
+
char *first_star = strrchr(text, WILD_STRING);
|
81
|
+
char *first_ques = strrchr(text, WILD_CHAR);
|
82
82
|
if (!first_star && !first_ques) {
|
83
83
|
q = tq_create(term_clone(term));
|
84
84
|
} else {
|
@@ -89,7 +89,7 @@ Query *wcq_rewrite(Query *self, IndexReader *ir)
|
|
89
89
|
char *pattern = (first_ques && first_star > first_ques)
|
90
90
|
? first_ques : first_star;
|
91
91
|
|
92
|
-
int prefix_len = pattern - text;
|
92
|
+
int prefix_len = (int)(pattern - text);
|
93
93
|
|
94
94
|
prefix_term.field = field;
|
95
95
|
prefix_term.text = (char *)EMPTY_STRING;
|
@@ -120,15 +120,23 @@ Query *wcq_rewrite(Query *self, IndexReader *ir)
|
|
120
120
|
free(prefix);
|
121
121
|
}
|
122
122
|
|
123
|
-
|
124
|
-
return self->rewritten = q;
|
123
|
+
return q;
|
125
124
|
}
|
126
125
|
|
127
|
-
void wcq_destroy(
|
126
|
+
static void wcq_destroy(Query *self)
|
128
127
|
{
|
129
|
-
Query *self = (Query *)p;
|
130
128
|
if (self->destroy_all) term_destroy((Term *)self->data);
|
131
|
-
|
129
|
+
q_destroy_i(self);
|
130
|
+
}
|
131
|
+
|
132
|
+
static uint wcq_hash(Query *self)
|
133
|
+
{
|
134
|
+
return term_hash((Term *)self->data);
|
135
|
+
}
|
136
|
+
|
137
|
+
static int wcq_eq(Query *self, Query *o)
|
138
|
+
{
|
139
|
+
return term_eq((Term *)self->data, (Term *)o->data);
|
132
140
|
}
|
133
141
|
|
134
142
|
Query *wcq_create(Term *term)
|
@@ -136,11 +144,14 @@ Query *wcq_create(Term *term)
|
|
136
144
|
Query *self = q_create();
|
137
145
|
|
138
146
|
self->data = term;
|
147
|
+
|
139
148
|
self->type = WILD_CARD_QUERY;
|
140
|
-
self->create_weight = NULL;
|
141
|
-
self->to_s = &wcq_to_s;
|
142
149
|
self->rewrite = &wcq_rewrite;
|
143
|
-
self->
|
150
|
+
self->to_s = &wcq_to_s;
|
151
|
+
self->hash = &wcq_hash;
|
152
|
+
self->eq = &wcq_eq;
|
153
|
+
self->destroy_i = &wcq_destroy;
|
154
|
+
self->create_weight_i = &q_create_weight_unsup;
|
144
155
|
|
145
156
|
return self;
|
146
157
|
}
|
data/ext/r_analysis.c
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
#include <regex.h>
|
1
2
|
#include "ferret.h"
|
2
3
|
#include "analysis.h"
|
3
4
|
#include "locale.h"
|
@@ -9,6 +10,7 @@ static VALUE cAsciiWhiteSpaceTokenizer;
|
|
9
10
|
static VALUE cWhiteSpaceTokenizer;
|
10
11
|
static VALUE cAsciiStandardTokenizer;
|
11
12
|
static VALUE cStandardTokenizer;
|
13
|
+
static VALUE cRegExpTokenizer;
|
12
14
|
|
13
15
|
static VALUE cAsciiLowerCaseFilter;
|
14
16
|
static VALUE cLowerCaseFilter;
|
@@ -23,14 +25,25 @@ static VALUE cWhiteSpaceAnalyzer;
|
|
23
25
|
static VALUE cAsciiStandardAnalyzer;
|
24
26
|
static VALUE cStandardAnalyzer;
|
25
27
|
static VALUE cPerFieldAnalyzer;
|
28
|
+
static VALUE cRegExpAnalyzer;
|
26
29
|
|
27
30
|
//static VALUE cRegexAnalyzer;
|
28
31
|
static VALUE cTokenStream;
|
29
32
|
|
33
|
+
/* TokenStream Methods */
|
30
34
|
static ID id_next;
|
31
35
|
static ID id_reset;
|
32
36
|
static ID id_clone;
|
33
37
|
|
38
|
+
/* Analyzer Methods */
|
39
|
+
static ID id_token_stream;
|
40
|
+
|
41
|
+
static VALUE object_space;
|
42
|
+
|
43
|
+
extern TokenStream *ts_create();
|
44
|
+
extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int, int,
|
45
|
+
struct re_registers *);
|
46
|
+
|
34
47
|
/****************************************************************************
|
35
48
|
*
|
36
49
|
* Utility Methods
|
@@ -111,7 +124,7 @@ frt_set_token(Token *tk, VALUE rt)
|
|
111
124
|
return tk;
|
112
125
|
}
|
113
126
|
|
114
|
-
#define GET_TK RToken *token
|
127
|
+
#define GET_TK RToken *token = (RToken *)DATA_PTR(self)
|
115
128
|
static VALUE
|
116
129
|
frt_token_init(int argc, VALUE *argv, VALUE self)
|
117
130
|
{
|
@@ -212,13 +225,12 @@ frt_ts_mark(void *p)
|
|
212
225
|
}
|
213
226
|
|
214
227
|
static void
|
215
|
-
frt_ts_free(
|
228
|
+
frt_ts_free(TokenStream *ts)
|
216
229
|
{
|
217
|
-
TokenStream *ts = (TokenStream *)p;
|
218
230
|
if (object_get(&ts->text) != Qnil) object_del(&ts->text);
|
219
231
|
if (ts->sub_ts && (object_get(&ts->sub_ts) != Qnil)) object_del(&ts->sub_ts);
|
220
232
|
object_del(ts);
|
221
|
-
|
233
|
+
ts_deref(ts);
|
222
234
|
}
|
223
235
|
|
224
236
|
static VALUE
|
@@ -273,8 +285,7 @@ frt_ts_get_text(VALUE self)
|
|
273
285
|
static VALUE
|
274
286
|
frt_ts_next(VALUE self)
|
275
287
|
{
|
276
|
-
TokenStream *ts;
|
277
|
-
Data_Get_Struct(self, TokenStream, ts);
|
288
|
+
TokenStream *ts = (TokenStream *)DATA_PTR(self);
|
278
289
|
Token *next = ts->next(ts);
|
279
290
|
if (next == NULL) {
|
280
291
|
return Qnil;
|
@@ -287,41 +298,45 @@ frt_ts_next(VALUE self)
|
|
287
298
|
* CWrappedTokenStream
|
288
299
|
****************************************************************************/
|
289
300
|
|
290
|
-
|
301
|
+
static void
|
302
|
+
cwrts_destroy(TokenStream *ts)
|
291
303
|
{
|
292
|
-
|
304
|
+
rb_hash_delete(object_space, LONG2NUM((long)ts->data));
|
293
305
|
free(ts->token);
|
294
306
|
free(ts);
|
295
307
|
}
|
296
308
|
|
297
|
-
Token *
|
309
|
+
static Token *
|
310
|
+
cwrts_next(TokenStream *ts)
|
298
311
|
{
|
299
312
|
VALUE rts = (VALUE)ts->data;
|
300
313
|
VALUE rtoken = rb_funcall(rts, id_next, 0);
|
301
314
|
return frt_set_token(ts->token, rtoken);
|
302
315
|
}
|
303
316
|
|
304
|
-
void
|
317
|
+
static void
|
318
|
+
cwrts_reset(TokenStream *ts, char *text)
|
305
319
|
{
|
306
320
|
VALUE rts = (VALUE)ts->data;
|
307
321
|
ts->t = ts->text = text;
|
308
322
|
rb_funcall(rts, id_reset, 1, rb_str_new2(text));
|
309
323
|
}
|
310
324
|
|
311
|
-
void
|
325
|
+
static void
|
326
|
+
cwrts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
|
312
327
|
{
|
313
328
|
VALUE rorig_ts = (VALUE)orig_ts->data;
|
314
329
|
new_ts->data = (void *)rb_funcall(rorig_ts, id_clone, 0);
|
315
330
|
}
|
316
331
|
|
317
332
|
static TokenStream *
|
318
|
-
|
333
|
+
frt_get_cwrapped_rts(VALUE rts)
|
319
334
|
{
|
320
335
|
TokenStream *ts;
|
321
336
|
switch (TYPE(rts)) {
|
322
337
|
case T_DATA:
|
323
338
|
Data_Get_Struct(rts, TokenStream, ts);
|
324
|
-
|
339
|
+
ref(ts);
|
325
340
|
break;
|
326
341
|
default:
|
327
342
|
ts = ALLOC(TokenStream);
|
@@ -332,12 +347,184 @@ get_cwrapped_rts(VALUE rts, bool *self_destroy)
|
|
332
347
|
ts->clone_i = &cwrts_clone_i;
|
333
348
|
ts->destroy = &cwrts_destroy;
|
334
349
|
ts->sub_ts = NULL;
|
335
|
-
|
350
|
+
// prevent from being garbage collected
|
351
|
+
rb_hash_aset(object_space, LONG2NUM(rts), rts);
|
352
|
+
ts->ref_cnt = 1;
|
336
353
|
break;
|
337
354
|
}
|
338
355
|
return ts;
|
339
356
|
}
|
340
357
|
|
358
|
+
/****************************************************************************
|
359
|
+
* RegExpTokenStream
|
360
|
+
****************************************************************************/
|
361
|
+
|
362
|
+
#define P "[_\\/.,-]"
|
363
|
+
#define HASDIGIT "\\w*\\d\\w*"
|
364
|
+
#define ALPHA "[-_[:alpha:]]"
|
365
|
+
#define ALNUM "[-_[:alnum:]]"
|
366
|
+
|
367
|
+
static char *token_re =
|
368
|
+
ALPHA "+(('" ALPHA "+)+|\\.(" ALPHA "\\.)+|"
|
369
|
+
"(@|\\&)\\w+([-.]\\w+)*|:\\/\\/" ALNUM "+([-.\\/]" ALNUM "+)*)?"
|
370
|
+
"|\\w+(([-._]\\w+)*\\@\\w+([-.]\\w+)+"
|
371
|
+
"|" P HASDIGIT "(" P "\\w+" P HASDIGIT ")*(" P "\\w+)?"
|
372
|
+
"|(\\.\\w+)+"
|
373
|
+
"|"
|
374
|
+
")";
|
375
|
+
static VALUE rtoken_re;
|
376
|
+
|
377
|
+
typedef struct RegExpTokenStream {
|
378
|
+
VALUE rtext;
|
379
|
+
VALUE regex;
|
380
|
+
VALUE proc;
|
381
|
+
int curr_ind;
|
382
|
+
} RegExpTokenStream;
|
383
|
+
|
384
|
+
static void
|
385
|
+
rets_destroy(TokenStream *ts)
|
386
|
+
{
|
387
|
+
rb_hash_delete(object_space, LONG2NUM((long)object_get(ts)));
|
388
|
+
free(ts->data);
|
389
|
+
free(ts->token);
|
390
|
+
free(ts);
|
391
|
+
}
|
392
|
+
|
393
|
+
static void
|
394
|
+
frt_rets_free(TokenStream *ts)
|
395
|
+
{
|
396
|
+
object_del(ts);
|
397
|
+
ts_deref(ts);
|
398
|
+
}
|
399
|
+
|
400
|
+
static void
|
401
|
+
frt_rets_mark(TokenStream *ts)
|
402
|
+
{
|
403
|
+
RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
|
404
|
+
rb_gc_mark(rets->rtext);
|
405
|
+
rb_gc_mark(rets->regex);
|
406
|
+
rb_gc_mark(rets->proc);
|
407
|
+
}
|
408
|
+
|
409
|
+
static VALUE
|
410
|
+
frt_rets_set_text(VALUE self, VALUE rtext)
|
411
|
+
{
|
412
|
+
TokenStream *ts;
|
413
|
+
RegExpTokenStream *rets;
|
414
|
+
Data_Get_Struct(self, TokenStream, ts);
|
415
|
+
|
416
|
+
StringValue(rtext);
|
417
|
+
rets = (RegExpTokenStream *)ts->data;
|
418
|
+
rets->rtext = rtext;
|
419
|
+
rets->curr_ind = 0;
|
420
|
+
|
421
|
+
return rtext;
|
422
|
+
}
|
423
|
+
|
424
|
+
static VALUE
|
425
|
+
frt_rets_get_text(VALUE self)
|
426
|
+
{
|
427
|
+
TokenStream *ts;
|
428
|
+
RegExpTokenStream *rets;
|
429
|
+
Data_Get_Struct(self, TokenStream, ts);
|
430
|
+
rets = (RegExpTokenStream *)ts->data;
|
431
|
+
return rets->rtext;
|
432
|
+
}
|
433
|
+
|
434
|
+
static Token *
|
435
|
+
rets_next(TokenStream *ts)
|
436
|
+
{
|
437
|
+
static struct re_registers regs;
|
438
|
+
int ret, beg, end;
|
439
|
+
RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
|
440
|
+
struct RString *rtext = RSTRING(rets->rtext);
|
441
|
+
Check_Type(rets->regex, T_REGEXP);
|
442
|
+
ret = ruby_re_search(RREGEXP(rets->regex)->ptr,
|
443
|
+
rtext->ptr, rtext->len,
|
444
|
+
rets->curr_ind, rtext->len - rets->curr_ind,
|
445
|
+
&regs);
|
446
|
+
|
447
|
+
if (ret == -2) rb_raise(rb_eStandardError, "regexp buffer overflow");
|
448
|
+
if (ret < 0) return NULL; /* not matched */
|
449
|
+
|
450
|
+
beg = regs.beg[0];
|
451
|
+
rets->curr_ind = end = regs.end[0];
|
452
|
+
if (NIL_P(rets->proc)) {
|
453
|
+
return tk_set(ts->token, rtext->ptr + beg, end - beg, beg, end, 1);
|
454
|
+
} else {
|
455
|
+
VALUE rtok = rb_str_new(rtext->ptr + beg, end - beg);
|
456
|
+
rtok = rb_funcall(rets->proc, id_call, 1, rtok);
|
457
|
+
return tk_set(ts->token, RSTRING(rtok)->ptr, RSTRING(rtok)->len, beg, end, 1);
|
458
|
+
}
|
459
|
+
}
|
460
|
+
|
461
|
+
static void
|
462
|
+
rets_reset(TokenStream *ts, char *text)
|
463
|
+
{
|
464
|
+
RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
|
465
|
+
rets->rtext = rb_str_new2(text);
|
466
|
+
rets->curr_ind = 0;
|
467
|
+
}
|
468
|
+
|
469
|
+
void
|
470
|
+
rets_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
|
471
|
+
{
|
472
|
+
RegExpTokenStream *new_rets = ALLOC(RegExpTokenStream);
|
473
|
+
RegExpTokenStream *orig_rets = (RegExpTokenStream *)orig_ts->data;
|
474
|
+
memcpy(new_rets, orig_rets, sizeof(RegExpTokenStream));
|
475
|
+
new_ts->data = new_rets;
|
476
|
+
}
|
477
|
+
|
478
|
+
static TokenStream *
|
479
|
+
rets_create(VALUE rtext, VALUE regex, VALUE proc)
|
480
|
+
{
|
481
|
+
RegExpTokenStream *rets;
|
482
|
+
TokenStream *ts;
|
483
|
+
|
484
|
+
if (rtext != Qnil) {
|
485
|
+
rtext = StringValue(rtext);
|
486
|
+
}
|
487
|
+
ts = ts_create();
|
488
|
+
ts->reset = &rets_reset;
|
489
|
+
ts->next = &rets_next;
|
490
|
+
ts->clone_i = &rets_clone_i;
|
491
|
+
ts->destroy = &rets_destroy;
|
492
|
+
ts->ref_cnt = 1;
|
493
|
+
|
494
|
+
rets = ALLOC(RegExpTokenStream);
|
495
|
+
rets->curr_ind = 0;
|
496
|
+
rets->rtext = rtext;
|
497
|
+
rets->proc = proc;
|
498
|
+
if (NIL_P(regex)) {
|
499
|
+
rets->regex = rtoken_re;
|
500
|
+
} else {
|
501
|
+
Check_Type(regex, T_REGEXP);
|
502
|
+
rets->regex = regex;
|
503
|
+
}
|
504
|
+
|
505
|
+
ts->data = rets;
|
506
|
+
|
507
|
+
return ts;
|
508
|
+
}
|
509
|
+
|
510
|
+
static VALUE
|
511
|
+
frt_rets_init(int argc, VALUE *argv, VALUE self)
|
512
|
+
{
|
513
|
+
VALUE rtext, regex, proc;
|
514
|
+
TokenStream *ts;
|
515
|
+
|
516
|
+
rb_scan_args(argc, argv, "11&", &rtext, &regex, &proc);
|
517
|
+
|
518
|
+
ts = rets_create(rtext, regex, proc);
|
519
|
+
|
520
|
+
Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
|
521
|
+
object_add(ts, self);
|
522
|
+
/* no need to add to object space as it is going to ruby space
|
523
|
+
* rb_hash_aset(object_space, LONG2NUM((long)self), self);
|
524
|
+
*/
|
525
|
+
return self;
|
526
|
+
}
|
527
|
+
|
341
528
|
/****************************************************************************
|
342
529
|
* Tokenizers
|
343
530
|
****************************************************************************/
|
@@ -394,10 +581,8 @@ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
|
|
394
581
|
static VALUE
|
395
582
|
frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
|
396
583
|
{
|
397
|
-
|
398
|
-
|
399
|
-
get_cwrapped_rts(rsub_ts, &self_destroy));
|
400
|
-
ts->destroy_sub = !self_destroy;
|
584
|
+
TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
|
585
|
+
ts = lowercase_filter_create(ts);
|
401
586
|
object_add(&ts->sub_ts, rsub_ts);
|
402
587
|
|
403
588
|
Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
|
@@ -408,10 +593,8 @@ frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
|
|
408
593
|
static VALUE
|
409
594
|
frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
|
410
595
|
{
|
411
|
-
|
412
|
-
|
413
|
-
get_cwrapped_rts(rsub_ts, &self_destroy));
|
414
|
-
ts->destroy_sub = !self_destroy;
|
596
|
+
TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
|
597
|
+
ts = mb_lowercase_filter_create(ts);
|
415
598
|
object_add(&ts->sub_ts, rsub_ts);
|
416
599
|
|
417
600
|
Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
|
@@ -423,19 +606,17 @@ static VALUE
|
|
423
606
|
frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
|
424
607
|
{
|
425
608
|
VALUE rsub_ts, rstop_words;
|
426
|
-
bool self_destroy;
|
427
609
|
TokenStream *ts;
|
428
610
|
rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
|
611
|
+
ts = frt_get_cwrapped_rts(rsub_ts);
|
429
612
|
if (rstop_words != Qnil) {
|
430
613
|
char **stop_words = get_stopwords(rstop_words);
|
431
|
-
ts = stop_filter_create_with_words(
|
432
|
-
|
614
|
+
ts = stop_filter_create_with_words(ts, (const char **)stop_words);
|
615
|
+
|
433
616
|
free(stop_words);
|
434
617
|
} else {
|
435
|
-
ts = stop_filter_create(
|
436
|
-
get_cwrapped_rts(rsub_ts, &self_destroy));
|
618
|
+
ts = stop_filter_create(ts);
|
437
619
|
}
|
438
|
-
ts->destroy_sub = !self_destroy;
|
439
620
|
object_add(&ts->sub_ts, rsub_ts);
|
440
621
|
|
441
622
|
Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
|
@@ -449,16 +630,14 @@ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
|
|
449
630
|
VALUE rsub_ts, ralgorithm, rcharenc;
|
450
631
|
char *algorithm = "english";
|
451
632
|
char *charenc = NULL;
|
452
|
-
bool self_destroy;
|
453
633
|
TokenStream *ts;
|
454
634
|
rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
|
635
|
+
ts = frt_get_cwrapped_rts(rsub_ts);
|
455
636
|
switch (argc) {
|
456
637
|
case 3: charenc = RSTRING(rb_obj_as_string(rcharenc))->ptr;
|
457
638
|
case 2: algorithm = RSTRING(rb_obj_as_string(ralgorithm))->ptr;
|
458
639
|
}
|
459
|
-
ts = stem_filter_create(
|
460
|
-
get_cwrapped_rts(rsub_ts, &self_destroy), algorithm, charenc);
|
461
|
-
ts->destroy_sub = !self_destroy;
|
640
|
+
ts = stem_filter_create(ts, algorithm, charenc);
|
462
641
|
object_add(&ts->sub_ts, rsub_ts);
|
463
642
|
|
464
643
|
Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
|
@@ -472,34 +651,49 @@ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
|
|
472
651
|
*
|
473
652
|
****************************************************************************/
|
474
653
|
|
475
|
-
|
654
|
+
/****************************************************************************
|
655
|
+
* CWrappedAnalyzer Methods
|
656
|
+
****************************************************************************/
|
657
|
+
|
658
|
+
static void
|
659
|
+
cwa_destroy(Analyzer *a)
|
660
|
+
{
|
661
|
+
rb_hash_delete(object_space, LONG2NUM((long)a->data));
|
662
|
+
a_standard_destroy(a);
|
663
|
+
}
|
664
|
+
|
665
|
+
static TokenStream *
|
666
|
+
cwa_get_ts(Analyzer *a, char *field, char *text)
|
667
|
+
{
|
668
|
+
VALUE ranalyzer = (VALUE)a->data;
|
669
|
+
VALUE rts = rb_funcall(ranalyzer, id_token_stream, 2,
|
670
|
+
rb_str_new2(field), rb_str_new2(text));
|
671
|
+
return frt_get_cwrapped_rts(rts);
|
672
|
+
}
|
673
|
+
|
674
|
+
Analyzer *
|
675
|
+
frt_get_cwrapped_analyzer(ranalyzer)
|
476
676
|
{
|
477
677
|
Analyzer *a = NULL;
|
478
678
|
switch (TYPE(ranalyzer)) {
|
479
679
|
case T_DATA:
|
480
680
|
Data_Get_Struct(ranalyzer, Analyzer, a);
|
681
|
+
ref(a);
|
481
682
|
break;
|
482
683
|
default:
|
483
|
-
|
484
|
-
//
|
485
|
-
|
486
|
-
//ts->data = (void *)rts;
|
487
|
-
//ts->next = &cwrts_next;
|
488
|
-
//ts->reset = &cwrts_reset;
|
489
|
-
//ts->clone_i = &cwrts_clone_i;
|
490
|
-
//ts->destroy = &cwrts_destroy;
|
491
|
-
//ts->sub_ts = NULL;
|
684
|
+
a = analyzer_create((void *)ranalyzer, NULL, &cwa_destroy, &cwa_get_ts);
|
685
|
+
// prevent from being garbage collected
|
686
|
+
rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
|
492
687
|
break;
|
493
688
|
}
|
494
689
|
return a;
|
495
690
|
}
|
496
691
|
|
497
692
|
static void
|
498
|
-
frt_analyzer_free(
|
693
|
+
frt_analyzer_free(Analyzer *a)
|
499
694
|
{
|
500
|
-
Analyzer *a = (Analyzer *)p;
|
501
695
|
object_del(a);
|
502
|
-
|
696
|
+
a_deref(a);
|
503
697
|
}
|
504
698
|
|
505
699
|
VALUE
|
@@ -513,13 +707,16 @@ frt_get_analyzer(Analyzer *a)
|
|
513
707
|
static VALUE
|
514
708
|
frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
|
515
709
|
{
|
516
|
-
|
710
|
+
TokenStream *ts;
|
711
|
+
Analyzer *a = (Analyzer *)DATA_PTR(self);
|
712
|
+
|
517
713
|
rfield = rb_obj_as_string(rfield);
|
518
714
|
rstring = rb_obj_as_string(rstring);
|
519
715
|
|
520
|
-
|
716
|
+
ts = a_get_new_ts(a, RSTRING(rfield)->ptr, RSTRING(rstring)->ptr);
|
521
717
|
|
522
|
-
|
718
|
+
/* Make sure that there is no entry already */
|
719
|
+
object_set(&ts->text, rstring);
|
523
720
|
return get_token_stream(ts);
|
524
721
|
}
|
525
722
|
|
@@ -533,8 +730,9 @@ frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
|
|
533
730
|
static VALUE
|
534
731
|
frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
535
732
|
{
|
733
|
+
Analyzer *a;
|
536
734
|
GET_LOWER(false);
|
537
|
-
|
735
|
+
a = whitespace_analyzer_create(lower);
|
538
736
|
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
539
737
|
object_add(a, self);
|
540
738
|
return self;
|
@@ -544,8 +742,9 @@ frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
544
742
|
static VALUE
|
545
743
|
frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
546
744
|
{
|
745
|
+
Analyzer *a;
|
547
746
|
GET_LOWER(false);
|
548
|
-
|
747
|
+
a = mb_whitespace_analyzer_create(lower);
|
549
748
|
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
550
749
|
object_add(a, self);
|
551
750
|
return self;
|
@@ -555,8 +754,9 @@ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
555
754
|
static VALUE
|
556
755
|
frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
|
557
756
|
{
|
757
|
+
Analyzer *a;
|
558
758
|
GET_LOWER(true);
|
559
|
-
|
759
|
+
a = letter_analyzer_create(lower);
|
560
760
|
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
561
761
|
object_add(a, self);
|
562
762
|
return self;
|
@@ -566,8 +766,9 @@ frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
566
766
|
static VALUE
|
567
767
|
frt_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
|
568
768
|
{
|
769
|
+
Analyzer *a;
|
569
770
|
GET_LOWER(true);
|
570
|
-
|
771
|
+
a = mb_letter_analyzer_create(lower);
|
571
772
|
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
572
773
|
object_add(a, self);
|
573
774
|
return self;
|
@@ -628,13 +829,29 @@ frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
628
829
|
return self;
|
629
830
|
}
|
630
831
|
|
832
|
+
void
|
833
|
+
frt_h_mark_values_i(void *key, void *value, void *arg)
|
834
|
+
{
|
835
|
+
frt_gc_mark(value);
|
836
|
+
}
|
837
|
+
|
838
|
+
void
|
839
|
+
frt_pfa_mark(void *p)
|
840
|
+
{
|
841
|
+
Analyzer *a = (Analyzer *)p;
|
842
|
+
PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)a->data;
|
843
|
+
frt_gc_mark(pfa->def);
|
844
|
+
h_each(pfa->dict, &frt_h_mark_values_i, NULL);
|
845
|
+
}
|
846
|
+
|
631
847
|
/*** PerFieldAnalyzer ***/
|
848
|
+
|
632
849
|
static VALUE
|
633
850
|
frt_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
|
634
851
|
{
|
635
|
-
Analyzer *def =
|
636
|
-
Analyzer *a = per_field_analyzer_create(def
|
637
|
-
Frt_Wrap_Struct(self,
|
852
|
+
Analyzer *def = frt_get_cwrapped_analyzer(ranalyzer);
|
853
|
+
Analyzer *a = per_field_analyzer_create(def);
|
854
|
+
Frt_Wrap_Struct(self, &frt_pfa_mark, &frt_analyzer_free, a);
|
638
855
|
object_add(a, self);
|
639
856
|
return self;
|
640
857
|
}
|
@@ -644,42 +861,48 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
|
|
644
861
|
{
|
645
862
|
Analyzer *pfa, *a;
|
646
863
|
Data_Get_Struct(self, Analyzer, pfa);
|
647
|
-
|
864
|
+
a = frt_get_cwrapped_analyzer(ranalyzer);
|
648
865
|
|
649
866
|
pfa_add_field(pfa, StringValuePtr(rfield), a);
|
650
867
|
return self;
|
651
868
|
}
|
652
869
|
|
870
|
+
/*** RegExpAnalyzer ***/
|
653
871
|
|
654
|
-
|
655
|
-
|
656
|
-
static VALUE
|
657
|
-
frt_regex_analyzer_init(VALUE self)
|
872
|
+
static void
|
873
|
+
frt_re_analyzer_mark(Analyzer *a)
|
658
874
|
{
|
659
|
-
|
660
|
-
// keine Ahnung warum hier das Makro und nicht Data_Wrap_Struct:
|
661
|
-
Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
|
662
|
-
// wofuer?:
|
663
|
-
object_add(a, self);
|
664
|
-
return self;
|
875
|
+
frt_gc_mark(a->current_ts);
|
665
876
|
}
|
666
877
|
|
667
|
-
|
668
|
-
|
669
|
-
static VALUE
|
670
|
-
frt_regex_analyzer_token_stream(VALUE self, VALUE field, VALUE string)
|
878
|
+
static void
|
879
|
+
re_analyzer_destroy(Analyzer *a)
|
671
880
|
{
|
672
|
-
|
673
|
-
|
674
|
-
// already freed via analyzer's free()
|
675
|
-
VALUE token_stream = Data_Wrap_Struct(cTokenStream, NULL, NULL, ts);
|
676
|
-
return token_stream;
|
881
|
+
free(a->data);
|
882
|
+
a_standard_destroy(a);
|
677
883
|
}
|
678
|
-
*/
|
679
|
-
/** /RegexAnalyzer **/
|
680
884
|
|
681
|
-
|
682
|
-
|
885
|
+
static VALUE
|
886
|
+
frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
|
887
|
+
{
|
888
|
+
VALUE lower, rets, regex, proc;
|
889
|
+
Analyzer *a;
|
890
|
+
TokenStream *ts;
|
891
|
+
rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
|
892
|
+
|
893
|
+
ts = rets_create(Qnil, regex, proc);
|
894
|
+
rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
|
895
|
+
ref(ts);
|
896
|
+
rb_hash_aset(object_space, LONG2NUM((long)rets), rets);
|
897
|
+
object_add(ts, rets);
|
898
|
+
|
899
|
+
if (lower != Qfalse) ts = mb_lowercase_filter_create(ts);
|
900
|
+
|
901
|
+
a = analyzer_create(NULL, ts, &re_analyzer_destroy, NULL);
|
902
|
+
Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
|
903
|
+
object_add(a, self);
|
904
|
+
return self;
|
905
|
+
}
|
683
906
|
|
684
907
|
/****************************************************************************
|
685
908
|
*
|
@@ -710,10 +933,17 @@ static VALUE frt_setlocale(VALUE self, VALUE locale)
|
|
710
933
|
void
|
711
934
|
Init_analysis(void)
|
712
935
|
{
|
936
|
+
/* TokenStream Methods */
|
713
937
|
id_next = rb_intern("next");
|
714
938
|
id_reset = rb_intern("text=");
|
715
939
|
id_clone = rb_intern("clone");
|
716
940
|
|
941
|
+
/* Analyzer Methods */
|
942
|
+
id_token_stream = rb_intern("token_stream");
|
943
|
+
|
944
|
+
object_space = rb_hash_new();
|
945
|
+
rb_define_const(mFerret, "OBJECT_SPACE", object_space);
|
946
|
+
|
717
947
|
/*** * * Locale stuff * * ***/
|
718
948
|
frt_locale = setlocale(LC_ALL, "");
|
719
949
|
rb_define_singleton_method(mFerret, "locale=", frt_setlocale, 1);
|
@@ -790,6 +1020,18 @@ Init_analysis(void)
|
|
790
1020
|
rb_define_method(cStandardTokenizer, "initialize",
|
791
1021
|
frt_standard_tokenizer_init, 1);
|
792
1022
|
|
1023
|
+
/*** * * RegExpTokenizer * * ***/
|
1024
|
+
cRegExpTokenizer =
|
1025
|
+
rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
|
1026
|
+
rtoken_re = rb_reg_new(token_re, strlen(token_re), 0);
|
1027
|
+
rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
|
1028
|
+
rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
|
1029
|
+
rb_define_method(cRegExpTokenizer, "initialize",
|
1030
|
+
frt_rets_init, -1);
|
1031
|
+
rb_define_method(cRegExpTokenizer, "next", frt_ts_next, 0);
|
1032
|
+
rb_define_method(cRegExpTokenizer, "text=", frt_rets_set_text, 1);
|
1033
|
+
rb_define_method(cRegExpTokenizer, "text", frt_rets_get_text, 0);
|
1034
|
+
|
793
1035
|
/***************/
|
794
1036
|
/*** Filters ***/
|
795
1037
|
/***************/
|
@@ -911,7 +1153,13 @@ Init_analysis(void)
|
|
911
1153
|
rb_define_method(cPerFieldAnalyzer, "[]=",
|
912
1154
|
frt_per_field_analyzer_add_field, 2);
|
913
1155
|
|
914
|
-
|
1156
|
+
/*** * * RegexAnalyzer * * ***/
|
1157
|
+
cRegExpAnalyzer =
|
1158
|
+
rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
|
1159
|
+
rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
|
1160
|
+
rb_define_method(cRegExpAnalyzer, "initialize",
|
1161
|
+
frt_re_analyzer_init, -1);
|
1162
|
+
|
915
1163
|
/*
|
916
1164
|
cRegexAnalyzer =
|
917
1165
|
rb_define_class_under(mAnalysis, "RegexAnalyzer", cAnalyzer);
|