ferret 0.11.6 → 0.11.8.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README +10 -22
- data/RELEASE_CHANGES +137 -0
- data/RELEASE_NOTES +60 -0
- data/Rakefile +379 -274
- data/TODO +100 -8
- data/bin/ferret-browser +0 -0
- data/ext/BZLIB_blocksort.c +1094 -0
- data/ext/BZLIB_bzlib.c +1578 -0
- data/ext/BZLIB_compress.c +672 -0
- data/ext/BZLIB_crctable.c +104 -0
- data/ext/BZLIB_decompress.c +626 -0
- data/ext/BZLIB_huffman.c +205 -0
- data/ext/BZLIB_randtable.c +84 -0
- data/ext/{api.c → STEMMER_api.c} +7 -10
- data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
- data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
- data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
- data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
- data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
- data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
- data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
- data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
- data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
- data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
- data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
- data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
- data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
- data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
- data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
- data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
- data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
- data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
- data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
- data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
- data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
- data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
- data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
- data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
- data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
- data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
- data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
- data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
- data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
- data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
- data/ext/analysis.c +276 -121
- data/ext/analysis.h +190 -143
- data/ext/api.h +3 -4
- data/ext/array.c +5 -3
- data/ext/array.h +52 -43
- data/ext/bitvector.c +38 -482
- data/ext/bitvector.h +446 -124
- data/ext/bzlib.h +282 -0
- data/ext/bzlib_private.h +503 -0
- data/ext/compound_io.c +23 -22
- data/ext/config.h +21 -11
- data/ext/document.c +43 -40
- data/ext/document.h +31 -21
- data/ext/except.c +20 -38
- data/ext/except.h +89 -76
- data/ext/extconf.rb +3 -2
- data/ext/ferret.c +49 -35
- data/ext/ferret.h +14 -11
- data/ext/field_index.c +262 -0
- data/ext/field_index.h +52 -0
- data/ext/filter.c +11 -10
- data/ext/fs_store.c +65 -47
- data/ext/global.c +245 -165
- data/ext/global.h +252 -54
- data/ext/hash.c +200 -243
- data/ext/hash.h +205 -163
- data/ext/hashset.c +118 -96
- data/ext/hashset.h +110 -82
- data/ext/header.h +19 -19
- data/ext/helper.c +11 -10
- data/ext/helper.h +14 -6
- data/ext/index.c +745 -366
- data/ext/index.h +503 -529
- data/ext/internal.h +1020 -0
- data/ext/lang.c +10 -0
- data/ext/lang.h +35 -15
- data/ext/mempool.c +5 -4
- data/ext/mempool.h +30 -22
- data/ext/modules.h +35 -7
- data/ext/multimapper.c +43 -2
- data/ext/multimapper.h +32 -23
- data/ext/posh.c +0 -0
- data/ext/posh.h +4 -38
- data/ext/priorityqueue.c +10 -12
- data/ext/priorityqueue.h +33 -21
- data/ext/q_boolean.c +22 -9
- data/ext/q_const_score.c +3 -2
- data/ext/q_filtered_query.c +15 -12
- data/ext/q_fuzzy.c +147 -135
- data/ext/q_match_all.c +3 -2
- data/ext/q_multi_term.c +28 -32
- data/ext/q_parser.c +451 -173
- data/ext/q_phrase.c +158 -79
- data/ext/q_prefix.c +16 -18
- data/ext/q_range.c +363 -31
- data/ext/q_span.c +130 -141
- data/ext/q_term.c +21 -21
- data/ext/q_wildcard.c +19 -23
- data/ext/r_analysis.c +369 -242
- data/ext/r_index.c +421 -434
- data/ext/r_qparser.c +142 -92
- data/ext/r_search.c +790 -407
- data/ext/r_store.c +44 -44
- data/ext/r_utils.c +264 -96
- data/ext/ram_store.c +29 -23
- data/ext/scanner.c +895 -0
- data/ext/scanner.h +36 -0
- data/ext/scanner_mb.c +6701 -0
- data/ext/scanner_utf8.c +4415 -0
- data/ext/search.c +210 -87
- data/ext/search.h +556 -488
- data/ext/similarity.c +17 -16
- data/ext/similarity.h +51 -44
- data/ext/sort.c +157 -354
- data/ext/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/stem_UTF_8_hungarian.h +16 -0
- data/ext/stem_UTF_8_romanian.h +16 -0
- data/ext/stem_UTF_8_turkish.h +16 -0
- data/ext/stopwords.c +287 -278
- data/ext/store.c +57 -51
- data/ext/store.h +308 -286
- data/ext/symbol.c +10 -0
- data/ext/symbol.h +23 -0
- data/ext/term_vectors.c +14 -293
- data/ext/threading.h +22 -22
- data/ext/win32.h +12 -4
- data/lib/ferret.rb +2 -1
- data/lib/ferret/browser.rb +1 -1
- data/lib/ferret/field_symbol.rb +94 -0
- data/lib/ferret/index.rb +221 -34
- data/lib/ferret/number_tools.rb +6 -6
- data/lib/ferret/version.rb +3 -0
- data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
- data/test/test_helper.rb +7 -2
- data/test/test_installed.rb +1 -0
- data/test/threading/thread_safety_index_test.rb +10 -1
- data/test/threading/thread_safety_read_write_test.rb +4 -7
- data/test/threading/thread_safety_test.rb +0 -0
- data/test/unit/analysis/tc_analyzer.rb +29 -27
- data/test/unit/analysis/tc_token_stream.rb +23 -16
- data/test/unit/index/tc_index.rb +116 -11
- data/test/unit/index/tc_index_reader.rb +27 -27
- data/test/unit/index/tc_index_writer.rb +10 -0
- data/test/unit/index/th_doc.rb +38 -21
- data/test/unit/search/tc_filter.rb +31 -10
- data/test/unit/search/tc_index_searcher.rb +6 -0
- data/test/unit/search/tm_searcher.rb +53 -1
- data/test/unit/store/tc_fs_store.rb +40 -2
- data/test/unit/store/tc_ram_store.rb +0 -0
- data/test/unit/store/tm_store.rb +0 -0
- data/test/unit/store/tm_store_lock.rb +7 -6
- data/test/unit/tc_field_symbol.rb +26 -0
- data/test/unit/ts_analysis.rb +0 -0
- data/test/unit/ts_index.rb +0 -0
- data/test/unit/ts_store.rb +0 -0
- data/test/unit/ts_utils.rb +0 -0
- data/test/unit/utils/tc_number_tools.rb +0 -0
- data/test/utils/content_generator.rb +226 -0
- metadata +262 -221
- data/ext/inc/lang.h +0 -48
- data/ext/inc/threading.h +0 -31
- data/ext/stem_ISO_8859_1_english.c +0 -1156
- data/ext/stem_ISO_8859_1_french.c +0 -1276
- data/ext/stem_ISO_8859_1_italian.c +0 -1091
- data/ext/stem_ISO_8859_1_norwegian.c +0 -296
- data/ext/stem_ISO_8859_1_spanish.c +0 -1119
- data/ext/stem_ISO_8859_1_swedish.c +0 -307
- data/ext/stem_UTF_8_danish.c +0 -344
- data/ext/stem_UTF_8_english.c +0 -1176
- data/ext/stem_UTF_8_french.c +0 -1296
- data/ext/stem_UTF_8_italian.c +0 -1113
- data/ext/stem_UTF_8_norwegian.c +0 -302
- data/ext/stem_UTF_8_portuguese.c +0 -1055
- data/ext/stem_UTF_8_russian.c +0 -709
- data/ext/stem_UTF_8_spanish.c +0 -1137
- data/ext/stem_UTF_8_swedish.c +0 -313
- data/lib/ferret_version.rb +0 -3
data/ext/q_term.c
CHANGED
@@ -1,5 +1,7 @@
|
|
1
|
+
#include "symbol.h"
|
1
2
|
#include <string.h>
|
2
3
|
#include "search.h"
|
4
|
+
#include "internal.h"
|
3
5
|
|
4
6
|
#define TQ(query) ((TermQuery *)(query))
|
5
7
|
#define TSc(scorer) ((TermScorer *)(scorer))
|
@@ -103,7 +105,7 @@ static Explanation *tsc_explain(Scorer *self, int doc_num)
|
|
103
105
|
}
|
104
106
|
return expl_new(sim_tf(self->similarity, (float)tf),
|
105
107
|
"tf(term_freq(%s:%s)=%d)",
|
106
|
-
TQ(query)->field, TQ(query)->term, tf);
|
108
|
+
S(TQ(query)->field), TQ(query)->term, tf);
|
107
109
|
}
|
108
110
|
|
109
111
|
static void tsc_destroy(Scorer *self)
|
@@ -144,9 +146,8 @@ static Scorer *tw_scorer(Weight *self, IndexReader *ir)
|
|
144
146
|
{
|
145
147
|
TermQuery *tq = TQ(self->query);
|
146
148
|
TermDocEnum *tde = ir_term_docs_for(ir, tq->field, tq->term);
|
147
|
-
|
148
|
-
|
149
|
-
}
|
149
|
+
/* ir_term_docs_for should always return a TermDocEnum */
|
150
|
+
assert(NULL != tde);
|
150
151
|
|
151
152
|
return tsc_new(self, tde, ir_get_norms(ir, tq->field));
|
152
153
|
}
|
@@ -161,10 +162,9 @@ static Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
161
162
|
float field_norm;
|
162
163
|
Explanation *field_norm_expl;
|
163
164
|
|
164
|
-
char *query_str = self->query->to_s(self->query,
|
165
|
+
char *query_str = self->query->to_s(self->query, NULL);
|
165
166
|
TermQuery *tq = TQ(self->query);
|
166
167
|
char *term = tq->term;
|
167
|
-
char *field = tq->field;
|
168
168
|
|
169
169
|
Explanation *expl = expl_new(0.0, "weight(%s in %d), product of:",
|
170
170
|
query_str, doc_num);
|
@@ -172,9 +172,9 @@ static Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
172
172
|
/* We need two of these as it's included in both the query explanation
|
173
173
|
* and the field explanation */
|
174
174
|
Explanation *idf_expl1 = expl_new(self->idf, "idf(doc_freq=%d)",
|
175
|
-
ir_doc_freq(ir, field, term));
|
175
|
+
ir_doc_freq(ir, tq->field, term));
|
176
176
|
Explanation *idf_expl2 = expl_new(self->idf, "idf(doc_freq=%d)",
|
177
|
-
ir_doc_freq(ir, field, term));
|
177
|
+
ir_doc_freq(ir, tq->field, term));
|
178
178
|
|
179
179
|
/* explain query weight */
|
180
180
|
Explanation *query_expl = expl_new(0.0, "query_weight(%s), product of:",
|
@@ -197,7 +197,7 @@ static Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
197
197
|
|
198
198
|
/* explain field weight */
|
199
199
|
field_expl = expl_new(0.0, "field_weight(%s:%s in %d), product of:",
|
200
|
-
field, term, doc_num);
|
200
|
+
S(tq->field), term, doc_num);
|
201
201
|
|
202
202
|
scorer = self->scorer(self, ir);
|
203
203
|
tf_expl = scorer->explain(scorer, doc_num);
|
@@ -205,12 +205,12 @@ static Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
|
|
205
205
|
expl_add_detail(field_expl, tf_expl);
|
206
206
|
expl_add_detail(field_expl, idf_expl2);
|
207
207
|
|
208
|
-
field_norms = ir_get_norms(ir, field);
|
208
|
+
field_norms = ir_get_norms(ir, tq->field);
|
209
209
|
field_norm = (field_norms
|
210
210
|
? sim_decode_norm(self->similarity, field_norms[doc_num])
|
211
211
|
: (float)0.0);
|
212
212
|
field_norm_expl = expl_new(field_norm, "field_norm(field=%s, doc=%d)",
|
213
|
-
field, doc_num);
|
213
|
+
S(tq->field), doc_num);
|
214
214
|
|
215
215
|
expl_add_detail(field_expl, field_norm_expl);
|
216
216
|
|
@@ -259,18 +259,18 @@ static Weight *tw_new(Query *query, Searcher *searcher)
|
|
259
259
|
static void tq_destroy(Query *self)
|
260
260
|
{
|
261
261
|
free(TQ(self)->term);
|
262
|
-
free(TQ(self)->field);
|
263
262
|
q_destroy_i(self);
|
264
263
|
}
|
265
264
|
|
266
|
-
static char *tq_to_s(Query *self,
|
265
|
+
static char *tq_to_s(Query *self, Symbol default_field)
|
267
266
|
{
|
268
|
-
|
267
|
+
const char *field = S(TQ(self)->field);
|
268
|
+
size_t flen = strlen(field);
|
269
269
|
size_t tlen = strlen(TQ(self)->term);
|
270
270
|
char *buffer = ALLOC_N(char, 34 + flen + tlen);
|
271
271
|
char *b = buffer;
|
272
|
-
if (
|
273
|
-
memcpy(b,
|
272
|
+
if (default_field != TQ(self)->field) {
|
273
|
+
memcpy(b, field, sizeof(char) * flen);
|
274
274
|
b[flen] = ':';
|
275
275
|
b += flen + 1;
|
276
276
|
}
|
@@ -291,19 +291,19 @@ static void tq_extract_terms(Query *self, HashSet *terms)
|
|
291
291
|
|
292
292
|
static unsigned long tq_hash(Query *self)
|
293
293
|
{
|
294
|
-
return str_hash(TQ(self)->term) ^
|
294
|
+
return str_hash(TQ(self)->term) ^ sym_hash(TQ(self)->field);
|
295
295
|
}
|
296
296
|
|
297
297
|
static int tq_eq(Query *self, Query *o)
|
298
298
|
{
|
299
299
|
return (strcmp(TQ(self)->term, TQ(o)->term) == 0)
|
300
|
-
&& (
|
300
|
+
&& (TQ(self)->field == TQ(o)->field);
|
301
301
|
}
|
302
302
|
|
303
303
|
static MatchVector *tq_get_matchv_i(Query *self, MatchVector *mv,
|
304
304
|
TermVector *tv)
|
305
305
|
{
|
306
|
-
if (
|
306
|
+
if (tv->field == TQ(self)->field) {
|
307
307
|
int i;
|
308
308
|
TVTerm *tv_term = tv_get_tv_term(tv, TQ(self)->term);
|
309
309
|
if (tv_term) {
|
@@ -316,11 +316,11 @@ static MatchVector *tq_get_matchv_i(Query *self, MatchVector *mv,
|
|
316
316
|
return mv;
|
317
317
|
}
|
318
318
|
|
319
|
-
Query *tq_new(
|
319
|
+
Query *tq_new(Symbol field, const char *term)
|
320
320
|
{
|
321
321
|
Query *self = q_new(TermQuery);
|
322
322
|
|
323
|
-
TQ(self)->field =
|
323
|
+
TQ(self)->field = field;
|
324
324
|
TQ(self)->term = estrdup(term);
|
325
325
|
|
326
326
|
self->type = TERM_QUERY;
|
data/ext/q_wildcard.c
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
#include <string.h>
|
2
2
|
#include "search.h"
|
3
|
+
#include "symbol.h"
|
4
|
+
#include "internal.h"
|
3
5
|
|
4
6
|
/****************************************************************************
|
5
7
|
*
|
@@ -9,21 +11,17 @@
|
|
9
11
|
|
10
12
|
#define WCQ(query) ((WildCardQuery *)(query))
|
11
13
|
|
12
|
-
static char *wcq_to_s(Query *self,
|
14
|
+
static char *wcq_to_s(Query *self, Symbol default_field)
|
13
15
|
{
|
14
16
|
char *buffer, *bptr;
|
15
|
-
const char *
|
17
|
+
const char *field_str = S(WCQ(self)->field);
|
16
18
|
const char *pattern = WCQ(self)->pattern;
|
17
|
-
|
18
|
-
size_t plen = strlen(pattern);
|
19
|
-
bptr = buffer = ALLOC_N(char, plen + flen + 35);
|
19
|
+
bptr = buffer = ALLOC_N(char, strlen(pattern) + strlen(field_str) + 35);
|
20
20
|
|
21
|
-
if (
|
22
|
-
sprintf(bptr, "%s:",
|
23
|
-
bptr += flen + 1;
|
21
|
+
if (WCQ(self)->field != default_field) {
|
22
|
+
bptr += sprintf(bptr, "%s:", field_str);
|
24
23
|
}
|
25
|
-
sprintf(bptr, "%s", pattern);
|
26
|
-
bptr += plen;
|
24
|
+
bptr += sprintf(bptr, "%s", pattern);
|
27
25
|
|
28
26
|
if (self->boost != 1.0) {
|
29
27
|
*bptr = '^';
|
@@ -35,7 +33,7 @@ static char *wcq_to_s(Query *self, const char *current_field)
|
|
35
33
|
|
36
34
|
bool wc_match(const char *pattern, const char *text)
|
37
35
|
{
|
38
|
-
const char *p = pattern, *t = text, *xt;
|
36
|
+
const char *p = pattern, *t = text, *xt;
|
39
37
|
|
40
38
|
/* include '\0' as we need to match empty string */
|
41
39
|
const char *text_last = t + strlen(t);
|
@@ -83,18 +81,17 @@ bool wc_match(const char *pattern, const char *text)
|
|
83
81
|
static Query *wcq_rewrite(Query *self, IndexReader *ir)
|
84
82
|
{
|
85
83
|
Query *q;
|
86
|
-
const char *field = WCQ(self)->field;
|
87
84
|
const char *pattern = WCQ(self)->pattern;
|
88
85
|
const char *first_star = strchr(pattern, WILD_STRING);
|
89
86
|
const char *first_ques = strchr(pattern, WILD_CHAR);
|
90
87
|
|
91
88
|
if (NULL == first_star && NULL == first_ques) {
|
92
|
-
q = tq_new(field, pattern);
|
89
|
+
q = tq_new(WCQ(self)->field, pattern);
|
93
90
|
q->boost = self->boost;
|
94
91
|
}
|
95
92
|
else {
|
96
|
-
const int field_num = fis_get_field_num(ir->fis, field);
|
97
|
-
q = multi_tq_new_conf(field, MTQMaxTerms(self), 0.0);
|
93
|
+
const int field_num = fis_get_field_num(ir->fis, WCQ(self)->field);
|
94
|
+
q = multi_tq_new_conf(WCQ(self)->field, MTQMaxTerms(self), 0.0);
|
98
95
|
|
99
96
|
if (field_num >= 0) {
|
100
97
|
TermEnum *te;
|
@@ -116,8 +113,8 @@ static Query *wcq_rewrite(Query *self, IndexReader *ir)
|
|
116
113
|
if (te != NULL) {
|
117
114
|
const char *term = te->curr_term;
|
118
115
|
const char *pat_term = term + prefix_len;
|
119
|
-
do {
|
120
|
-
if (prefix && strncmp(term, prefix, prefix_len) != 0) {
|
116
|
+
do {
|
117
|
+
if (prefix[0] && strncmp(term, prefix, prefix_len) != 0) {
|
121
118
|
break;
|
122
119
|
}
|
123
120
|
|
@@ -135,27 +132,26 @@ static Query *wcq_rewrite(Query *self, IndexReader *ir)
|
|
135
132
|
|
136
133
|
static void wcq_destroy(Query *self)
|
137
134
|
{
|
138
|
-
free(WCQ(self)->field);
|
139
135
|
free(WCQ(self)->pattern);
|
140
136
|
q_destroy_i(self);
|
141
137
|
}
|
142
138
|
|
143
139
|
static unsigned long wcq_hash(Query *self)
|
144
140
|
{
|
145
|
-
return
|
141
|
+
return sym_hash(WCQ(self)->field) ^ str_hash(WCQ(self)->pattern);
|
146
142
|
}
|
147
143
|
|
148
144
|
static int wcq_eq(Query *self, Query *o)
|
149
145
|
{
|
150
|
-
return (strcmp(WCQ(self)->pattern, WCQ(o)->pattern) == 0)
|
151
|
-
&& (
|
146
|
+
return (strcmp(WCQ(self)->pattern, WCQ(o)->pattern) == 0)
|
147
|
+
&& (WCQ(self)->field == WCQ(o)->field);
|
152
148
|
}
|
153
149
|
|
154
|
-
Query *wcq_new(
|
150
|
+
Query *wcq_new(Symbol field, const char *pattern)
|
155
151
|
{
|
156
152
|
Query *self = q_new(WildCardQuery);
|
157
153
|
|
158
|
-
WCQ(self)->field =
|
154
|
+
WCQ(self)->field = field;
|
159
155
|
WCQ(self)->pattern = estrdup(pattern);
|
160
156
|
MTQMaxTerms(self) = WILD_CARD_QUERY_MAX_TERMS;
|
161
157
|
|
data/ext/r_analysis.c
CHANGED
@@ -1,10 +1,19 @@
|
|
1
|
-
#include
|
1
|
+
#include "lang.h"
|
2
|
+
#ifdef FRT_RUBY_VERSION_1_9
|
3
|
+
# include <ruby/re.h>
|
4
|
+
#else
|
5
|
+
# include <regex.h>
|
6
|
+
#endif
|
2
7
|
#include <locale.h>
|
3
|
-
#
|
8
|
+
#ifdef FRT_RUBY_VERSION_1_9
|
9
|
+
# include <ruby/st.h>
|
10
|
+
#else
|
11
|
+
# include <st.h>
|
12
|
+
#endif
|
4
13
|
#include "ferret.h"
|
5
14
|
#include "analysis.h"
|
6
15
|
|
7
|
-
static char *
|
16
|
+
static char *frb_locale = NULL;
|
8
17
|
|
9
18
|
static VALUE mAnalysis;
|
10
19
|
|
@@ -47,13 +56,19 @@ static ID id_token_stream;
|
|
47
56
|
|
48
57
|
static VALUE object_space;
|
49
58
|
|
59
|
+
#ifndef FRT_RUBY_VERSION_1_9
|
50
60
|
extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int,
|
51
61
|
int, struct re_registers *);
|
62
|
+
#endif
|
52
63
|
|
53
64
|
int
|
54
|
-
|
65
|
+
frb_rb_hash_size(VALUE hash)
|
55
66
|
{
|
67
|
+
#ifdef FRT_RUBY_VERSION_1_9
|
68
|
+
return RHASH(hash)->ntbl->num_entries;
|
69
|
+
#else
|
56
70
|
return RHASH(hash)->tbl->num_entries;
|
71
|
+
#endif
|
57
72
|
}
|
58
73
|
|
59
74
|
/****************************************************************************
|
@@ -69,11 +84,11 @@ get_stopwords(VALUE rstop_words)
|
|
69
84
|
int i, len;
|
70
85
|
VALUE rstr;
|
71
86
|
Check_Type(rstop_words, T_ARRAY);
|
72
|
-
len =
|
73
|
-
stop_words = ALLOC_N(char *,
|
87
|
+
len = RARRAY_LEN(rstop_words);
|
88
|
+
stop_words = ALLOC_N(char *, RARRAY_LEN(rstop_words) + 1);
|
74
89
|
stop_words[len] = NULL;
|
75
90
|
for (i = 0; i < len; i++) {
|
76
|
-
rstr = rb_obj_as_string(
|
91
|
+
rstr = rb_obj_as_string(RARRAY_PTR(rstop_words)[i]);
|
77
92
|
stop_words[i] = rs2s(rstr);
|
78
93
|
}
|
79
94
|
return stop_words;
|
@@ -93,22 +108,22 @@ typedef struct RToken {
|
|
93
108
|
} RToken;
|
94
109
|
|
95
110
|
static void
|
96
|
-
|
111
|
+
frb_token_free(void *p)
|
97
112
|
{
|
98
113
|
free(p);
|
99
114
|
}
|
100
115
|
|
101
116
|
static void
|
102
|
-
|
117
|
+
frb_token_mark(void *p)
|
103
118
|
{
|
104
119
|
RToken *token = (RToken *)p;
|
105
120
|
rb_gc_mark(token->text);
|
106
121
|
}
|
107
122
|
|
108
123
|
static VALUE
|
109
|
-
|
124
|
+
frb_token_alloc(VALUE klass)
|
110
125
|
{
|
111
|
-
return Data_Wrap_Struct(klass, &
|
126
|
+
return Data_Wrap_Struct(klass, &frb_token_mark, &frb_token_free,
|
112
127
|
ALLOC(RToken));
|
113
128
|
}
|
114
129
|
|
@@ -121,18 +136,18 @@ get_token(Token *tk)
|
|
121
136
|
token->start = tk->start;
|
122
137
|
token->end = tk->end;
|
123
138
|
token->pos_inc = tk->pos_inc;
|
124
|
-
return Data_Wrap_Struct(cToken, &
|
139
|
+
return Data_Wrap_Struct(cToken, &frb_token_mark, &frb_token_free, token);
|
125
140
|
}
|
126
141
|
|
127
142
|
Token *
|
128
|
-
|
143
|
+
frb_set_token(Token *tk, VALUE rt)
|
129
144
|
{
|
130
145
|
RToken *rtk;
|
131
146
|
|
132
147
|
if (rt == Qnil) return NULL;
|
133
148
|
|
134
149
|
Data_Get_Struct(rt, RToken, rtk);
|
135
|
-
tk_set(tk, rs2s(rtk->text),
|
150
|
+
tk_set(tk, rs2s(rtk->text), RSTRING_LEN(rtk->text),
|
136
151
|
rtk->start, rtk->end, rtk->pos_inc);
|
137
152
|
return tk;
|
138
153
|
}
|
@@ -171,7 +186,7 @@ frt_set_token(Token *tk, VALUE rt)
|
|
171
186
|
* return:: a newly created and assigned Token object
|
172
187
|
*/
|
173
188
|
static VALUE
|
174
|
-
|
189
|
+
frb_token_init(int argc, VALUE *argv, VALUE self)
|
175
190
|
{
|
176
191
|
RToken *token;
|
177
192
|
VALUE rtext, rstart, rend, rpos_inc, rtype;
|
@@ -201,7 +216,7 @@ frt_token_init(int argc, VALUE *argv, VALUE self)
|
|
201
216
|
* lexically by the token text.
|
202
217
|
*/
|
203
218
|
static VALUE
|
204
|
-
|
219
|
+
frb_token_cmp(VALUE self, VALUE rother)
|
205
220
|
{
|
206
221
|
RToken *token, *other;
|
207
222
|
int cmp;
|
@@ -230,7 +245,7 @@ frt_token_cmp(VALUE self, VALUE rother)
|
|
230
245
|
* Returns the text that this token represents
|
231
246
|
*/
|
232
247
|
static VALUE
|
233
|
-
|
248
|
+
frb_token_get_text(VALUE self)
|
234
249
|
{
|
235
250
|
RToken *token;
|
236
251
|
GET_TK(token, self);
|
@@ -244,7 +259,7 @@ frt_token_get_text(VALUE self)
|
|
244
259
|
* Set the text for this token.
|
245
260
|
*/
|
246
261
|
static VALUE
|
247
|
-
|
262
|
+
frb_token_set_text(VALUE self, VALUE rtext)
|
248
263
|
{
|
249
264
|
RToken *token;
|
250
265
|
GET_TK(token, self);
|
@@ -259,7 +274,7 @@ frt_token_set_text(VALUE self, VALUE rtext)
|
|
259
274
|
* Start byte-position of this token
|
260
275
|
*/
|
261
276
|
static VALUE
|
262
|
-
|
277
|
+
frb_token_get_start_offset(VALUE self)
|
263
278
|
{
|
264
279
|
RToken *token;
|
265
280
|
GET_TK(token, self);
|
@@ -273,7 +288,7 @@ frt_token_get_start_offset(VALUE self)
|
|
273
288
|
* End byte-position of this token
|
274
289
|
*/
|
275
290
|
static VALUE
|
276
|
-
|
291
|
+
frb_token_get_end_offset(VALUE self)
|
277
292
|
{
|
278
293
|
RToken *token;
|
279
294
|
GET_TK(token, self);
|
@@ -287,7 +302,7 @@ frt_token_get_end_offset(VALUE self)
|
|
287
302
|
* Position Increment for this token
|
288
303
|
*/
|
289
304
|
static VALUE
|
290
|
-
|
305
|
+
frb_token_get_pos_inc(VALUE self)
|
291
306
|
{
|
292
307
|
RToken *token;
|
293
308
|
GET_TK(token, self);
|
@@ -301,7 +316,7 @@ frt_token_get_pos_inc(VALUE self)
|
|
301
316
|
* Set start byte-position of this token
|
302
317
|
*/
|
303
318
|
static VALUE
|
304
|
-
|
319
|
+
frb_token_set_start_offset(VALUE self, VALUE rstart)
|
305
320
|
{
|
306
321
|
RToken *token;
|
307
322
|
GET_TK(token, self);
|
@@ -316,7 +331,7 @@ frt_token_set_start_offset(VALUE self, VALUE rstart)
|
|
316
331
|
* Set end byte-position of this token
|
317
332
|
*/
|
318
333
|
static VALUE
|
319
|
-
|
334
|
+
frb_token_set_end_offset(VALUE self, VALUE rend)
|
320
335
|
{
|
321
336
|
RToken *token;
|
322
337
|
GET_TK(token, self);
|
@@ -352,7 +367,7 @@ frt_token_set_end_offset(VALUE self, VALUE rend)
|
|
352
367
|
*
|
353
368
|
*/
|
354
369
|
static VALUE
|
355
|
-
|
370
|
+
frb_token_set_pos_inc(VALUE self, VALUE rpos_inc)
|
356
371
|
{
|
357
372
|
RToken *token;
|
358
373
|
GET_TK(token, self);
|
@@ -367,12 +382,12 @@ frt_token_set_pos_inc(VALUE self, VALUE rpos_inc)
|
|
367
382
|
* Return a string representation of the token
|
368
383
|
*/
|
369
384
|
static VALUE
|
370
|
-
|
385
|
+
frb_token_to_s(VALUE self)
|
371
386
|
{
|
372
387
|
RToken *token;
|
373
388
|
char *buf;
|
374
389
|
GET_TK(token, self);
|
375
|
-
buf = alloca(
|
390
|
+
buf = alloca(RSTRING_LEN(token->text) + 80);
|
376
391
|
sprintf(buf, "token[\"%s\":%d:%d:%d]", rs2s(token->text),
|
377
392
|
token->start, token->end, token->pos_inc);
|
378
393
|
return rb_str_new2(buf);
|
@@ -387,14 +402,14 @@ frt_token_to_s(VALUE self)
|
|
387
402
|
#define GET_TS(ts, self) Data_Get_Struct(self, TokenStream, ts)
|
388
403
|
|
389
404
|
static void
|
390
|
-
|
405
|
+
frb_ts_mark(void *p)
|
391
406
|
{
|
392
407
|
TokenStream *ts = (TokenStream *)p;
|
393
|
-
if (ts->text)
|
408
|
+
if (ts->text) frb_gc_mark(&ts->text);
|
394
409
|
}
|
395
410
|
|
396
411
|
static void
|
397
|
-
|
412
|
+
frb_ts_free(TokenStream *ts)
|
398
413
|
{
|
399
414
|
if (object_get(&ts->text) != Qnil) {
|
400
415
|
object_del(&ts->text);
|
@@ -403,8 +418,8 @@ frt_ts_free(TokenStream *ts)
|
|
403
418
|
ts_deref(ts);
|
404
419
|
}
|
405
420
|
|
406
|
-
static void
|
407
|
-
static void
|
421
|
+
static void frb_rets_free(TokenStream *ts);
|
422
|
+
static void frb_rets_mark(TokenStream *ts);
|
408
423
|
static Token *rets_next(TokenStream *ts);
|
409
424
|
|
410
425
|
static VALUE
|
@@ -413,11 +428,11 @@ get_rb_token_stream(TokenStream *ts)
|
|
413
428
|
VALUE rts = object_get(ts);
|
414
429
|
if (rts == Qnil) {
|
415
430
|
if (ts->next == &rets_next) {
|
416
|
-
rts = Data_Wrap_Struct(cTokenStream, &
|
417
|
-
&
|
431
|
+
rts = Data_Wrap_Struct(cTokenStream, &frb_rets_mark,
|
432
|
+
&frb_rets_free, ts);
|
418
433
|
} else {
|
419
|
-
rts = Data_Wrap_Struct(cTokenStream, &
|
420
|
-
&
|
434
|
+
rts = Data_Wrap_Struct(cTokenStream, &frb_ts_mark,
|
435
|
+
&frb_ts_free, ts);
|
421
436
|
}
|
422
437
|
object_add(ts, rts);
|
423
438
|
}
|
@@ -429,7 +444,7 @@ get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
|
|
429
444
|
{
|
430
445
|
StringValue(rstr);
|
431
446
|
ts->reset(ts, rs2s(rstr));
|
432
|
-
Frt_Wrap_Struct(self, &
|
447
|
+
Frt_Wrap_Struct(self, &frb_ts_mark, &frb_ts_free, ts);
|
433
448
|
object_add(&ts->text, rstr);
|
434
449
|
object_add(ts, self);
|
435
450
|
return self;
|
@@ -445,7 +460,7 @@ get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
|
|
445
460
|
* token_stream.text = File.read(file_name)
|
446
461
|
*/
|
447
462
|
static VALUE
|
448
|
-
|
463
|
+
frb_ts_set_text(VALUE self, VALUE rtext)
|
449
464
|
{
|
450
465
|
TokenStream *ts;
|
451
466
|
Data_Get_Struct(self, TokenStream, ts);
|
@@ -465,7 +480,7 @@ frt_ts_set_text(VALUE self, VALUE rtext)
|
|
465
480
|
* Return the text that the TokenStream is tokenizing
|
466
481
|
*/
|
467
482
|
static VALUE
|
468
|
-
|
483
|
+
frb_ts_get_text(VALUE self)
|
469
484
|
{
|
470
485
|
VALUE rtext = Qnil;
|
471
486
|
TokenStream *ts;
|
@@ -487,7 +502,7 @@ frt_ts_get_text(VALUE self)
|
|
487
502
|
* tokens.
|
488
503
|
*/
|
489
504
|
static VALUE
|
490
|
-
|
505
|
+
frb_ts_next(VALUE self)
|
491
506
|
{
|
492
507
|
TokenStream *ts;
|
493
508
|
Token *next;
|
@@ -507,16 +522,16 @@ frt_ts_next(VALUE self)
|
|
507
522
|
#define TkFilt(filter) ((TokenFilter *)(filter))
|
508
523
|
|
509
524
|
static void
|
510
|
-
|
525
|
+
frb_tf_mark(void *p)
|
511
526
|
{
|
512
527
|
TokenStream *ts = (TokenStream *)p;
|
513
528
|
if (TkFilt(ts)->sub_ts) {
|
514
|
-
|
529
|
+
frb_gc_mark(&TkFilt(ts)->sub_ts);
|
515
530
|
}
|
516
531
|
}
|
517
532
|
|
518
533
|
static void
|
519
|
-
|
534
|
+
frb_tf_free(TokenStream *ts)
|
520
535
|
{
|
521
536
|
if (TkFilt(ts)->sub_ts && (object_get(&TkFilt(ts)->sub_ts) != Qnil)) {
|
522
537
|
object_del(&TkFilt(ts)->sub_ts);
|
@@ -545,7 +560,7 @@ cwrts_destroy_i(TokenStream *ts)
|
|
545
560
|
object_del(&ts->text);
|
546
561
|
}
|
547
562
|
rb_hash_delete(object_space, ((VALUE)ts)|1);
|
548
|
-
/*printf("rb_hash_size = %d\n",
|
563
|
+
/*printf("rb_hash_size = %d\n", frb_rb_hash_size(object_space)); */
|
549
564
|
free(ts);
|
550
565
|
}
|
551
566
|
|
@@ -553,7 +568,7 @@ static Token *
|
|
553
568
|
cwrts_next(TokenStream *ts)
|
554
569
|
{
|
555
570
|
VALUE rtoken = rb_funcall(CWTS(ts)->rts, id_next, 0);
|
556
|
-
return
|
571
|
+
return frb_set_token(&(CachedTS(ts)->token), rtoken);
|
557
572
|
}
|
558
573
|
|
559
574
|
static TokenStream *
|
@@ -574,10 +589,10 @@ cwrts_clone_i(TokenStream *orig_ts)
|
|
574
589
|
}
|
575
590
|
|
576
591
|
static TokenStream *
|
577
|
-
|
592
|
+
frb_get_cwrapped_rts(VALUE rts)
|
578
593
|
{
|
579
594
|
TokenStream *ts;
|
580
|
-
if (
|
595
|
+
if (frb_is_cclass(rts) && DATA_PTR(rts)) {
|
581
596
|
GET_TS(ts, rts);
|
582
597
|
REF(ts);
|
583
598
|
}
|
@@ -621,7 +636,7 @@ typedef struct RegExpTokenStream {
|
|
621
636
|
VALUE rtext;
|
622
637
|
VALUE regex;
|
623
638
|
VALUE proc;
|
624
|
-
|
639
|
+
long curr_ind;
|
625
640
|
} RegExpTokenStream;
|
626
641
|
|
627
642
|
static void
|
@@ -631,12 +646,12 @@ rets_destroy_i(TokenStream *ts)
|
|
631
646
|
object_del(&ts->text);
|
632
647
|
}
|
633
648
|
rb_hash_delete(object_space, ((VALUE)ts)|1);
|
634
|
-
/*printf("rb_hash_size = %d\n",
|
649
|
+
/*printf("rb_hash_size = %d\n", frb_rb_hash_size(object_space)); */
|
635
650
|
free(ts);
|
636
651
|
}
|
637
652
|
|
638
653
|
static void
|
639
|
-
|
654
|
+
frb_rets_free(TokenStream *ts)
|
640
655
|
{
|
641
656
|
if (object_get(&ts->text) != Qnil) {
|
642
657
|
object_del(&ts->text);
|
@@ -646,9 +661,9 @@ frt_rets_free(TokenStream *ts)
|
|
646
661
|
}
|
647
662
|
|
648
663
|
static void
|
649
|
-
|
664
|
+
frb_rets_mark(TokenStream *ts)
|
650
665
|
{
|
651
|
-
if (ts->text)
|
666
|
+
if (ts->text) frb_gc_mark(&ts->text);
|
652
667
|
rb_gc_mark(RETS(ts)->rtext);
|
653
668
|
rb_gc_mark(RETS(ts)->regex);
|
654
669
|
rb_gc_mark(RETS(ts)->proc);
|
@@ -662,7 +677,7 @@ frt_rets_mark(TokenStream *ts)
|
|
662
677
|
* tokenize the text from the beginning.
|
663
678
|
*/
|
664
679
|
static VALUE
|
665
|
-
|
680
|
+
frb_rets_set_text(VALUE self, VALUE rtext)
|
666
681
|
{
|
667
682
|
TokenStream *ts;
|
668
683
|
GET_TS(ts, self);
|
@@ -682,23 +697,88 @@ frt_rets_set_text(VALUE self, VALUE rtext)
|
|
682
697
|
* Get the text being tokenized by the tokenizer.
|
683
698
|
*/
|
684
699
|
static VALUE
|
685
|
-
|
700
|
+
frb_rets_get_text(VALUE self)
|
686
701
|
{
|
687
702
|
TokenStream *ts;
|
688
703
|
GET_TS(ts, self);
|
689
704
|
return RETS(ts)->rtext;
|
690
705
|
}
|
691
706
|
|
707
|
+
#ifdef FRT_RUBY_VERSION_1_9
|
708
|
+
|
709
|
+
// partly lifted from ruby 1.9 string.c
|
710
|
+
#include <ruby/encoding.h>
|
711
|
+
#define BEG(no) regs->beg[no]
|
712
|
+
#define END(no) regs->end[no]
|
713
|
+
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
|
714
|
+
static VALUE
|
715
|
+
scan_once(VALUE str, VALUE pat, long *start)
|
716
|
+
{
|
717
|
+
VALUE match;
|
718
|
+
struct re_registers *regs;
|
719
|
+
|
720
|
+
if (rb_reg_search(pat, str, *start, 0) >= 0) {
|
721
|
+
match = rb_backref_get();
|
722
|
+
regs = RMATCH_REGS(match);
|
723
|
+
if (BEG(0) == END(0)) {
|
724
|
+
rb_encoding *enc = STR_ENC_GET(str);
|
725
|
+
/*
|
726
|
+
* Always consume at least one character of the input string
|
727
|
+
*/
|
728
|
+
if (RSTRING_LEN(str) > END(0))
|
729
|
+
*start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),
|
730
|
+
RSTRING_END(str), enc);
|
731
|
+
else
|
732
|
+
*start = END(0)+1;
|
733
|
+
}
|
734
|
+
else {
|
735
|
+
*start = END(0);
|
736
|
+
}
|
737
|
+
return rb_reg_nth_match(0, match);
|
738
|
+
}
|
739
|
+
return Qnil;
|
740
|
+
}
|
741
|
+
//
|
742
|
+
|
743
|
+
static Token *
|
744
|
+
rets_next(TokenStream *ts)
|
745
|
+
{
|
746
|
+
VALUE ret;
|
747
|
+
long rtok_len;
|
748
|
+
int beg, end;
|
749
|
+
Check_Type(RETS(ts)->regex, T_REGEXP);
|
750
|
+
ret = scan_once(RETS(ts)->rtext, RETS(ts)->regex, &(RETS(ts)->curr_ind));
|
751
|
+
if (NIL_P(ret)) return NULL;
|
752
|
+
|
753
|
+
Check_Type(ret, T_STRING);
|
754
|
+
rtok_len = RSTRING_LEN(ret);
|
755
|
+
beg = RETS(ts)->curr_ind - rtok_len;
|
756
|
+
end = RETS(ts)->curr_ind;
|
757
|
+
|
758
|
+
if (NIL_P(RETS(ts)->proc)) {
|
759
|
+
return tk_set(&(CachedTS(ts)->token), rs2s(ret), rtok_len,
|
760
|
+
beg, end, 1);
|
761
|
+
} else {
|
762
|
+
VALUE rtok;
|
763
|
+
rtok = rb_funcall(RETS(ts)->proc, id_call, 1, ret);
|
764
|
+
return tk_set(&(CachedTS(ts)->token), rs2s(rtok),
|
765
|
+
RSTRING_LEN(rtok), beg, end, 1);
|
766
|
+
}
|
767
|
+
}
|
768
|
+
|
769
|
+
#else
|
770
|
+
|
692
771
|
static Token *
|
693
772
|
rets_next(TokenStream *ts)
|
694
773
|
{
|
695
774
|
static struct re_registers regs;
|
696
775
|
int ret, beg, end;
|
697
|
-
|
776
|
+
long rtext_len = RSTRING_LEN(RETS(ts)->rtext);
|
777
|
+
char *rtext_ptr = RSTRING_PTR(RETS(ts)->rtext);
|
698
778
|
Check_Type(RETS(ts)->regex, T_REGEXP);
|
699
779
|
ret = ruby_re_search(RREGEXP(RETS(ts)->regex)->ptr,
|
700
|
-
|
701
|
-
RETS(ts)->curr_ind,
|
780
|
+
rtext_ptr, rtext_len,
|
781
|
+
RETS(ts)->curr_ind, rtext_len - RETS(ts)->curr_ind,
|
702
782
|
&regs);
|
703
783
|
|
704
784
|
if (ret == -2) rb_raise(rb_eStandardError, "regexp buffer overflow");
|
@@ -707,16 +787,18 @@ rets_next(TokenStream *ts)
|
|
707
787
|
beg = regs.beg[0];
|
708
788
|
RETS(ts)->curr_ind = end = regs.end[0];
|
709
789
|
if (NIL_P(RETS(ts)->proc)) {
|
710
|
-
return tk_set(&(CachedTS(ts)->token),
|
790
|
+
return tk_set(&(CachedTS(ts)->token), rtext_ptr + beg, end - beg,
|
711
791
|
beg, end, 1);
|
712
792
|
} else {
|
713
|
-
VALUE rtok = rb_str_new(
|
793
|
+
VALUE rtok = rb_str_new(rtext_ptr + beg, end - beg);
|
714
794
|
rtok = rb_funcall(RETS(ts)->proc, id_call, 1, rtok);
|
715
795
|
return tk_set(&(CachedTS(ts)->token), rs2s(rtok),
|
716
|
-
|
796
|
+
RSTRING_LEN(rtok), beg, end, 1);
|
717
797
|
}
|
718
798
|
}
|
719
799
|
|
800
|
+
#endif
|
801
|
+
|
720
802
|
static TokenStream *
|
721
803
|
rets_reset(TokenStream *ts, char *text)
|
722
804
|
{
|
@@ -770,7 +852,7 @@ rets_new(VALUE rtext, VALUE regex, VALUE proc)
|
|
770
852
|
* regexp:: regular expression used to recognize tokens in the input
|
771
853
|
*/
|
772
854
|
static VALUE
|
773
|
-
|
855
|
+
frb_rets_init(int argc, VALUE *argv, VALUE self)
|
774
856
|
{
|
775
857
|
VALUE rtext, regex, proc;
|
776
858
|
TokenStream *ts;
|
@@ -779,7 +861,7 @@ frt_rets_init(int argc, VALUE *argv, VALUE self)
|
|
779
861
|
|
780
862
|
ts = rets_new(rtext, regex, proc);
|
781
863
|
|
782
|
-
Frt_Wrap_Struct(self, &
|
864
|
+
Frt_Wrap_Struct(self, &frb_rets_mark, &frb_rets_free, ts);
|
783
865
|
object_add(ts, self);
|
784
866
|
return self;
|
785
867
|
}
|
@@ -801,7 +883,7 @@ lower = (argc ? RTEST(rlower) : dflt)
|
|
801
883
|
* Create a new AsciiLetterTokenizer
|
802
884
|
*/
|
803
885
|
static VALUE
|
804
|
-
|
886
|
+
frb_a_letter_tokenizer_init(VALUE self, VALUE rstr)
|
805
887
|
{
|
806
888
|
return get_wrapped_ts(self, rstr, letter_tokenizer_new());
|
807
889
|
}
|
@@ -816,11 +898,11 @@ frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
|
|
816
898
|
* lower:: set to false if you don't wish to downcase tokens
|
817
899
|
*/
|
818
900
|
static VALUE
|
819
|
-
|
901
|
+
frb_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
|
820
902
|
{
|
821
903
|
TS_ARGS(false);
|
822
904
|
#ifndef POSH_OS_WIN32
|
823
|
-
if (!
|
905
|
+
if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
|
824
906
|
#endif
|
825
907
|
return get_wrapped_ts(self, rstr, mb_letter_tokenizer_new(lower));
|
826
908
|
}
|
@@ -832,7 +914,7 @@ frt_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
|
|
832
914
|
* Create a new AsciiWhiteSpaceTokenizer
|
833
915
|
*/
|
834
916
|
static VALUE
|
835
|
-
|
917
|
+
frb_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
|
836
918
|
{
|
837
919
|
return get_wrapped_ts(self, rstr, whitespace_tokenizer_new());
|
838
920
|
}
|
@@ -847,11 +929,11 @@ frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
|
|
847
929
|
* lower:: set to false if you don't wish to downcase tokens
|
848
930
|
*/
|
849
931
|
static VALUE
|
850
|
-
|
932
|
+
frb_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
|
851
933
|
{
|
852
934
|
TS_ARGS(false);
|
853
935
|
#ifndef POSH_OS_WIN32
|
854
|
-
if (!
|
936
|
+
if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
|
855
937
|
#endif
|
856
938
|
return get_wrapped_ts(self, rstr, mb_whitespace_tokenizer_new(lower));
|
857
939
|
}
|
@@ -863,7 +945,7 @@ frt_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
|
|
863
945
|
* Create a new AsciiStandardTokenizer
|
864
946
|
*/
|
865
947
|
static VALUE
|
866
|
-
|
948
|
+
frb_a_standard_tokenizer_init(VALUE self, VALUE rstr)
|
867
949
|
{
|
868
950
|
return get_wrapped_ts(self, rstr, standard_tokenizer_new());
|
869
951
|
}
|
@@ -878,10 +960,10 @@ frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
|
|
878
960
|
* lower:: set to false if you don't wish to downcase tokens
|
879
961
|
*/
|
880
962
|
static VALUE
|
881
|
-
|
963
|
+
frb_standard_tokenizer_init(VALUE self, VALUE rstr)
|
882
964
|
{
|
883
965
|
#ifndef POSH_OS_WIN32
|
884
|
-
if (!
|
966
|
+
if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
|
885
967
|
#endif
|
886
968
|
return get_wrapped_ts(self, rstr, mb_standard_tokenizer_new());
|
887
969
|
}
|
@@ -900,13 +982,13 @@ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
|
|
900
982
|
* LowerCaseFilter.
|
901
983
|
*/
|
902
984
|
static VALUE
|
903
|
-
|
985
|
+
frb_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
|
904
986
|
{
|
905
|
-
TokenStream *ts =
|
987
|
+
TokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
|
906
988
|
ts = lowercase_filter_new(ts);
|
907
989
|
object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
|
908
990
|
|
909
|
-
Frt_Wrap_Struct(self, &
|
991
|
+
Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
|
910
992
|
object_add(ts, self);
|
911
993
|
return self;
|
912
994
|
}
|
@@ -919,16 +1001,16 @@ frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
|
|
919
1001
|
* lowercase based on the current locale.
|
920
1002
|
*/
|
921
1003
|
static VALUE
|
922
|
-
|
1004
|
+
frb_lowercase_filter_init(VALUE self, VALUE rsub_ts)
|
923
1005
|
{
|
924
|
-
TokenStream *ts =
|
1006
|
+
TokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
|
925
1007
|
#ifndef POSH_OS_WIN32
|
926
|
-
if (!
|
1008
|
+
if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
|
927
1009
|
#endif
|
928
1010
|
ts = mb_lowercase_filter_new(ts);
|
929
1011
|
object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
|
930
1012
|
|
931
|
-
Frt_Wrap_Struct(self, &
|
1013
|
+
Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
|
932
1014
|
object_add(ts, self);
|
933
1015
|
return self;
|
934
1016
|
}
|
@@ -944,13 +1026,13 @@ frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
|
|
944
1026
|
* used by default by the StandardAnalyzer.
|
945
1027
|
*/
|
946
1028
|
static VALUE
|
947
|
-
|
1029
|
+
frb_hyphen_filter_init(VALUE self, VALUE rsub_ts)
|
948
1030
|
{
|
949
|
-
TokenStream *ts =
|
1031
|
+
TokenStream *ts = frb_get_cwrapped_rts(rsub_ts);
|
950
1032
|
ts = hyphen_filter_new(ts);
|
951
1033
|
object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
|
952
1034
|
|
953
|
-
Frt_Wrap_Struct(self, &
|
1035
|
+
Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
|
954
1036
|
object_add(ts, self);
|
955
1037
|
return self;
|
956
1038
|
}
|
@@ -969,12 +1051,12 @@ frt_hyphen_filter_init(VALUE self, VALUE rsub_ts)
|
|
969
1051
|
* Ferret::Analysis contains a number of stop-word lists.
|
970
1052
|
*/
|
971
1053
|
static VALUE
|
972
|
-
|
1054
|
+
frb_stop_filter_init(int argc, VALUE *argv, VALUE self)
|
973
1055
|
{
|
974
1056
|
VALUE rsub_ts, rstop_words;
|
975
1057
|
TokenStream *ts;
|
976
1058
|
rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
|
977
|
-
ts =
|
1059
|
+
ts = frb_get_cwrapped_rts(rsub_ts);
|
978
1060
|
if (rstop_words != Qnil) {
|
979
1061
|
char **stop_words = get_stopwords(rstop_words);
|
980
1062
|
ts = stop_filter_new_with_words(ts, (const char **)stop_words);
|
@@ -985,12 +1067,13 @@ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
|
|
985
1067
|
}
|
986
1068
|
object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
|
987
1069
|
|
988
|
-
Frt_Wrap_Struct(self, &
|
1070
|
+
Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
|
989
1071
|
object_add(ts, self);
|
990
1072
|
return self;
|
991
1073
|
}
|
992
1074
|
|
993
|
-
static INLINE void
|
1075
|
+
static INLINE void frb_add_mapping_i(TokenStream *mf, VALUE from,
|
1076
|
+
const char *to)
|
994
1077
|
{
|
995
1078
|
switch (TYPE(from)) {
|
996
1079
|
case T_STRING:
|
@@ -1007,13 +1090,13 @@ static INLINE void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
|
|
1007
1090
|
}
|
1008
1091
|
}
|
1009
1092
|
|
1010
|
-
static int
|
1093
|
+
static int frb_add_mappings_i(VALUE key, VALUE value, VALUE arg)
|
1011
1094
|
{
|
1012
1095
|
if (key == Qundef) {
|
1013
1096
|
return ST_CONTINUE;
|
1014
1097
|
} else {
|
1015
1098
|
TokenStream *mf = (TokenStream *)arg;
|
1016
|
-
char *to;
|
1099
|
+
const char *to;
|
1017
1100
|
switch (TYPE(value)) {
|
1018
1101
|
case T_STRING:
|
1019
1102
|
to = rs2s(value);
|
@@ -1029,12 +1112,12 @@ static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
|
|
1029
1112
|
}
|
1030
1113
|
if (TYPE(key) == T_ARRAY) {
|
1031
1114
|
int i;
|
1032
|
-
for (i =
|
1033
|
-
|
1115
|
+
for (i = RARRAY_LEN(key) - 1; i >= 0; i--) {
|
1116
|
+
frb_add_mapping_i(mf, RARRAY_PTR(key)[i], to);
|
1034
1117
|
}
|
1035
1118
|
}
|
1036
1119
|
else {
|
1037
|
-
|
1120
|
+
frb_add_mapping_i(mf, key, to);
|
1038
1121
|
}
|
1039
1122
|
}
|
1040
1123
|
return ST_CONTINUE;
|
@@ -1066,16 +1149,16 @@ static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
|
|
1066
1149
|
* })
|
1067
1150
|
*/
|
1068
1151
|
static VALUE
|
1069
|
-
|
1152
|
+
frb_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
|
1070
1153
|
{
|
1071
1154
|
TokenStream *ts;
|
1072
|
-
ts =
|
1155
|
+
ts = frb_get_cwrapped_rts(rsub_ts);
|
1073
1156
|
ts = mapping_filter_new(ts);
|
1074
|
-
rb_hash_foreach(mapping,
|
1157
|
+
rb_hash_foreach(mapping, frb_add_mappings_i, (VALUE)ts);
|
1075
1158
|
mulmap_compile(((MappingFilter *)ts)->mapper);
|
1076
1159
|
object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
|
1077
1160
|
|
1078
|
-
Frt_Wrap_Struct(self, &
|
1161
|
+
Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
|
1079
1162
|
object_add(ts, self);
|
1080
1163
|
return self;
|
1081
1164
|
}
|
@@ -1096,14 +1179,14 @@ frt_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
|
|
1096
1179
|
* encoding:: The encoding of the data (default: "UTF-8")
|
1097
1180
|
*/
|
1098
1181
|
static VALUE
|
1099
|
-
|
1182
|
+
frb_stem_filter_init(int argc, VALUE *argv, VALUE self)
|
1100
1183
|
{
|
1101
1184
|
VALUE rsub_ts, ralgorithm, rcharenc;
|
1102
1185
|
char *algorithm = "english";
|
1103
1186
|
char *charenc = NULL;
|
1104
1187
|
TokenStream *ts;
|
1105
1188
|
rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
|
1106
|
-
ts =
|
1189
|
+
ts = frb_get_cwrapped_rts(rsub_ts);
|
1107
1190
|
switch (argc) {
|
1108
1191
|
case 3: charenc = rs2s(rb_obj_as_string(rcharenc));
|
1109
1192
|
case 2: algorithm = rs2s(rb_obj_as_string(ralgorithm));
|
@@ -1111,8 +1194,12 @@ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
|
|
1111
1194
|
ts = stem_filter_new(ts, algorithm, charenc);
|
1112
1195
|
object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
|
1113
1196
|
|
1114
|
-
Frt_Wrap_Struct(self, &
|
1197
|
+
Frt_Wrap_Struct(self, &frb_tf_mark, &frb_tf_free, ts);
|
1115
1198
|
object_add(ts, self);
|
1199
|
+
if (((StemFilter *)ts)->stemmer == NULL) {
|
1200
|
+
rb_raise(rb_eArgError, "No stemmer could be found with the encoding "
|
1201
|
+
"%s and the language %s", charenc, algorithm);
|
1202
|
+
}
|
1116
1203
|
return self;
|
1117
1204
|
}
|
1118
1205
|
|
@@ -1139,28 +1226,28 @@ static void
|
|
1139
1226
|
cwa_destroy_i(Analyzer *a)
|
1140
1227
|
{
|
1141
1228
|
rb_hash_delete(object_space, ((VALUE)a)|1);
|
1142
|
-
/*printf("rb_hash_size = %d\n",
|
1229
|
+
/*printf("rb_hash_size = %d\n", frb_rb_hash_size(object_space)); */
|
1143
1230
|
free(a);
|
1144
1231
|
}
|
1145
1232
|
|
1146
1233
|
static TokenStream *
|
1147
|
-
cwa_get_ts(Analyzer *a,
|
1234
|
+
cwa_get_ts(Analyzer *a, Symbol field, char *text)
|
1148
1235
|
{
|
1149
1236
|
VALUE rts = rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
|
1150
|
-
|
1151
|
-
return
|
1237
|
+
FSYM2SYM(field), rb_str_new2(text));
|
1238
|
+
return frb_get_cwrapped_rts(rts);
|
1152
1239
|
}
|
1153
1240
|
|
1154
1241
|
Analyzer *
|
1155
|
-
|
1242
|
+
frb_get_cwrapped_analyzer(VALUE ranalyzer)
|
1156
1243
|
{
|
1157
1244
|
Analyzer *a = NULL;
|
1158
|
-
if (
|
1245
|
+
if (frb_is_cclass(ranalyzer) && DATA_PTR(ranalyzer)) {
|
1159
1246
|
Data_Get_Struct(ranalyzer, Analyzer, a);
|
1160
1247
|
REF(a);
|
1161
1248
|
}
|
1162
1249
|
else {
|
1163
|
-
a = (Analyzer *)
|
1250
|
+
a = (Analyzer *)frt_ecalloc(sizeof(CWrappedAnalyzer));
|
1164
1251
|
a->destroy_i = &cwa_destroy_i;
|
1165
1252
|
a->get_ts = &cwa_get_ts;
|
1166
1253
|
a->ref_cnt = 1;
|
@@ -1172,20 +1259,20 @@ frt_get_cwrapped_analyzer(VALUE ranalyzer)
|
|
1172
1259
|
}
|
1173
1260
|
|
1174
1261
|
static void
|
1175
|
-
|
1262
|
+
frb_analyzer_free(Analyzer *a)
|
1176
1263
|
{
|
1177
1264
|
object_del(a);
|
1178
1265
|
a_deref(a);
|
1179
1266
|
}
|
1180
1267
|
|
1181
1268
|
VALUE
|
1182
|
-
|
1269
|
+
frb_get_analyzer(Analyzer *a)
|
1183
1270
|
{
|
1184
1271
|
VALUE self = Qnil;
|
1185
1272
|
if (a) {
|
1186
1273
|
self = object_get(a);
|
1187
1274
|
if (self == Qnil) {
|
1188
|
-
self = Data_Wrap_Struct(cAnalyzer, NULL, &
|
1275
|
+
self = Data_Wrap_Struct(cAnalyzer, NULL, &frb_analyzer_free, a);
|
1189
1276
|
REF(a);
|
1190
1277
|
object_add(a, self);
|
1191
1278
|
}
|
@@ -1196,7 +1283,7 @@ frt_get_analyzer(Analyzer *a)
|
|
1196
1283
|
INLINE VALUE
|
1197
1284
|
get_rb_ts_from_a(Analyzer *a, VALUE rfield, VALUE rstring)
|
1198
1285
|
{
|
1199
|
-
TokenStream *ts = a_get_ts(a,
|
1286
|
+
TokenStream *ts = a_get_ts(a, frb_field(rfield), rs2s(rstring));
|
1200
1287
|
|
1201
1288
|
/* Make sure that there is no entry already */
|
1202
1289
|
object_set(&ts->text, rstring);
|
@@ -1215,10 +1302,10 @@ get_rb_ts_from_a(Analyzer *a, VALUE rfield, VALUE rstring)
|
|
1215
1302
|
* input:: data from the field to be tokenized
|
1216
1303
|
*/
|
1217
1304
|
static VALUE
|
1218
|
-
|
1305
|
+
frb_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
|
1219
1306
|
{
|
1220
1307
|
/* NOTE: Any changes made to this method may also need to be applied to
|
1221
|
-
*
|
1308
|
+
* frb_re_analyzer_token_stream */
|
1222
1309
|
Analyzer *a;
|
1223
1310
|
GET_A(a, self);
|
1224
1311
|
|
@@ -1244,12 +1331,12 @@ lower = (argc ? RTEST(rlower) : dflt)
|
|
1244
1331
|
* lower:: set to false if you don't want the field's tokens to be downcased
|
1245
1332
|
*/
|
1246
1333
|
static VALUE
|
1247
|
-
|
1334
|
+
frb_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
1248
1335
|
{
|
1249
1336
|
Analyzer *a;
|
1250
1337
|
GET_LOWER(false);
|
1251
1338
|
a = whitespace_analyzer_new(lower);
|
1252
|
-
Frt_Wrap_Struct(self, NULL, &
|
1339
|
+
Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
|
1253
1340
|
object_add(a, self);
|
1254
1341
|
return self;
|
1255
1342
|
}
|
@@ -1265,15 +1352,15 @@ frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1265
1352
|
* lower:: set to false if you don't want the field's tokens to be downcased
|
1266
1353
|
*/
|
1267
1354
|
static VALUE
|
1268
|
-
|
1355
|
+
frb_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
1269
1356
|
{
|
1270
1357
|
Analyzer *a;
|
1271
1358
|
GET_LOWER(false);
|
1272
1359
|
#ifndef POSH_OS_WIN32
|
1273
|
-
if (!
|
1360
|
+
if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
|
1274
1361
|
#endif
|
1275
1362
|
a = mb_whitespace_analyzer_new(lower);
|
1276
|
-
Frt_Wrap_Struct(self, NULL, &
|
1363
|
+
Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
|
1277
1364
|
object_add(a, self);
|
1278
1365
|
return self;
|
1279
1366
|
}
|
@@ -1289,12 +1376,12 @@ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1289
1376
|
* lower:: set to false if you don't want the field's tokens to be downcased
|
1290
1377
|
*/
|
1291
1378
|
static VALUE
|
1292
|
-
|
1379
|
+
frb_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
|
1293
1380
|
{
|
1294
1381
|
Analyzer *a;
|
1295
1382
|
GET_LOWER(true);
|
1296
1383
|
a = letter_analyzer_new(lower);
|
1297
|
-
Frt_Wrap_Struct(self, NULL, &
|
1384
|
+
Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
|
1298
1385
|
object_add(a, self);
|
1299
1386
|
return self;
|
1300
1387
|
}
|
@@ -1310,15 +1397,15 @@ frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1310
1397
|
* lower:: set to false if you don't want the field's tokens to be downcased
|
1311
1398
|
*/
|
1312
1399
|
static VALUE
|
1313
|
-
|
1400
|
+
frb_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
|
1314
1401
|
{
|
1315
1402
|
Analyzer *a;
|
1316
1403
|
GET_LOWER(true);
|
1317
1404
|
#ifndef POSH_OS_WIN32
|
1318
|
-
if (!
|
1405
|
+
if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
|
1319
1406
|
#endif
|
1320
1407
|
a = mb_letter_analyzer_new(lower);
|
1321
|
-
Frt_Wrap_Struct(self, NULL, &
|
1408
|
+
Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
|
1322
1409
|
object_add(a, self);
|
1323
1410
|
return self;
|
1324
1411
|
}
|
@@ -1350,7 +1437,7 @@ get_rstopwords(const char **stop_words)
|
|
1350
1437
|
* stop_words:: list of stop-words to pass to the StopFilter
|
1351
1438
|
*/
|
1352
1439
|
static VALUE
|
1353
|
-
|
1440
|
+
frb_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
1354
1441
|
{
|
1355
1442
|
bool lower;
|
1356
1443
|
VALUE rlower, rstop_words;
|
@@ -1364,7 +1451,7 @@ frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1364
1451
|
} else {
|
1365
1452
|
a = standard_analyzer_new(lower);
|
1366
1453
|
}
|
1367
|
-
Frt_Wrap_Struct(self, NULL, &
|
1454
|
+
Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
|
1368
1455
|
object_add(a, self);
|
1369
1456
|
return self;
|
1370
1457
|
}
|
@@ -1383,13 +1470,13 @@ frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1383
1470
|
* stop_words:: list of stop-words to pass to the StopFilter
|
1384
1471
|
*/
|
1385
1472
|
static VALUE
|
1386
|
-
|
1473
|
+
frb_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
1387
1474
|
{
|
1388
1475
|
bool lower;
|
1389
1476
|
VALUE rlower, rstop_words;
|
1390
1477
|
Analyzer *a;
|
1391
1478
|
#ifndef POSH_OS_WIN32
|
1392
|
-
if (!
|
1479
|
+
if (!frb_locale) frb_locale = setlocale(LC_CTYPE, "");
|
1393
1480
|
#endif
|
1394
1481
|
rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
|
1395
1482
|
lower = ((rlower == Qnil) ? true : RTEST(rlower));
|
@@ -1400,22 +1487,22 @@ frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1400
1487
|
} else {
|
1401
1488
|
a = mb_standard_analyzer_new(lower);
|
1402
1489
|
}
|
1403
|
-
Frt_Wrap_Struct(self, NULL, &
|
1490
|
+
Frt_Wrap_Struct(self, NULL, &frb_analyzer_free, a);
|
1404
1491
|
object_add(a, self);
|
1405
1492
|
return self;
|
1406
1493
|
}
|
1407
1494
|
|
1408
1495
|
static void
|
1409
|
-
|
1496
|
+
frb_h_mark_values_i(void *key, void *value, void *arg)
|
1410
1497
|
{
|
1411
|
-
|
1498
|
+
frb_gc_mark(value);
|
1412
1499
|
}
|
1413
1500
|
|
1414
1501
|
static void
|
1415
|
-
|
1502
|
+
frb_pfa_mark(void *p)
|
1416
1503
|
{
|
1417
|
-
|
1418
|
-
h_each(PFA(p)->dict, &
|
1504
|
+
frb_gc_mark(PFA(p)->default_a);
|
1505
|
+
h_each(PFA(p)->dict, &frb_h_mark_values_i, NULL);
|
1419
1506
|
}
|
1420
1507
|
|
1421
1508
|
/*** PerFieldAnalyzer ***/
|
@@ -1431,11 +1518,11 @@ frt_pfa_mark(void *p)
|
|
1431
1518
|
* specified
|
1432
1519
|
*/
|
1433
1520
|
static VALUE
|
1434
|
-
|
1521
|
+
frb_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
|
1435
1522
|
{
|
1436
|
-
Analyzer *def =
|
1523
|
+
Analyzer *def = frb_get_cwrapped_analyzer(ranalyzer);
|
1437
1524
|
Analyzer *a = per_field_analyzer_new(def);
|
1438
|
-
Frt_Wrap_Struct(self, &
|
1525
|
+
Frt_Wrap_Struct(self, &frb_pfa_mark, &frb_analyzer_free, a);
|
1439
1526
|
object_add(a, self);
|
1440
1527
|
return self;
|
1441
1528
|
}
|
@@ -1452,13 +1539,13 @@ frt_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
|
|
1452
1539
|
* analyzer:: analyzer to be used on +field_name+
|
1453
1540
|
*/
|
1454
1541
|
static VALUE
|
1455
|
-
|
1542
|
+
frb_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
|
1456
1543
|
{
|
1457
1544
|
Analyzer *pfa, *a;
|
1458
1545
|
Data_Get_Struct(self, Analyzer, pfa);
|
1459
|
-
a =
|
1546
|
+
a = frb_get_cwrapped_analyzer(ranalyzer);
|
1460
1547
|
|
1461
|
-
pfa_add_field(pfa,
|
1548
|
+
pfa_add_field(pfa, frb_field(rfield), a);
|
1462
1549
|
return self;
|
1463
1550
|
}
|
1464
1551
|
|
@@ -1473,10 +1560,10 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
|
|
1473
1560
|
* input:: data from the field to be tokenized
|
1474
1561
|
*/
|
1475
1562
|
static VALUE
|
1476
|
-
|
1563
|
+
frb_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
|
1477
1564
|
{
|
1478
1565
|
Analyzer *pfa, *a;
|
1479
|
-
|
1566
|
+
Symbol field = frb_field(rfield);
|
1480
1567
|
GET_A(pfa, self);
|
1481
1568
|
|
1482
1569
|
StringValue(rstring);
|
@@ -1486,7 +1573,7 @@ frt_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
|
|
1486
1573
|
}
|
1487
1574
|
if (a->get_ts == cwa_get_ts) {
|
1488
1575
|
return rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
|
1489
|
-
|
1576
|
+
FSYM2SYM(field), rb_str_new2(rs2s(rstring)));
|
1490
1577
|
}
|
1491
1578
|
else {
|
1492
1579
|
return get_rb_ts_from_a(a, rfield, rstring);
|
@@ -1496,9 +1583,9 @@ frt_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
|
|
1496
1583
|
/*** RegExpAnalyzer ***/
|
1497
1584
|
|
1498
1585
|
static void
|
1499
|
-
|
1586
|
+
frb_re_analyzer_mark(Analyzer *a)
|
1500
1587
|
{
|
1501
|
-
|
1588
|
+
frb_gc_mark(a->current_ts);
|
1502
1589
|
}
|
1503
1590
|
|
1504
1591
|
static void
|
@@ -1519,7 +1606,7 @@ re_analyzer_destroy_i(Analyzer *a)
|
|
1519
1606
|
* lower:: set to false if you don't want to downcase the tokens
|
1520
1607
|
*/
|
1521
1608
|
static VALUE
|
1522
|
-
|
1609
|
+
frb_re_analyzer_init(int argc, VALUE *argv, VALUE self)
|
1523
1610
|
{
|
1524
1611
|
VALUE lower, rets, regex, proc;
|
1525
1612
|
Analyzer *a;
|
@@ -1527,17 +1614,17 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1527
1614
|
rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
|
1528
1615
|
|
1529
1616
|
ts = rets_new(Qnil, regex, proc);
|
1530
|
-
rets = Data_Wrap_Struct(cRegExpTokenizer, &
|
1617
|
+
rets = Data_Wrap_Struct(cRegExpTokenizer, &frb_rets_mark, &frb_rets_free, ts);
|
1531
1618
|
object_add(ts, rets);
|
1532
1619
|
|
1533
1620
|
if (lower != Qfalse) {
|
1534
|
-
rets =
|
1621
|
+
rets = frb_lowercase_filter_init(frb_data_alloc(cLowerCaseFilter), rets);
|
1535
1622
|
ts = DATA_PTR(rets);
|
1536
1623
|
}
|
1537
1624
|
REF(ts);
|
1538
1625
|
|
1539
1626
|
a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
|
1540
|
-
Frt_Wrap_Struct(self, &
|
1627
|
+
Frt_Wrap_Struct(self, &frb_re_analyzer_mark, &frb_analyzer_free, a);
|
1541
1628
|
object_add(a, self);
|
1542
1629
|
return self;
|
1543
1630
|
}
|
@@ -1554,7 +1641,7 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1554
1641
|
* input:: data from the field to be tokenized
|
1555
1642
|
*/
|
1556
1643
|
static VALUE
|
1557
|
-
|
1644
|
+
frb_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
|
1558
1645
|
{
|
1559
1646
|
TokenStream *ts;
|
1560
1647
|
Analyzer *a;
|
@@ -1562,7 +1649,7 @@ frt_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
|
|
1562
1649
|
|
1563
1650
|
StringValue(rtext);
|
1564
1651
|
|
1565
|
-
ts = a_get_ts(a,
|
1652
|
+
ts = a_get_ts(a, frb_field(rfield), rs2s(rtext));
|
1566
1653
|
|
1567
1654
|
/* Make sure that there is no entry already */
|
1568
1655
|
object_set(&ts->text, rtext);
|
@@ -1591,9 +1678,9 @@ frt_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
|
|
1591
1678
|
*
|
1592
1679
|
* puts Ferret.locale #=> "en_US.UTF-8"
|
1593
1680
|
*/
|
1594
|
-
static VALUE
|
1681
|
+
static VALUE frb_get_locale(VALUE self, VALUE locale)
|
1595
1682
|
{
|
1596
|
-
return (
|
1683
|
+
return (frb_locale ? rb_str_new2(frb_locale) : Qnil);
|
1597
1684
|
}
|
1598
1685
|
|
1599
1686
|
/*
|
@@ -1603,11 +1690,11 @@ static VALUE frt_get_locale(VALUE self, VALUE locale)
|
|
1603
1690
|
* Set the global locale. You should use this method to set different locales
|
1604
1691
|
* when indexing documents with different encodings.
|
1605
1692
|
*/
|
1606
|
-
static VALUE
|
1693
|
+
static VALUE frb_set_locale(VALUE self, VALUE locale)
|
1607
1694
|
{
|
1608
1695
|
char *l = ((locale == Qnil) ? NULL : rs2s(rb_obj_as_string(locale)));
|
1609
|
-
|
1610
|
-
return
|
1696
|
+
frb_locale = setlocale(LC_CTYPE, l);
|
1697
|
+
return frb_locale ? rb_str_new2(frb_locale) : Qnil;
|
1611
1698
|
}
|
1612
1699
|
|
1613
1700
|
/****************************************************************************
|
@@ -1645,25 +1732,27 @@ static VALUE frt_set_locale(VALUE self, VALUE locale)
|
|
1645
1732
|
static void Init_Token(void)
|
1646
1733
|
{
|
1647
1734
|
cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
|
1648
|
-
rb_define_alloc_func(cToken,
|
1735
|
+
rb_define_alloc_func(cToken, frb_token_alloc);
|
1649
1736
|
rb_include_module(cToken, rb_mComparable);
|
1650
1737
|
|
1651
|
-
rb_define_method(cToken, "initialize",
|
1652
|
-
rb_define_method(cToken, "<=>",
|
1653
|
-
rb_define_method(cToken, "text",
|
1654
|
-
rb_define_method(cToken, "text=",
|
1655
|
-
rb_define_method(cToken, "start",
|
1656
|
-
rb_define_method(cToken, "start=",
|
1657
|
-
rb_define_method(cToken, "end",
|
1658
|
-
rb_define_method(cToken, "end=",
|
1659
|
-
rb_define_method(cToken, "pos_inc",
|
1660
|
-
rb_define_method(cToken, "pos_inc=",
|
1661
|
-
rb_define_method(cToken, "to_s",
|
1738
|
+
rb_define_method(cToken, "initialize", frb_token_init, -1);
|
1739
|
+
rb_define_method(cToken, "<=>", frb_token_cmp, 1);
|
1740
|
+
rb_define_method(cToken, "text", frb_token_get_text, 0);
|
1741
|
+
rb_define_method(cToken, "text=", frb_token_set_text, 1);
|
1742
|
+
rb_define_method(cToken, "start", frb_token_get_start_offset, 0);
|
1743
|
+
rb_define_method(cToken, "start=", frb_token_set_start_offset, 1);
|
1744
|
+
rb_define_method(cToken, "end", frb_token_get_end_offset, 0);
|
1745
|
+
rb_define_method(cToken, "end=", frb_token_set_end_offset, 1);
|
1746
|
+
rb_define_method(cToken, "pos_inc", frb_token_get_pos_inc, 0);
|
1747
|
+
rb_define_method(cToken, "pos_inc=", frb_token_set_pos_inc, 1);
|
1748
|
+
rb_define_method(cToken, "to_s", frb_token_to_s, 0);
|
1662
1749
|
}
|
1663
1750
|
|
1664
1751
|
/*
|
1665
1752
|
* Document-class: Ferret::Analysis::TokenStream
|
1666
1753
|
*
|
1754
|
+
* == Summary
|
1755
|
+
*
|
1667
1756
|
* A TokenStream enumerates the sequence of tokens, either from
|
1668
1757
|
* fields of a document or from query text.
|
1669
1758
|
*
|
@@ -1675,15 +1764,17 @@ static void Init_Token(void)
|
|
1675
1764
|
static void Init_TokenStream(void)
|
1676
1765
|
{
|
1677
1766
|
cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
|
1678
|
-
|
1679
|
-
rb_define_method(cTokenStream, "next",
|
1680
|
-
rb_define_method(cTokenStream, "text=",
|
1681
|
-
rb_define_method(cTokenStream, "text",
|
1767
|
+
frb_mark_cclass(cTokenStream);
|
1768
|
+
rb_define_method(cTokenStream, "next", frb_ts_next, 0);
|
1769
|
+
rb_define_method(cTokenStream, "text=", frb_ts_set_text, 1);
|
1770
|
+
rb_define_method(cTokenStream, "text", frb_ts_get_text, 0);
|
1682
1771
|
}
|
1683
1772
|
|
1684
1773
|
/*
|
1685
1774
|
* Document-class: Ferret::Analysis::AsciiLetterTokenizer
|
1686
1775
|
*
|
1776
|
+
* == Summary
|
1777
|
+
*
|
1687
1778
|
* A LetterTokenizer is a tokenizer that divides text at non-ASCII letters.
|
1688
1779
|
* That is to say, it defines tokens as maximal strings of adjacent letters,
|
1689
1780
|
* as defined by the regular expression _/[A-Za-z]+/_.
|
@@ -1697,15 +1788,17 @@ static void Init_AsciiLetterTokenizer(void)
|
|
1697
1788
|
{
|
1698
1789
|
cAsciiLetterTokenizer =
|
1699
1790
|
rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
|
1700
|
-
|
1701
|
-
rb_define_alloc_func(cAsciiLetterTokenizer,
|
1791
|
+
frb_mark_cclass(cAsciiLetterTokenizer);
|
1792
|
+
rb_define_alloc_func(cAsciiLetterTokenizer, frb_data_alloc);
|
1702
1793
|
rb_define_method(cAsciiLetterTokenizer, "initialize",
|
1703
|
-
|
1794
|
+
frb_a_letter_tokenizer_init, 1);
|
1704
1795
|
}
|
1705
1796
|
|
1706
1797
|
/*
|
1707
1798
|
* Document-class: Ferret::Analysis::LetterTokenizer
|
1708
1799
|
*
|
1800
|
+
* == Summary
|
1801
|
+
*
|
1709
1802
|
* A LetterTokenizer is a tokenizer that divides text at non-letters. That is
|
1710
1803
|
* to say, it defines tokens as maximal strings of adjacent letters, as
|
1711
1804
|
* defined by the regular expression _/[[:alpha:]]+/_ where [:alpha] matches
|
@@ -1720,15 +1813,17 @@ static void Init_LetterTokenizer(void)
|
|
1720
1813
|
{
|
1721
1814
|
cLetterTokenizer =
|
1722
1815
|
rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
|
1723
|
-
|
1724
|
-
rb_define_alloc_func(cLetterTokenizer,
|
1816
|
+
frb_mark_cclass(cLetterTokenizer);
|
1817
|
+
rb_define_alloc_func(cLetterTokenizer, frb_data_alloc);
|
1725
1818
|
rb_define_method(cLetterTokenizer, "initialize",
|
1726
|
-
|
1819
|
+
frb_letter_tokenizer_init, -1);
|
1727
1820
|
}
|
1728
1821
|
|
1729
1822
|
/*
|
1730
1823
|
* Document-class: Ferret::Analysis::AsciiWhiteSpaceTokenizer
|
1731
1824
|
*
|
1825
|
+
* == Summary
|
1826
|
+
*
|
1732
1827
|
* A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
|
1733
1828
|
* Adjacent sequences of non-WhiteSpace characters form tokens.
|
1734
1829
|
*
|
@@ -1742,15 +1837,17 @@ static void Init_AsciiWhiteSpaceTokenizer(void)
|
|
1742
1837
|
cAsciiWhiteSpaceTokenizer =
|
1743
1838
|
rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
|
1744
1839
|
cTokenStream);
|
1745
|
-
|
1746
|
-
rb_define_alloc_func(cAsciiWhiteSpaceTokenizer,
|
1840
|
+
frb_mark_cclass(cAsciiWhiteSpaceTokenizer);
|
1841
|
+
rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frb_data_alloc);
|
1747
1842
|
rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
|
1748
|
-
|
1843
|
+
frb_a_whitespace_tokenizer_init, 1);
|
1749
1844
|
}
|
1750
1845
|
|
1751
1846
|
/*
|
1752
1847
|
* Document-class: Ferret::Analysis::WhiteSpaceTokenizer
|
1753
1848
|
*
|
1849
|
+
* == Summary
|
1850
|
+
*
|
1754
1851
|
* A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
|
1755
1852
|
* Adjacent sequences of non-WhiteSpace characters form tokens.
|
1756
1853
|
*
|
@@ -1763,15 +1860,17 @@ static void Init_WhiteSpaceTokenizer(void)
|
|
1763
1860
|
{
|
1764
1861
|
cWhiteSpaceTokenizer =
|
1765
1862
|
rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
|
1766
|
-
|
1767
|
-
rb_define_alloc_func(cWhiteSpaceTokenizer,
|
1863
|
+
frb_mark_cclass(cWhiteSpaceTokenizer);
|
1864
|
+
rb_define_alloc_func(cWhiteSpaceTokenizer, frb_data_alloc);
|
1768
1865
|
rb_define_method(cWhiteSpaceTokenizer, "initialize",
|
1769
|
-
|
1866
|
+
frb_whitespace_tokenizer_init, -1);
|
1770
1867
|
}
|
1771
1868
|
|
1772
1869
|
/*
|
1773
1870
|
* Document-class: Ferret::Analysis::AsciiStandardTokenizer
|
1774
1871
|
*
|
1872
|
+
* == Summary
|
1873
|
+
*
|
1775
1874
|
* The standard tokenizer is an advanced tokenizer which tokenizes most
|
1776
1875
|
* words correctly as well as tokenizing things like email addresses, web
|
1777
1876
|
* addresses, phone numbers, etc.
|
@@ -1785,15 +1884,17 @@ static void Init_AsciiStandardTokenizer(void)
|
|
1785
1884
|
{
|
1786
1885
|
cAsciiStandardTokenizer =
|
1787
1886
|
rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
|
1788
|
-
|
1789
|
-
rb_define_alloc_func(cAsciiStandardTokenizer,
|
1887
|
+
frb_mark_cclass(cAsciiStandardTokenizer);
|
1888
|
+
rb_define_alloc_func(cAsciiStandardTokenizer, frb_data_alloc);
|
1790
1889
|
rb_define_method(cAsciiStandardTokenizer, "initialize",
|
1791
|
-
|
1890
|
+
frb_a_standard_tokenizer_init, 1);
|
1792
1891
|
}
|
1793
1892
|
|
1794
1893
|
/*
|
1795
1894
|
* Document-class: Ferret::Analysis::StandardTokenizer
|
1796
1895
|
*
|
1896
|
+
* == Summary
|
1897
|
+
*
|
1797
1898
|
* The standard tokenizer is an advanced tokenizer which tokenizes most
|
1798
1899
|
* words correctly as well as tokenizing things like email addresses, web
|
1799
1900
|
* addresses, phone numbers, etc.
|
@@ -1807,15 +1908,17 @@ static void Init_StandardTokenizer(void)
|
|
1807
1908
|
{
|
1808
1909
|
cStandardTokenizer =
|
1809
1910
|
rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
|
1810
|
-
|
1811
|
-
rb_define_alloc_func(cStandardTokenizer,
|
1911
|
+
frb_mark_cclass(cStandardTokenizer);
|
1912
|
+
rb_define_alloc_func(cStandardTokenizer, frb_data_alloc);
|
1812
1913
|
rb_define_method(cStandardTokenizer, "initialize",
|
1813
|
-
|
1914
|
+
frb_standard_tokenizer_init, 1);
|
1814
1915
|
}
|
1815
1916
|
|
1816
1917
|
/*
|
1817
1918
|
* Document-class: Ferret::Analysis::RegExpTokenizer
|
1818
1919
|
*
|
1920
|
+
* == Summary
|
1921
|
+
*
|
1819
1922
|
* A tokenizer that recognizes tokens based on a regular expression passed to
|
1820
1923
|
* the constructor. Most possible tokenizers can be created using this class.
|
1821
1924
|
*
|
@@ -1835,14 +1938,14 @@ static void Init_RegExpTokenizer(void)
|
|
1835
1938
|
{
|
1836
1939
|
cRegExpTokenizer =
|
1837
1940
|
rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
|
1838
|
-
|
1941
|
+
frb_mark_cclass(cRegExpTokenizer);
|
1839
1942
|
rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
|
1840
1943
|
rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
|
1841
|
-
rb_define_alloc_func(cRegExpTokenizer,
|
1944
|
+
rb_define_alloc_func(cRegExpTokenizer, frb_data_alloc);
|
1842
1945
|
rb_define_method(cRegExpTokenizer, "initialize",
|
1843
|
-
|
1844
|
-
rb_define_method(cRegExpTokenizer, "text=",
|
1845
|
-
rb_define_method(cRegExpTokenizer, "text",
|
1946
|
+
frb_rets_init, -1);
|
1947
|
+
rb_define_method(cRegExpTokenizer, "text=", frb_rets_set_text, 1);
|
1948
|
+
rb_define_method(cRegExpTokenizer, "text", frb_rets_get_text, 0);
|
1846
1949
|
}
|
1847
1950
|
|
1848
1951
|
/***************/
|
@@ -1852,6 +1955,8 @@ static void Init_RegExpTokenizer(void)
|
|
1852
1955
|
/*
|
1853
1956
|
* Document-class: Ferret::Analysis::AsciiLowerCaseFilter
|
1854
1957
|
*
|
1958
|
+
* == Summary
|
1959
|
+
*
|
1855
1960
|
* AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
|
1856
1961
|
* ASCII characters. For other characters use LowerCaseFilter.
|
1857
1962
|
*
|
@@ -1864,15 +1969,17 @@ static void Init_AsciiLowerCaseFilter(void)
|
|
1864
1969
|
{
|
1865
1970
|
cAsciiLowerCaseFilter =
|
1866
1971
|
rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
|
1867
|
-
|
1868
|
-
rb_define_alloc_func(cAsciiLowerCaseFilter,
|
1972
|
+
frb_mark_cclass(cAsciiLowerCaseFilter);
|
1973
|
+
rb_define_alloc_func(cAsciiLowerCaseFilter, frb_data_alloc);
|
1869
1974
|
rb_define_method(cAsciiLowerCaseFilter, "initialize",
|
1870
|
-
|
1975
|
+
frb_a_lowercase_filter_init, 1);
|
1871
1976
|
}
|
1872
1977
|
|
1873
1978
|
/*
|
1874
1979
|
* Document-class: Ferret::Analysis::LowerCaseFilter
|
1875
1980
|
*
|
1981
|
+
* == Summary
|
1982
|
+
*
|
1876
1983
|
* LowerCaseFilter normalizes a token's text to lowercase based on the
|
1877
1984
|
* current locale.
|
1878
1985
|
*
|
@@ -1885,15 +1992,17 @@ static void Init_LowerCaseFilter(void)
|
|
1885
1992
|
{
|
1886
1993
|
cLowerCaseFilter =
|
1887
1994
|
rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
|
1888
|
-
|
1889
|
-
rb_define_alloc_func(cLowerCaseFilter,
|
1995
|
+
frb_mark_cclass(cLowerCaseFilter);
|
1996
|
+
rb_define_alloc_func(cLowerCaseFilter, frb_data_alloc);
|
1890
1997
|
rb_define_method(cLowerCaseFilter, "initialize",
|
1891
|
-
|
1998
|
+
frb_lowercase_filter_init, 1);
|
1892
1999
|
}
|
1893
2000
|
|
1894
2001
|
/*
|
1895
2002
|
* Document-class: Ferret::Analysis::HyphenFilter
|
1896
2003
|
*
|
2004
|
+
* == Summary
|
2005
|
+
*
|
1897
2006
|
* HyphenFilter filters hyphenated words by adding both the word concatenated
|
1898
2007
|
* into a single word and split into multiple words. i.e. "e-mail" becomes
|
1899
2008
|
* "email" and "e mail". This way a search for "e-mail", "email" and "mail"
|
@@ -1908,14 +2017,16 @@ static void Init_HyphenFilter(void)
|
|
1908
2017
|
{
|
1909
2018
|
cHyphenFilter =
|
1910
2019
|
rb_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
|
1911
|
-
|
1912
|
-
rb_define_alloc_func(cHyphenFilter,
|
1913
|
-
rb_define_method(cHyphenFilter, "initialize",
|
2020
|
+
frb_mark_cclass(cHyphenFilter);
|
2021
|
+
rb_define_alloc_func(cHyphenFilter, frb_data_alloc);
|
2022
|
+
rb_define_method(cHyphenFilter, "initialize", frb_hyphen_filter_init, 1);
|
1914
2023
|
}
|
1915
2024
|
|
1916
2025
|
/*
|
1917
2026
|
* Document-class: Ferret::Analysis::MappingFilter
|
1918
2027
|
*
|
2028
|
+
* == Summary
|
2029
|
+
*
|
1919
2030
|
* A MappingFilter maps strings in tokens. This is usually used to map UTF-8
|
1920
2031
|
* characters to ASCII characters for easier searching and better search
|
1921
2032
|
* recall. The mapping is compiled into a Deterministic Finite Automaton so it
|
@@ -1956,15 +2067,17 @@ static void Init_MappingFilter(void)
|
|
1956
2067
|
{
|
1957
2068
|
cMappingFilter =
|
1958
2069
|
rb_define_class_under(mAnalysis, "MappingFilter", cTokenStream);
|
1959
|
-
|
1960
|
-
rb_define_alloc_func(cMappingFilter,
|
2070
|
+
frb_mark_cclass(cMappingFilter);
|
2071
|
+
rb_define_alloc_func(cMappingFilter, frb_data_alloc);
|
1961
2072
|
rb_define_method(cMappingFilter, "initialize",
|
1962
|
-
|
2073
|
+
frb_mapping_filter_init, 2);
|
1963
2074
|
}
|
1964
2075
|
|
1965
2076
|
/*
|
1966
2077
|
* Document-class: Ferret::Analysis::StopFilter
|
1967
2078
|
*
|
2079
|
+
* == Summary
|
2080
|
+
*
|
1968
2081
|
* A StopFilter filters *stop-words* from a TokenStream. Stop-words are words
|
1969
2082
|
* that you don't wish to be indexed. Usually they will be common words like
|
1970
2083
|
* "the" and "and" although you can specify whichever words you want.
|
@@ -1977,10 +2090,10 @@ static void Init_StopFilter(void)
|
|
1977
2090
|
{
|
1978
2091
|
cStopFilter =
|
1979
2092
|
rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
|
1980
|
-
|
1981
|
-
rb_define_alloc_func(cStopFilter,
|
2093
|
+
frb_mark_cclass(cStopFilter);
|
2094
|
+
rb_define_alloc_func(cStopFilter, frb_data_alloc);
|
1982
2095
|
rb_define_method(cStopFilter, "initialize",
|
1983
|
-
|
2096
|
+
frb_stop_filter_init, -1);
|
1984
2097
|
}
|
1985
2098
|
|
1986
2099
|
/*
|
@@ -2004,13 +2117,25 @@ static void Init_StopFilter(void)
|
|
2004
2117
|
* "finnish", | "fi", "fin" | "ISO_8859_1", "UTF_8"
|
2005
2118
|
* "french", | "fr", "fra", "fre" | "ISO_8859_1", "UTF_8"
|
2006
2119
|
* "german", | "de", "deu", "ge", "ger" | "ISO_8859_1", "UTF_8"
|
2120
|
+
* "hungarian", | "hu", "hun" | "ISO_8859_1", "UTF_8"
|
2007
2121
|
* "italian", | "it", "ita" | "ISO_8859_1", "UTF_8"
|
2008
2122
|
* "norwegian", | "nl", "no" | "ISO_8859_1", "UTF_8"
|
2009
2123
|
* "porter", | | "ISO_8859_1", "UTF_8"
|
2010
2124
|
* "portuguese", | "por", "pt" | "ISO_8859_1", "UTF_8"
|
2125
|
+
* "romanian", | "ro", "ron", "rum" | "ISO_8859_2", "UTF_8"
|
2011
2126
|
* "russian", | "ru", "rus" | "KOI8_R", "UTF_8"
|
2012
2127
|
* "spanish", | "es", "esl" | "ISO_8859_1", "UTF_8"
|
2013
2128
|
* "swedish", | "sv", "swe" | "ISO_8859_1", "UTF_8"
|
2129
|
+
* "turkish", | "tr", "tur" | "UTF_8"
|
2130
|
+
*
|
2131
|
+
*
|
2132
|
+
* === New Stemmers
|
2133
|
+
*
|
2134
|
+
* The following stemmers have recently been added. Please try them out:
|
2135
|
+
*
|
2136
|
+
* * Hungarian
|
2137
|
+
* * Romanian
|
2138
|
+
* * Turkish
|
2014
2139
|
*
|
2015
2140
|
* === Example
|
2016
2141
|
*
|
@@ -2037,10 +2162,10 @@ static void Init_StemFilter(void)
|
|
2037
2162
|
{
|
2038
2163
|
cStemFilter =
|
2039
2164
|
rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
|
2040
|
-
|
2041
|
-
rb_define_alloc_func(cStemFilter,
|
2165
|
+
frb_mark_cclass(cStemFilter);
|
2166
|
+
rb_define_alloc_func(cStemFilter, frb_data_alloc);
|
2042
2167
|
rb_define_method(cStemFilter, "initialize",
|
2043
|
-
|
2168
|
+
frb_stem_filter_init, -1);
|
2044
2169
|
}
|
2045
2170
|
|
2046
2171
|
/*************************/
|
@@ -2081,10 +2206,10 @@ static void Init_Analyzer(void)
|
|
2081
2206
|
{
|
2082
2207
|
cAnalyzer =
|
2083
2208
|
rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
|
2084
|
-
|
2085
|
-
rb_define_alloc_func(cAnalyzer,
|
2086
|
-
rb_define_method(cAnalyzer, "initialize",
|
2087
|
-
rb_define_method(cAnalyzer, "token_stream",
|
2209
|
+
frb_mark_cclass(cAnalyzer);
|
2210
|
+
rb_define_alloc_func(cAnalyzer, frb_data_alloc);
|
2211
|
+
rb_define_method(cAnalyzer, "initialize", frb_letter_analyzer_init, -1);
|
2212
|
+
rb_define_method(cAnalyzer, "token_stream", frb_analyzer_token_stream, 2);
|
2088
2213
|
}
|
2089
2214
|
|
2090
2215
|
/*
|
@@ -2119,10 +2244,10 @@ static void Init_AsciiLetterAnalyzer(void)
|
|
2119
2244
|
{
|
2120
2245
|
cAsciiLetterAnalyzer =
|
2121
2246
|
rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
|
2122
|
-
|
2123
|
-
rb_define_alloc_func(cAsciiLetterAnalyzer,
|
2247
|
+
frb_mark_cclass(cAsciiLetterAnalyzer);
|
2248
|
+
rb_define_alloc_func(cAsciiLetterAnalyzer, frb_data_alloc);
|
2124
2249
|
rb_define_method(cAsciiLetterAnalyzer, "initialize",
|
2125
|
-
|
2250
|
+
frb_a_letter_analyzer_init, -1);
|
2126
2251
|
}
|
2127
2252
|
|
2128
2253
|
/*
|
@@ -2150,10 +2275,10 @@ static void Init_LetterAnalyzer(void)
|
|
2150
2275
|
{
|
2151
2276
|
cLetterAnalyzer =
|
2152
2277
|
rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
|
2153
|
-
|
2154
|
-
rb_define_alloc_func(cLetterAnalyzer,
|
2278
|
+
frb_mark_cclass(cLetterAnalyzer);
|
2279
|
+
rb_define_alloc_func(cLetterAnalyzer, frb_data_alloc);
|
2155
2280
|
rb_define_method(cLetterAnalyzer, "initialize",
|
2156
|
-
|
2281
|
+
frb_letter_analyzer_init, -1);
|
2157
2282
|
}
|
2158
2283
|
|
2159
2284
|
/*
|
@@ -2187,10 +2312,10 @@ static void Init_AsciiWhiteSpaceAnalyzer(void)
|
|
2187
2312
|
{
|
2188
2313
|
cAsciiWhiteSpaceAnalyzer =
|
2189
2314
|
rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
|
2190
|
-
|
2191
|
-
rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer,
|
2315
|
+
frb_mark_cclass(cAsciiWhiteSpaceAnalyzer);
|
2316
|
+
rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frb_data_alloc);
|
2192
2317
|
rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
|
2193
|
-
|
2318
|
+
frb_a_white_space_analyzer_init, -1);
|
2194
2319
|
}
|
2195
2320
|
|
2196
2321
|
/*
|
@@ -2218,10 +2343,10 @@ static void Init_WhiteSpaceAnalyzer(void)
|
|
2218
2343
|
{
|
2219
2344
|
cWhiteSpaceAnalyzer =
|
2220
2345
|
rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
|
2221
|
-
|
2222
|
-
rb_define_alloc_func(cWhiteSpaceAnalyzer,
|
2346
|
+
frb_mark_cclass(cWhiteSpaceAnalyzer);
|
2347
|
+
rb_define_alloc_func(cWhiteSpaceAnalyzer, frb_data_alloc);
|
2223
2348
|
rb_define_method(cWhiteSpaceAnalyzer, "initialize",
|
2224
|
-
|
2349
|
+
frb_white_space_analyzer_init, -1);
|
2225
2350
|
}
|
2226
2351
|
|
2227
2352
|
/*
|
@@ -2255,10 +2380,10 @@ static void Init_AsciiStandardAnalyzer(void)
|
|
2255
2380
|
{
|
2256
2381
|
cAsciiStandardAnalyzer =
|
2257
2382
|
rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
|
2258
|
-
|
2259
|
-
rb_define_alloc_func(cAsciiStandardAnalyzer,
|
2383
|
+
frb_mark_cclass(cAsciiStandardAnalyzer);
|
2384
|
+
rb_define_alloc_func(cAsciiStandardAnalyzer, frb_data_alloc);
|
2260
2385
|
rb_define_method(cAsciiStandardAnalyzer, "initialize",
|
2261
|
-
|
2386
|
+
frb_a_standard_analyzer_init, -1);
|
2262
2387
|
}
|
2263
2388
|
|
2264
2389
|
/*
|
@@ -2290,10 +2415,10 @@ static void Init_StandardAnalyzer(void)
|
|
2290
2415
|
{
|
2291
2416
|
cStandardAnalyzer =
|
2292
2417
|
rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
|
2293
|
-
|
2294
|
-
rb_define_alloc_func(cStandardAnalyzer,
|
2418
|
+
frb_mark_cclass(cStandardAnalyzer);
|
2419
|
+
rb_define_alloc_func(cStandardAnalyzer, frb_data_alloc);
|
2295
2420
|
rb_define_method(cStandardAnalyzer, "initialize",
|
2296
|
-
|
2421
|
+
frb_standard_analyzer_init, -1);
|
2297
2422
|
}
|
2298
2423
|
|
2299
2424
|
/*
|
@@ -2320,16 +2445,16 @@ static void Init_PerFieldAnalyzer(void)
|
|
2320
2445
|
{
|
2321
2446
|
cPerFieldAnalyzer =
|
2322
2447
|
rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
|
2323
|
-
|
2324
|
-
rb_define_alloc_func(cPerFieldAnalyzer,
|
2448
|
+
frb_mark_cclass(cPerFieldAnalyzer);
|
2449
|
+
rb_define_alloc_func(cPerFieldAnalyzer, frb_data_alloc);
|
2325
2450
|
rb_define_method(cPerFieldAnalyzer, "initialize",
|
2326
|
-
|
2451
|
+
frb_per_field_analyzer_init, 1);
|
2327
2452
|
rb_define_method(cPerFieldAnalyzer, "add_field",
|
2328
|
-
|
2453
|
+
frb_per_field_analyzer_add_field, 2);
|
2329
2454
|
rb_define_method(cPerFieldAnalyzer, "[]=",
|
2330
|
-
|
2455
|
+
frb_per_field_analyzer_add_field, 2);
|
2331
2456
|
rb_define_method(cPerFieldAnalyzer, "token_stream",
|
2332
|
-
|
2457
|
+
frb_pfa_analyzer_token_stream, 2);
|
2333
2458
|
}
|
2334
2459
|
|
2335
2460
|
/*
|
@@ -2363,12 +2488,12 @@ static void Init_RegExpAnalyzer(void)
|
|
2363
2488
|
{
|
2364
2489
|
cRegExpAnalyzer =
|
2365
2490
|
rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
|
2366
|
-
|
2367
|
-
rb_define_alloc_func(cRegExpAnalyzer,
|
2491
|
+
frb_mark_cclass(cRegExpAnalyzer);
|
2492
|
+
rb_define_alloc_func(cRegExpAnalyzer, frb_data_alloc);
|
2368
2493
|
rb_define_method(cRegExpAnalyzer, "initialize",
|
2369
|
-
|
2494
|
+
frb_re_analyzer_init, -1);
|
2370
2495
|
rb_define_method(cRegExpAnalyzer, "token_stream",
|
2371
|
-
|
2496
|
+
frb_re_analyzer_token_stream, 2);
|
2372
2497
|
}
|
2373
2498
|
|
2374
2499
|
/* rdoc hack
|
@@ -2433,8 +2558,8 @@ Init_Analysis(void)
|
|
2433
2558
|
rb_define_const(mFerret, "OBJECT_SPACE", object_space);
|
2434
2559
|
|
2435
2560
|
/*** * * Locale stuff * * ***/
|
2436
|
-
rb_define_singleton_method(mFerret, "locale=",
|
2437
|
-
rb_define_singleton_method(mFerret, "locale",
|
2561
|
+
rb_define_singleton_method(mFerret, "locale=", frb_set_locale, 1);
|
2562
|
+
rb_define_singleton_method(mFerret, "locale", frb_get_locale, 0);
|
2438
2563
|
|
2439
2564
|
rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
|
2440
2565
|
get_rstopwords(ENGLISH_STOP_WORDS));
|
@@ -2464,6 +2589,8 @@ Init_Analysis(void)
|
|
2464
2589
|
get_rstopwords(FULL_RUSSIAN_STOP_WORDS));
|
2465
2590
|
rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
|
2466
2591
|
get_rstopwords(FULL_FINNISH_STOP_WORDS));
|
2592
|
+
rb_define_const(mAnalysis, "FULL_HUNGARIAN_STOP_WORDS",
|
2593
|
+
get_rstopwords(FULL_HUNGARIAN_STOP_WORDS));
|
2467
2594
|
|
2468
2595
|
Init_Token();
|
2469
2596
|
Init_TokenStream();
|