ferret 0.11.6 → 0.11.8.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README +10 -22
- data/RELEASE_CHANGES +137 -0
- data/RELEASE_NOTES +60 -0
- data/Rakefile +379 -274
- data/TODO +100 -8
- data/bin/ferret-browser +0 -0
- data/ext/BZLIB_blocksort.c +1094 -0
- data/ext/BZLIB_bzlib.c +1578 -0
- data/ext/BZLIB_compress.c +672 -0
- data/ext/BZLIB_crctable.c +104 -0
- data/ext/BZLIB_decompress.c +626 -0
- data/ext/BZLIB_huffman.c +205 -0
- data/ext/BZLIB_randtable.c +84 -0
- data/ext/{api.c → STEMMER_api.c} +7 -10
- data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
- data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
- data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
- data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
- data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
- data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
- data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
- data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
- data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
- data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
- data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
- data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
- data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
- data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
- data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
- data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
- data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
- data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
- data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
- data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
- data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
- data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
- data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
- data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
- data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
- data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
- data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
- data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
- data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
- data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
- data/ext/analysis.c +276 -121
- data/ext/analysis.h +190 -143
- data/ext/api.h +3 -4
- data/ext/array.c +5 -3
- data/ext/array.h +52 -43
- data/ext/bitvector.c +38 -482
- data/ext/bitvector.h +446 -124
- data/ext/bzlib.h +282 -0
- data/ext/bzlib_private.h +503 -0
- data/ext/compound_io.c +23 -22
- data/ext/config.h +21 -11
- data/ext/document.c +43 -40
- data/ext/document.h +31 -21
- data/ext/except.c +20 -38
- data/ext/except.h +89 -76
- data/ext/extconf.rb +3 -2
- data/ext/ferret.c +49 -35
- data/ext/ferret.h +14 -11
- data/ext/field_index.c +262 -0
- data/ext/field_index.h +52 -0
- data/ext/filter.c +11 -10
- data/ext/fs_store.c +65 -47
- data/ext/global.c +245 -165
- data/ext/global.h +252 -54
- data/ext/hash.c +200 -243
- data/ext/hash.h +205 -163
- data/ext/hashset.c +118 -96
- data/ext/hashset.h +110 -82
- data/ext/header.h +19 -19
- data/ext/helper.c +11 -10
- data/ext/helper.h +14 -6
- data/ext/index.c +745 -366
- data/ext/index.h +503 -529
- data/ext/internal.h +1020 -0
- data/ext/lang.c +10 -0
- data/ext/lang.h +35 -15
- data/ext/mempool.c +5 -4
- data/ext/mempool.h +30 -22
- data/ext/modules.h +35 -7
- data/ext/multimapper.c +43 -2
- data/ext/multimapper.h +32 -23
- data/ext/posh.c +0 -0
- data/ext/posh.h +4 -38
- data/ext/priorityqueue.c +10 -12
- data/ext/priorityqueue.h +33 -21
- data/ext/q_boolean.c +22 -9
- data/ext/q_const_score.c +3 -2
- data/ext/q_filtered_query.c +15 -12
- data/ext/q_fuzzy.c +147 -135
- data/ext/q_match_all.c +3 -2
- data/ext/q_multi_term.c +28 -32
- data/ext/q_parser.c +451 -173
- data/ext/q_phrase.c +158 -79
- data/ext/q_prefix.c +16 -18
- data/ext/q_range.c +363 -31
- data/ext/q_span.c +130 -141
- data/ext/q_term.c +21 -21
- data/ext/q_wildcard.c +19 -23
- data/ext/r_analysis.c +369 -242
- data/ext/r_index.c +421 -434
- data/ext/r_qparser.c +142 -92
- data/ext/r_search.c +790 -407
- data/ext/r_store.c +44 -44
- data/ext/r_utils.c +264 -96
- data/ext/ram_store.c +29 -23
- data/ext/scanner.c +895 -0
- data/ext/scanner.h +36 -0
- data/ext/scanner_mb.c +6701 -0
- data/ext/scanner_utf8.c +4415 -0
- data/ext/search.c +210 -87
- data/ext/search.h +556 -488
- data/ext/similarity.c +17 -16
- data/ext/similarity.h +51 -44
- data/ext/sort.c +157 -354
- data/ext/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/stem_UTF_8_hungarian.h +16 -0
- data/ext/stem_UTF_8_romanian.h +16 -0
- data/ext/stem_UTF_8_turkish.h +16 -0
- data/ext/stopwords.c +287 -278
- data/ext/store.c +57 -51
- data/ext/store.h +308 -286
- data/ext/symbol.c +10 -0
- data/ext/symbol.h +23 -0
- data/ext/term_vectors.c +14 -293
- data/ext/threading.h +22 -22
- data/ext/win32.h +12 -4
- data/lib/ferret.rb +2 -1
- data/lib/ferret/browser.rb +1 -1
- data/lib/ferret/field_symbol.rb +94 -0
- data/lib/ferret/index.rb +221 -34
- data/lib/ferret/number_tools.rb +6 -6
- data/lib/ferret/version.rb +3 -0
- data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
- data/test/test_helper.rb +7 -2
- data/test/test_installed.rb +1 -0
- data/test/threading/thread_safety_index_test.rb +10 -1
- data/test/threading/thread_safety_read_write_test.rb +4 -7
- data/test/threading/thread_safety_test.rb +0 -0
- data/test/unit/analysis/tc_analyzer.rb +29 -27
- data/test/unit/analysis/tc_token_stream.rb +23 -16
- data/test/unit/index/tc_index.rb +116 -11
- data/test/unit/index/tc_index_reader.rb +27 -27
- data/test/unit/index/tc_index_writer.rb +10 -0
- data/test/unit/index/th_doc.rb +38 -21
- data/test/unit/search/tc_filter.rb +31 -10
- data/test/unit/search/tc_index_searcher.rb +6 -0
- data/test/unit/search/tm_searcher.rb +53 -1
- data/test/unit/store/tc_fs_store.rb +40 -2
- data/test/unit/store/tc_ram_store.rb +0 -0
- data/test/unit/store/tm_store.rb +0 -0
- data/test/unit/store/tm_store_lock.rb +7 -6
- data/test/unit/tc_field_symbol.rb +26 -0
- data/test/unit/ts_analysis.rb +0 -0
- data/test/unit/ts_index.rb +0 -0
- data/test/unit/ts_store.rb +0 -0
- data/test/unit/ts_utils.rb +0 -0
- data/test/unit/utils/tc_number_tools.rb +0 -0
- data/test/utils/content_generator.rb +226 -0
- metadata +262 -221
- data/ext/inc/lang.h +0 -48
- data/ext/inc/threading.h +0 -31
- data/ext/stem_ISO_8859_1_english.c +0 -1156
- data/ext/stem_ISO_8859_1_french.c +0 -1276
- data/ext/stem_ISO_8859_1_italian.c +0 -1091
- data/ext/stem_ISO_8859_1_norwegian.c +0 -296
- data/ext/stem_ISO_8859_1_spanish.c +0 -1119
- data/ext/stem_ISO_8859_1_swedish.c +0 -307
- data/ext/stem_UTF_8_danish.c +0 -344
- data/ext/stem_UTF_8_english.c +0 -1176
- data/ext/stem_UTF_8_french.c +0 -1296
- data/ext/stem_UTF_8_italian.c +0 -1113
- data/ext/stem_UTF_8_norwegian.c +0 -302
- data/ext/stem_UTF_8_portuguese.c +0 -1055
- data/ext/stem_UTF_8_russian.c +0 -709
- data/ext/stem_UTF_8_spanish.c +0 -1137
- data/ext/stem_UTF_8_swedish.c +0 -313
- data/lib/ferret_version.rb +0 -3
@@ -25,8 +25,8 @@ extern void lose_s(symbol * p) {
|
|
25
25
|
}
|
26
26
|
|
27
27
|
/*
|
28
|
-
new_p =
|
29
|
-
if n +ve, or n characters backwards from p +c - 1 if n -ve. new_p is the new
|
28
|
+
new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
|
29
|
+
if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new
|
30
30
|
position, or 0 on failure.
|
31
31
|
|
32
32
|
-- used to implement hop and next in the utf8 case.
|
@@ -76,7 +76,7 @@ static int get_utf8(const symbol * p, int c, int l, int * slot) {
|
|
76
76
|
if (b0 < 0xE0 || c == l) { /* 1110 0000 */
|
77
77
|
* slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
|
78
78
|
}
|
79
|
-
* slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (
|
79
|
+
* slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[c] & 0x3F); return 3;
|
80
80
|
}
|
81
81
|
|
82
82
|
static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
|
@@ -90,94 +90,126 @@ static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
|
|
90
90
|
if (b1 >= 0xC0 || c == lb) { /* 1100 0000 */
|
91
91
|
* slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
|
92
92
|
}
|
93
|
-
* slot = (
|
93
|
+
* slot = (p[c] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
|
94
94
|
}
|
95
95
|
|
96
|
-
extern int in_grouping_U(struct SN_env * z, unsigned char * s, int min, int max) {
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
96
|
+
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
|
97
|
+
do {
|
98
|
+
int ch;
|
99
|
+
int w = get_utf8(z->p, z->c, z->l, & ch);
|
100
|
+
unless (w) return -1;
|
101
|
+
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
|
102
|
+
return w;
|
103
|
+
z->c += w;
|
104
|
+
} while (repeat);
|
105
|
+
return 0;
|
102
106
|
}
|
103
107
|
|
104
|
-
extern int in_grouping_b_U(struct SN_env * z, unsigned char * s, int min, int max) {
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
108
|
+
extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
|
109
|
+
do {
|
110
|
+
int ch;
|
111
|
+
int w = get_b_utf8(z->p, z->c, z->lb, & ch);
|
112
|
+
unless (w) return -1;
|
113
|
+
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
|
114
|
+
return w;
|
115
|
+
z->c -= w;
|
116
|
+
} while (repeat);
|
117
|
+
return 0;
|
110
118
|
}
|
111
119
|
|
112
|
-
extern int out_grouping_U(struct SN_env * z, unsigned char * s, int min, int max) {
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
120
|
+
extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
|
121
|
+
do {
|
122
|
+
int ch;
|
123
|
+
int w = get_utf8(z->p, z->c, z->l, & ch);
|
124
|
+
unless (w) return -1;
|
125
|
+
unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
|
126
|
+
return w;
|
127
|
+
z->c += w;
|
128
|
+
} while (repeat);
|
129
|
+
return 0;
|
118
130
|
}
|
119
131
|
|
120
|
-
extern int out_grouping_b_U(struct SN_env * z, unsigned char * s, int min, int max) {
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
132
|
+
extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
|
133
|
+
do {
|
134
|
+
int ch;
|
135
|
+
int w = get_b_utf8(z->p, z->c, z->lb, & ch);
|
136
|
+
unless (w) return -1;
|
137
|
+
unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
|
138
|
+
return w;
|
139
|
+
z->c -= w;
|
140
|
+
} while (repeat);
|
141
|
+
return 0;
|
126
142
|
}
|
127
143
|
|
128
144
|
/* Code for character groupings: non-utf8 cases */
|
129
145
|
|
130
|
-
extern int in_grouping(struct SN_env * z, unsigned char * s, int min, int max) {
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
146
|
+
extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
|
147
|
+
do {
|
148
|
+
int ch;
|
149
|
+
if (z->c >= z->l) return -1;
|
150
|
+
ch = z->p[z->c];
|
151
|
+
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
|
152
|
+
return 1;
|
153
|
+
z->c++;
|
154
|
+
} while (repeat);
|
155
|
+
return 0;
|
136
156
|
}
|
137
157
|
|
138
|
-
extern int in_grouping_b(struct SN_env * z, unsigned char * s, int min, int max) {
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
158
|
+
extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
|
159
|
+
do {
|
160
|
+
int ch;
|
161
|
+
if (z->c <= z->lb) return -1;
|
162
|
+
ch = z->p[z->c - 1];
|
163
|
+
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
|
164
|
+
return 1;
|
165
|
+
z->c--;
|
166
|
+
} while (repeat);
|
167
|
+
return 0;
|
144
168
|
}
|
145
169
|
|
146
|
-
extern int out_grouping(struct SN_env * z, unsigned char * s, int min, int max) {
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
170
|
+
extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
|
171
|
+
do {
|
172
|
+
int ch;
|
173
|
+
if (z->c >= z->l) return -1;
|
174
|
+
ch = z->p[z->c];
|
175
|
+
unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
|
176
|
+
return 1;
|
177
|
+
z->c++;
|
178
|
+
} while (repeat);
|
179
|
+
return 0;
|
152
180
|
}
|
153
181
|
|
154
|
-
extern int out_grouping_b(struct SN_env * z, unsigned char * s, int min, int max) {
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
182
|
+
extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
|
183
|
+
do {
|
184
|
+
int ch;
|
185
|
+
if (z->c <= z->lb) return -1;
|
186
|
+
ch = z->p[z->c - 1];
|
187
|
+
unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
|
188
|
+
return 1;
|
189
|
+
z->c--;
|
190
|
+
} while (repeat);
|
191
|
+
return 0;
|
160
192
|
}
|
161
193
|
|
162
|
-
extern int eq_s(struct SN_env * z, int s_size, symbol * s) {
|
194
|
+
extern int eq_s(struct SN_env * z, int s_size, const symbol * s) {
|
163
195
|
if (z->l - z->c < s_size || memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0;
|
164
196
|
z->c += s_size; return 1;
|
165
197
|
}
|
166
198
|
|
167
|
-
extern int eq_s_b(struct SN_env * z, int s_size, symbol * s) {
|
199
|
+
extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s) {
|
168
200
|
if (z->c - z->lb < s_size || memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0;
|
169
201
|
z->c -= s_size; return 1;
|
170
202
|
}
|
171
203
|
|
172
|
-
extern int eq_v(struct SN_env * z, symbol * p) {
|
204
|
+
extern int eq_v(struct SN_env * z, const symbol * p) {
|
173
205
|
return eq_s(z, SIZE(p), p);
|
174
206
|
}
|
175
207
|
|
176
|
-
extern int eq_v_b(struct SN_env * z, symbol * p) {
|
208
|
+
extern int eq_v_b(struct SN_env * z, const symbol * p) {
|
177
209
|
return eq_s_b(z, SIZE(p), p);
|
178
210
|
}
|
179
211
|
|
180
|
-
extern int find_among(struct SN_env * z, struct among * v, int v_size) {
|
212
|
+
extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
|
181
213
|
|
182
214
|
int i = 0;
|
183
215
|
int j = v_size;
|
@@ -185,7 +217,7 @@ extern int find_among(struct SN_env * z, struct among * v, int v_size) {
|
|
185
217
|
int c = z->c; int l = z->l;
|
186
218
|
symbol * q = z->p + c;
|
187
219
|
|
188
|
-
struct among * w;
|
220
|
+
const struct among * w;
|
189
221
|
|
190
222
|
int common_i = 0;
|
191
223
|
int common_j = 0;
|
@@ -198,9 +230,9 @@ extern int find_among(struct SN_env * z, struct among * v, int v_size) {
|
|
198
230
|
int common = common_i < common_j ? common_i : common_j; /* smaller */
|
199
231
|
w = v + k;
|
200
232
|
{
|
201
|
-
int
|
233
|
+
int i2; for (i2 = common; i2 < w->s_size; i2++) {
|
202
234
|
if (c + common == l) { diff = -1; break; }
|
203
|
-
diff = q[common] - w->s[
|
235
|
+
diff = q[common] - w->s[i2];
|
204
236
|
if (diff != 0) break;
|
205
237
|
common++;
|
206
238
|
}
|
@@ -237,7 +269,7 @@ extern int find_among(struct SN_env * z, struct among * v, int v_size) {
|
|
237
269
|
|
238
270
|
/* find_among_b is for backwards processing. Same comments apply */
|
239
271
|
|
240
|
-
extern int find_among_b(struct SN_env * z, struct among * v, int v_size) {
|
272
|
+
extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
|
241
273
|
|
242
274
|
int i = 0;
|
243
275
|
int j = v_size;
|
@@ -245,7 +277,7 @@ extern int find_among_b(struct SN_env * z, struct among * v, int v_size) {
|
|
245
277
|
int c = z->c; int lb = z->lb;
|
246
278
|
symbol * q = z->p + c - 1;
|
247
279
|
|
248
|
-
struct among * w;
|
280
|
+
const struct among * w;
|
249
281
|
|
250
282
|
int common_i = 0;
|
251
283
|
int common_j = 0;
|
@@ -258,9 +290,9 @@ extern int find_among_b(struct SN_env * z, struct among * v, int v_size) {
|
|
258
290
|
int common = common_i < common_j ? common_i : common_j;
|
259
291
|
w = v + k;
|
260
292
|
{
|
261
|
-
int
|
293
|
+
int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) {
|
262
294
|
if (c - common == lb) { diff = -1; break; }
|
263
|
-
diff = q[- common] - w->s[
|
295
|
+
diff = q[- common] - w->s[i2];
|
264
296
|
if (diff != 0) break;
|
265
297
|
common++;
|
266
298
|
}
|
@@ -362,12 +394,12 @@ static int slice_check(struct SN_env * z) {
|
|
362
394
|
return 0;
|
363
395
|
}
|
364
396
|
|
365
|
-
extern int slice_from_s(struct SN_env * z, int s_size, symbol * s) {
|
397
|
+
extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s) {
|
366
398
|
if (slice_check(z)) return -1;
|
367
399
|
return replace_s(z, z->bra, z->ket, s_size, s, NULL);
|
368
400
|
}
|
369
401
|
|
370
|
-
extern int slice_from_v(struct SN_env * z, symbol * p) {
|
402
|
+
extern int slice_from_v(struct SN_env * z, const symbol * p) {
|
371
403
|
return slice_from_s(z, SIZE(p), p);
|
372
404
|
}
|
373
405
|
|
@@ -375,7 +407,7 @@ extern int slice_del(struct SN_env * z) {
|
|
375
407
|
return slice_from_s(z, 0, 0);
|
376
408
|
}
|
377
409
|
|
378
|
-
extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, symbol * s) {
|
410
|
+
extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) {
|
379
411
|
int adjustment;
|
380
412
|
if (replace_s(z, bra, ket, s_size, s, &adjustment))
|
381
413
|
return -1;
|
@@ -384,7 +416,7 @@ extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, symbol * s)
|
|
384
416
|
return 0;
|
385
417
|
}
|
386
418
|
|
387
|
-
extern int insert_v(struct SN_env * z, int bra, int ket, symbol * p) {
|
419
|
+
extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) {
|
388
420
|
int adjustment;
|
389
421
|
if (replace_s(z, bra, ket, SIZE(p), p, &adjustment))
|
390
422
|
return -1;
|
data/ext/analysis.c
CHANGED
@@ -5,6 +5,8 @@
|
|
5
5
|
#include <ctype.h>
|
6
6
|
#include <wctype.h>
|
7
7
|
#include <wchar.h>
|
8
|
+
#include "internal.h"
|
9
|
+
#include "scanner.h"
|
8
10
|
|
9
11
|
/****************************************************************************
|
10
12
|
*
|
@@ -27,8 +29,8 @@ INLINE Token *tk_set(Token *tk,
|
|
27
29
|
return tk;
|
28
30
|
}
|
29
31
|
|
30
|
-
INLINE Token *tk_set_ts(Token *tk,
|
31
|
-
|
32
|
+
static INLINE Token *tk_set_ts(Token *tk, char *start, char *end,
|
33
|
+
char *text, int pos_inc)
|
32
34
|
{
|
33
35
|
return tk_set(tk, start, (int)(end - start),
|
34
36
|
(off_t)(start - text), (off_t)(end - text), pos_inc);
|
@@ -40,8 +42,8 @@ INLINE Token *tk_set_no_len(Token *tk,
|
|
40
42
|
return tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
|
41
43
|
}
|
42
44
|
|
43
|
-
INLINE Token *w_tk_set(Token *tk, wchar_t *text, off_t start,
|
44
|
-
|
45
|
+
static INLINE Token *w_tk_set(Token *tk, wchar_t *text, off_t start,
|
46
|
+
off_t end, int pos_inc)
|
45
47
|
{
|
46
48
|
int len = wcstombs(tk->text, text, MAX_WORD_SIZE - 1);
|
47
49
|
tk->text[len] = '\0';
|
@@ -121,7 +123,7 @@ TokenStream *ts_clone_size(TokenStream *orig_ts, size_t size)
|
|
121
123
|
|
122
124
|
TokenStream *ts_new_i(size_t size)
|
123
125
|
{
|
124
|
-
TokenStream *ts = ecalloc(size);
|
126
|
+
TokenStream *ts = (TokenStream *)ecalloc(size);
|
125
127
|
|
126
128
|
ts->destroy_i = (void (*)(TokenStream *))&free;
|
127
129
|
ts->reset = &ts_reset;
|
@@ -152,7 +154,7 @@ static TokenStream *cts_new()
|
|
152
154
|
|
153
155
|
#define MBTS(token_stream) ((MultiByteTokenStream *)(token_stream))
|
154
156
|
|
155
|
-
INLINE int mb_next_char(wchar_t *wchr, const char *s, mbstate_t *state)
|
157
|
+
static INLINE int mb_next_char(wchar_t *wchr, const char *s, mbstate_t *state)
|
156
158
|
{
|
157
159
|
int num_bytes;
|
158
160
|
if ((num_bytes = (int)mbrtowc(wchr, s, MB_CUR_MAX, state)) < 0) {
|
@@ -180,7 +182,7 @@ static TokenStream *mb_ts_clone_i(TokenStream *orig_ts)
|
|
180
182
|
return ts_clone_size(orig_ts, sizeof(MultiByteTokenStream));
|
181
183
|
}
|
182
184
|
|
183
|
-
TokenStream *mb_ts_new()
|
185
|
+
static TokenStream *mb_ts_new()
|
184
186
|
{
|
185
187
|
TokenStream *ts = ts_new(MultiByteTokenStream);
|
186
188
|
ts->reset = &mb_ts_reset;
|
@@ -210,7 +212,9 @@ static void a_standard_destroy_i(Analyzer *a)
|
|
210
212
|
free(a);
|
211
213
|
}
|
212
214
|
|
213
|
-
static TokenStream *a_standard_get_ts(Analyzer *a,
|
215
|
+
static TokenStream *a_standard_get_ts(Analyzer *a,
|
216
|
+
Symbol field,
|
217
|
+
char *text)
|
214
218
|
{
|
215
219
|
TokenStream *ts;
|
216
220
|
(void)field;
|
@@ -220,7 +224,8 @@ static TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
|
|
220
224
|
|
221
225
|
Analyzer *analyzer_new(TokenStream *ts,
|
222
226
|
void (*destroy_i)(Analyzer *a),
|
223
|
-
TokenStream *(*get_ts)(Analyzer *a,
|
227
|
+
TokenStream *(*get_ts)(Analyzer *a,
|
228
|
+
Symbol field,
|
224
229
|
char *text))
|
225
230
|
{
|
226
231
|
Analyzer *a = ALLOC(Analyzer);
|
@@ -414,7 +419,7 @@ Analyzer *mb_whitespace_analyzer_new(bool lowercase)
|
|
414
419
|
/*
|
415
420
|
* LetterTokenizer
|
416
421
|
*/
|
417
|
-
Token *lt_next(TokenStream *ts)
|
422
|
+
static Token *lt_next(TokenStream *ts)
|
418
423
|
{
|
419
424
|
char *start;
|
420
425
|
char *t = ts->t;
|
@@ -446,7 +451,7 @@ TokenStream *letter_tokenizer_new()
|
|
446
451
|
/*
|
447
452
|
* Multi-byte LetterTokenizer
|
448
453
|
*/
|
449
|
-
Token *mb_lt_next(TokenStream *ts)
|
454
|
+
static Token *mb_lt_next(TokenStream *ts)
|
450
455
|
{
|
451
456
|
int i;
|
452
457
|
char *start;
|
@@ -478,7 +483,7 @@ Token *mb_lt_next(TokenStream *ts)
|
|
478
483
|
/*
|
479
484
|
* Lowercasing Multi-byte LetterTokenizer
|
480
485
|
*/
|
481
|
-
Token *mb_lt_next_lc(TokenStream *ts)
|
486
|
+
static Token *mb_lt_next_lc(TokenStream *ts)
|
482
487
|
{
|
483
488
|
int i;
|
484
489
|
char *start;
|
@@ -554,43 +559,88 @@ Analyzer *mb_letter_analyzer_new(bool lowercase)
|
|
554
559
|
/*
|
555
560
|
* StandardTokenizer
|
556
561
|
*/
|
557
|
-
static
|
562
|
+
static Token *std_next(TokenStream *ts)
|
558
563
|
{
|
559
|
-
|
560
|
-
char *
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
564
|
+
StandardTokenizer *std_tz = STDTS(ts);
|
565
|
+
const char *start = NULL;
|
566
|
+
const char *end = NULL;
|
567
|
+
int len;
|
568
|
+
Token *tk = &(CTS(ts)->token);
|
569
|
+
|
570
|
+
switch (std_tz->type) {
|
571
|
+
case STT_ASCII:
|
572
|
+
frt_std_scan(ts->t, tk->text, sizeof(tk->text) - 1,
|
573
|
+
&start, &end, &len);
|
574
|
+
break;
|
575
|
+
case STT_MB:
|
576
|
+
frt_std_scan_mb(ts->t, tk->text, sizeof(tk->text) - 1,
|
577
|
+
&start, &end, &len);
|
578
|
+
break;
|
579
|
+
case STT_UTF8:
|
580
|
+
frt_std_scan_utf8(ts->t, tk->text, sizeof(tk->text) - 1,
|
581
|
+
&start, &end, &len);
|
582
|
+
break;
|
566
583
|
}
|
567
|
-
|
584
|
+
|
585
|
+
if (len == 0)
|
586
|
+
return NULL;
|
587
|
+
|
588
|
+
ts->t = (char *)end;
|
589
|
+
tk->len = len;
|
590
|
+
tk->start = start - ts->text;
|
591
|
+
tk->end = end - ts->text;
|
592
|
+
tk->pos_inc = 1;
|
593
|
+
return &(CTS(ts)->token);
|
568
594
|
}
|
569
595
|
|
570
|
-
static
|
596
|
+
static TokenStream *std_ts_clone_i(TokenStream *orig_ts)
|
571
597
|
{
|
572
|
-
|
573
|
-
|
574
|
-
int i;
|
575
|
-
mbstate_t state; ZEROSET(&state, mbstate_t);
|
598
|
+
return ts_clone_size(orig_ts, sizeof(StandardTokenizer));
|
599
|
+
}
|
576
600
|
|
577
|
-
|
601
|
+
static TokenStream *std_ts_new()
|
602
|
+
{
|
603
|
+
TokenStream *ts = ts_new(StandardTokenizer);
|
578
604
|
|
579
|
-
|
580
|
-
|
581
|
-
i = mb_next_char(&wchr, t, &state);
|
582
|
-
}
|
605
|
+
ts->clone_i = &std_ts_clone_i;
|
606
|
+
ts->next = &std_next;
|
583
607
|
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
608
|
+
return ts;
|
609
|
+
}
|
610
|
+
|
611
|
+
TokenStream *standard_tokenizer_new()
|
612
|
+
{
|
613
|
+
TokenStream *ts = std_ts_new();
|
614
|
+
STDTS(ts)->type = STT_ASCII;
|
615
|
+
return ts;
|
616
|
+
}
|
617
|
+
|
618
|
+
TokenStream *mb_standard_tokenizer_new()
|
619
|
+
{
|
620
|
+
TokenStream *ts = std_ts_new();
|
621
|
+
STDTS(ts)->type = STT_MB;
|
622
|
+
return ts;
|
590
623
|
}
|
591
624
|
|
625
|
+
TokenStream *utf8_standard_tokenizer_new()
|
626
|
+
{
|
627
|
+
TokenStream *ts = std_ts_new();
|
628
|
+
STDTS(ts)->type = STT_UTF8;
|
629
|
+
return ts;
|
630
|
+
}
|
631
|
+
|
632
|
+
/****************************************************************************
|
633
|
+
*
|
634
|
+
* LegacyStandard
|
635
|
+
*
|
636
|
+
****************************************************************************/
|
637
|
+
|
638
|
+
#define LSTDTS(token_stream) ((LegacyStandardTokenizer *)(token_stream))
|
639
|
+
|
592
640
|
/*
|
593
|
-
|
641
|
+
* LegacyStandardTokenizer
|
642
|
+
*/
|
643
|
+
static int legacy_std_get_alpha(TokenStream *ts, char *token)
|
594
644
|
{
|
595
645
|
int i = 0;
|
596
646
|
char *t = ts->t;
|
@@ -603,7 +653,7 @@ static int std_get_alnum(TokenStream *ts, char *token)
|
|
603
653
|
return i;
|
604
654
|
}
|
605
655
|
|
606
|
-
static int
|
656
|
+
static int mb_legacy_std_get_alpha(TokenStream *ts, char *token)
|
607
657
|
{
|
608
658
|
char *t = ts->t;
|
609
659
|
wchar_t wchr;
|
@@ -624,7 +674,6 @@ static int mb_std_get_alnum(TokenStream *ts, char *token)
|
|
624
674
|
memcpy(token, ts->t, i);
|
625
675
|
return i;
|
626
676
|
}
|
627
|
-
*/
|
628
677
|
|
629
678
|
static int isnumpunc(char c)
|
630
679
|
{
|
@@ -659,7 +708,7 @@ static int isurlxatc(char c)
|
|
659
708
|
|| isalnum(c));
|
660
709
|
}
|
661
710
|
|
662
|
-
static bool
|
711
|
+
static bool legacy_std_is_tok_char(char *c)
|
663
712
|
{
|
664
713
|
if (isspace(*c)) {
|
665
714
|
return false; /* most common so check first. */
|
@@ -671,11 +720,11 @@ static bool std_is_tok_char(char *c)
|
|
671
720
|
return false;
|
672
721
|
}
|
673
722
|
|
674
|
-
static bool
|
723
|
+
static bool mb_legacy_std_is_tok_char(char *t)
|
675
724
|
{
|
676
725
|
wchar_t c;
|
677
726
|
mbstate_t state; ZEROSET(&state, mbstate_t);
|
678
|
-
|
727
|
+
|
679
728
|
if (((int)mbrtowc(&c, t, MB_CUR_MAX, &state)) < 0) {
|
680
729
|
/* error which we can handle next time round. For now just return
|
681
730
|
* false so that we can return a token */
|
@@ -696,7 +745,7 @@ static bool mb_std_is_tok_char(char *t)
|
|
696
745
|
* (alnum) = [a-zA-Z0-9]
|
697
746
|
* (punc) = [_\/.,-]
|
698
747
|
*/
|
699
|
-
static int
|
748
|
+
static int legacy_std_get_number(char *input)
|
700
749
|
{
|
701
750
|
int i = 0;
|
702
751
|
int count = 0;
|
@@ -732,7 +781,7 @@ static int std_get_number(char *input)
|
|
732
781
|
}
|
733
782
|
}
|
734
783
|
|
735
|
-
static int
|
784
|
+
static int legacy_std_get_apostrophe(char *input)
|
736
785
|
{
|
737
786
|
char *t = input;
|
738
787
|
|
@@ -743,7 +792,7 @@ static int std_get_apostrophe(char *input)
|
|
743
792
|
return (int)(t - input);
|
744
793
|
}
|
745
794
|
|
746
|
-
static int
|
795
|
+
static int mb_legacy_std_get_apostrophe(char *input)
|
747
796
|
{
|
748
797
|
char *t = input;
|
749
798
|
wchar_t wchr;
|
@@ -759,8 +808,9 @@ static int mb_std_get_apostrophe(char *input)
|
|
759
808
|
return (int)(t - input);
|
760
809
|
}
|
761
810
|
|
762
|
-
static
|
811
|
+
static char *std_get_url(char *input, char *token, int i, int *len)
|
763
812
|
{
|
813
|
+
char *next = NULL;
|
764
814
|
while (isurlc(input[i])) {
|
765
815
|
if (isurlpunc(input[i]) && isurlpunc(input[i - 1])) {
|
766
816
|
break; /* can't have two puncs in a row */
|
@@ -770,18 +820,26 @@ static int std_get_url(char *input, char *token, int i)
|
|
770
820
|
}
|
771
821
|
i++;
|
772
822
|
}
|
823
|
+
next = input + i;
|
824
|
+
|
825
|
+
/* We don't want to index past the end of the token capacity */
|
826
|
+
if (i >= MAX_WORD_SIZE) {
|
827
|
+
i = MAX_WORD_SIZE - 1;
|
828
|
+
}
|
773
829
|
|
774
830
|
/* strip trailing puncs */
|
775
831
|
while (isurlpunc(input[i - 1])) {
|
776
832
|
i--;
|
777
833
|
}
|
834
|
+
*len = i;
|
835
|
+
token[i] = '\0';
|
778
836
|
|
779
|
-
return
|
837
|
+
return next;
|
780
838
|
}
|
781
839
|
|
782
840
|
/* Company names can contain '@' and '&' like AT&T and Excite@Home. Let's scan letters, '@' and '&' together as one name.
|
783
841
|
*/
|
784
|
-
static int
|
842
|
+
static int legacy_std_get_company_name(char *input)
|
785
843
|
{
|
786
844
|
int i = 0;
|
787
845
|
while (isalpha(input[i]) || input[i] == '@' || input[i] == '&') {
|
@@ -791,25 +849,7 @@ static int std_get_company_name(char *input)
|
|
791
849
|
return i;
|
792
850
|
}
|
793
851
|
|
794
|
-
|
795
|
-
static int mb_std_get_company_name(char *input, TokenStream *ts)
|
796
|
-
{
|
797
|
-
char *t = input;
|
798
|
-
wchar_t wchr;
|
799
|
-
int i;
|
800
|
-
mbstate_t state; ZEROSET(&state, mbstate_t);
|
801
|
-
|
802
|
-
i = mb_next_char(&wchr, t, &state);
|
803
|
-
while (iswalpha(wchr) || wchr == L'@' || wchr == L'&') {
|
804
|
-
t += i;
|
805
|
-
i = mb_next_char(&wchr, t, &state);
|
806
|
-
}
|
807
|
-
|
808
|
-
return (int)(t - input);
|
809
|
-
}
|
810
|
-
*/
|
811
|
-
|
812
|
-
static bool std_advance_to_start(TokenStream *ts)
|
852
|
+
static bool legacy_std_advance_to_start(TokenStream *ts)
|
813
853
|
{
|
814
854
|
char *t = ts->t;
|
815
855
|
while (*t != '\0' && !isalnum(*t)) {
|
@@ -822,7 +862,7 @@ static bool std_advance_to_start(TokenStream *ts)
|
|
822
862
|
return (*t != '\0');
|
823
863
|
}
|
824
864
|
|
825
|
-
static bool
|
865
|
+
static bool mb_legacy_std_advance_to_start(TokenStream *ts)
|
826
866
|
{
|
827
867
|
int i;
|
828
868
|
wchar_t wchr;
|
@@ -839,9 +879,9 @@ static bool mb_std_advance_to_start(TokenStream *ts)
|
|
839
879
|
return (wchr != 0);
|
840
880
|
}
|
841
881
|
|
842
|
-
static Token *
|
882
|
+
static Token *legacy_std_next(TokenStream *ts)
|
843
883
|
{
|
844
|
-
|
884
|
+
LegacyStandardTokenizer *std_tz = LSTDTS(ts);
|
845
885
|
char *s;
|
846
886
|
char *t;
|
847
887
|
char *start = NULL;
|
@@ -890,13 +930,13 @@ static Token *std_next(TokenStream *ts)
|
|
890
930
|
}
|
891
931
|
|
892
932
|
if (*t == '&') { /* company name case, e.g. AT&T. */
|
893
|
-
t +=
|
933
|
+
t += legacy_std_get_company_name(t);
|
894
934
|
ts->t = t;
|
895
935
|
return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
896
936
|
}
|
897
937
|
|
898
|
-
if ((isdigit(*
|
899
|
-
&& (len =
|
938
|
+
if ((isdigit(*start) || isnumpunc(*start)) /* possibly a number */
|
939
|
+
&& ((len = legacy_std_get_number(start)) > 0)) {
|
900
940
|
num_end = start + len;
|
901
941
|
if (!std_tz->is_tok_char(num_end)) { /* won't find a longer token */
|
902
942
|
ts->t = num_end;
|
@@ -909,6 +949,7 @@ static Token *std_next(TokenStream *ts)
|
|
909
949
|
/* check for a known url start */
|
910
950
|
token[token_i] = '\0';
|
911
951
|
t += 3;
|
952
|
+
token_i += 3;
|
912
953
|
while (*t == '/') {
|
913
954
|
t++;
|
914
955
|
}
|
@@ -917,17 +958,16 @@ static Token *std_next(TokenStream *ts)
|
|
917
958
|
memcmp(token, "http", 4) == 0 ||
|
918
959
|
memcmp(token, "https", 5) == 0 ||
|
919
960
|
memcmp(token, "file", 4) == 0)) {
|
920
|
-
|
961
|
+
ts->t = std_get_url(t, token, 0, &len); /* dispose of first part of the URL */
|
921
962
|
}
|
922
963
|
else { /* still treat as url but keep the first part */
|
923
964
|
token_i = (int)(t - start);
|
924
965
|
memcpy(token, start, token_i * sizeof(char));
|
925
|
-
|
966
|
+
ts->t = std_get_url(start, token, token_i, &len); /* keep start */
|
926
967
|
}
|
927
|
-
ts->
|
928
|
-
|
929
|
-
|
930
|
-
(off_t)(ts->t - ts->text), 1);
|
968
|
+
return tk_set(&(CTS(ts)->token), token, len,
|
969
|
+
(off_t)(start - ts->text),
|
970
|
+
(off_t)(ts->t - ts->text), 1);
|
931
971
|
}
|
932
972
|
|
933
973
|
/* now see how long a url we can find. */
|
@@ -989,41 +1029,41 @@ static Token *std_next(TokenStream *ts)
|
|
989
1029
|
return &(CTS(ts)->token);
|
990
1030
|
}
|
991
1031
|
|
992
|
-
static TokenStream *
|
1032
|
+
static TokenStream *legacy_std_ts_clone_i(TokenStream *orig_ts)
|
993
1033
|
{
|
994
|
-
return ts_clone_size(orig_ts, sizeof(
|
1034
|
+
return ts_clone_size(orig_ts, sizeof(LegacyStandardTokenizer));
|
995
1035
|
}
|
996
1036
|
|
997
|
-
static TokenStream *
|
1037
|
+
static TokenStream *legacy_std_ts_new()
|
998
1038
|
{
|
999
|
-
TokenStream *ts = ts_new(
|
1039
|
+
TokenStream *ts = ts_new(LegacyStandardTokenizer);
|
1000
1040
|
|
1001
|
-
ts->clone_i = &
|
1002
|
-
ts->next = &
|
1041
|
+
ts->clone_i = &legacy_std_ts_clone_i;
|
1042
|
+
ts->next = &legacy_std_next;
|
1003
1043
|
|
1004
1044
|
return ts;
|
1005
1045
|
}
|
1006
1046
|
|
1007
|
-
TokenStream *
|
1047
|
+
TokenStream *legacy_standard_tokenizer_new()
|
1008
1048
|
{
|
1009
|
-
TokenStream *ts =
|
1049
|
+
TokenStream *ts = legacy_std_ts_new();
|
1010
1050
|
|
1011
|
-
|
1012
|
-
|
1013
|
-
|
1014
|
-
|
1051
|
+
LSTDTS(ts)->advance_to_start = &legacy_std_advance_to_start;
|
1052
|
+
LSTDTS(ts)->get_alpha = &legacy_std_get_alpha;
|
1053
|
+
LSTDTS(ts)->is_tok_char = &legacy_std_is_tok_char;
|
1054
|
+
LSTDTS(ts)->get_apostrophe = &legacy_std_get_apostrophe;
|
1015
1055
|
|
1016
1056
|
return ts;
|
1017
1057
|
}
|
1018
1058
|
|
1019
|
-
TokenStream *
|
1059
|
+
TokenStream *mb_legacy_standard_tokenizer_new()
|
1020
1060
|
{
|
1021
|
-
TokenStream *ts =
|
1061
|
+
TokenStream *ts = legacy_std_ts_new();
|
1022
1062
|
|
1023
|
-
|
1024
|
-
|
1025
|
-
|
1026
|
-
|
1063
|
+
LSTDTS(ts)->advance_to_start = &mb_legacy_std_advance_to_start;
|
1064
|
+
LSTDTS(ts)->get_alpha = &mb_legacy_std_get_alpha;
|
1065
|
+
LSTDTS(ts)->is_tok_char = &mb_legacy_std_is_tok_char;
|
1066
|
+
LSTDTS(ts)->get_apostrophe = &mb_legacy_std_get_apostrophe;
|
1027
1067
|
|
1028
1068
|
return ts;
|
1029
1069
|
}
|
@@ -1060,7 +1100,6 @@ static void filter_destroy_i(TokenStream *ts)
|
|
1060
1100
|
free(ts);
|
1061
1101
|
}
|
1062
1102
|
|
1063
|
-
#define tf_new(type, sub) tf_new_i(sizeof(type), sub)
|
1064
1103
|
TokenStream *tf_new_i(size_t size, TokenStream *sub_ts)
|
1065
1104
|
{
|
1066
1105
|
TokenStream *ts = (TokenStream *)ecalloc(size);
|
@@ -1097,7 +1136,7 @@ static TokenStream *sf_clone_i(TokenStream *orig_ts)
|
|
1097
1136
|
static Token *sf_next(TokenStream *ts)
|
1098
1137
|
{
|
1099
1138
|
int pos_inc = 0;
|
1100
|
-
|
1139
|
+
Hash *words = StopFilt(ts)->words;
|
1101
1140
|
TokenFilter *tf = TkFilt(ts);
|
1102
1141
|
Token *tk = tf->sub_ts->next(tf->sub_ts);
|
1103
1142
|
|
@@ -1118,7 +1157,7 @@ TokenStream *stop_filter_new_with_words_len(TokenStream *sub_ts,
|
|
1118
1157
|
{
|
1119
1158
|
int i;
|
1120
1159
|
char *word;
|
1121
|
-
|
1160
|
+
Hash *word_table = h_new_str(&free, (free_ft) NULL);
|
1122
1161
|
TokenStream *ts = tf_new(StopFilter, sub_ts);
|
1123
1162
|
|
1124
1163
|
for (i = 0; i < len; i++) {
|
@@ -1136,7 +1175,7 @@ TokenStream *stop_filter_new_with_words(TokenStream *sub_ts,
|
|
1136
1175
|
const char **words)
|
1137
1176
|
{
|
1138
1177
|
char *word;
|
1139
|
-
|
1178
|
+
Hash *word_table = h_new_str(&free, (free_ft) NULL);
|
1140
1179
|
TokenStream *ts = tf_new(StopFilter, sub_ts);
|
1141
1180
|
|
1142
1181
|
while (*words) {
|
@@ -1234,7 +1273,7 @@ static Token *hf_next(TokenStream *ts)
|
|
1234
1273
|
HyphenFilter *hf = HyphenFilt(ts);
|
1235
1274
|
TokenFilter *tf = TkFilt(ts);
|
1236
1275
|
Token *tk = hf->tk;
|
1237
|
-
|
1276
|
+
|
1238
1277
|
if (hf->pos < hf->len) {
|
1239
1278
|
const int pos = hf->pos;
|
1240
1279
|
const int text_len = strlen(hf->text + pos);
|
@@ -1301,7 +1340,7 @@ TokenStream *hyphen_filter_new(TokenStream *sub_ts)
|
|
1301
1340
|
****************************************************************************/
|
1302
1341
|
|
1303
1342
|
|
1304
|
-
Token *mb_lcf_next(TokenStream *ts)
|
1343
|
+
static Token *mb_lcf_next(TokenStream *ts)
|
1305
1344
|
{
|
1306
1345
|
wchar_t wbuf[MAX_WORD_SIZE + 1], *wchr;
|
1307
1346
|
Token *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
|
@@ -1334,7 +1373,7 @@ TokenStream *mb_lowercase_filter_new(TokenStream *sub_ts)
|
|
1334
1373
|
return ts;
|
1335
1374
|
}
|
1336
1375
|
|
1337
|
-
Token *lcf_next(TokenStream *ts)
|
1376
|
+
static Token *lcf_next(TokenStream *ts)
|
1338
1377
|
{
|
1339
1378
|
int i = 0;
|
1340
1379
|
Token *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
|
@@ -1361,7 +1400,7 @@ TokenStream *lowercase_filter_new(TokenStream *sub_ts)
|
|
1361
1400
|
|
1362
1401
|
#define StemFilt(filter) ((StemFilter *)(filter))
|
1363
1402
|
|
1364
|
-
void stemf_destroy_i(TokenStream *ts)
|
1403
|
+
static void stemf_destroy_i(TokenStream *ts)
|
1365
1404
|
{
|
1366
1405
|
sb_stemmer_delete(StemFilt(ts)->stemmer);
|
1367
1406
|
free(StemFilt(ts)->algorithm);
|
@@ -1369,7 +1408,7 @@ void stemf_destroy_i(TokenStream *ts)
|
|
1369
1408
|
filter_destroy_i(ts);
|
1370
1409
|
}
|
1371
1410
|
|
1372
|
-
Token *stemf_next(TokenStream *ts)
|
1411
|
+
static Token *stemf_next(TokenStream *ts)
|
1373
1412
|
{
|
1374
1413
|
int len;
|
1375
1414
|
const sb_symbol *stemmed;
|
@@ -1391,7 +1430,7 @@ Token *stemf_next(TokenStream *ts)
|
|
1391
1430
|
return tk;
|
1392
1431
|
}
|
1393
1432
|
|
1394
|
-
TokenStream *stemf_clone_i(TokenStream *orig_ts)
|
1433
|
+
static TokenStream *stemf_clone_i(TokenStream *orig_ts)
|
1395
1434
|
{
|
1396
1435
|
TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(StemFilter));
|
1397
1436
|
StemFilter *stemf = StemFilt(new_ts);
|
@@ -1409,10 +1448,35 @@ TokenStream *stem_filter_new(TokenStream *ts, const char *algorithm,
|
|
1409
1448
|
const char *charenc)
|
1410
1449
|
{
|
1411
1450
|
TokenStream *tf = tf_new(StemFilter, ts);
|
1451
|
+
char *my_algorithm = NULL;
|
1452
|
+
char *my_charenc = NULL;
|
1453
|
+
char *s = NULL;
|
1454
|
+
|
1455
|
+
if (algorithm) {
|
1456
|
+
my_algorithm = estrdup(algorithm);
|
1457
|
+
|
1458
|
+
/* algorithms are lowercase */
|
1459
|
+
s = my_algorithm;
|
1460
|
+
while (*s) {
|
1461
|
+
*s = tolower(*s);
|
1462
|
+
s++;
|
1463
|
+
}
|
1464
|
+
StemFilt(tf)->algorithm = my_algorithm;
|
1465
|
+
}
|
1466
|
+
|
1467
|
+
if (charenc) {
|
1468
|
+
my_charenc = estrdup(charenc);
|
1412
1469
|
|
1413
|
-
|
1414
|
-
|
1415
|
-
|
1470
|
+
/* encodings are uppercase and use '_' instead of '-' */
|
1471
|
+
s = my_charenc;
|
1472
|
+
while (*s) {
|
1473
|
+
*s = (*s == '-') ? '_' : toupper(*s);
|
1474
|
+
s++;
|
1475
|
+
}
|
1476
|
+
StemFilt(tf)->charenc = my_charenc;
|
1477
|
+
}
|
1478
|
+
|
1479
|
+
StemFilt(tf)->stemmer = sb_stemmer_new(my_algorithm, my_charenc);
|
1416
1480
|
|
1417
1481
|
tf->next = &stemf_next;
|
1418
1482
|
tf->destroy_i = &stemf_destroy_i;
|
@@ -1474,6 +1538,28 @@ Analyzer *mb_standard_analyzer_new_with_words(const char **words,
|
|
1474
1538
|
return analyzer_new(ts, NULL, NULL);
|
1475
1539
|
}
|
1476
1540
|
|
1541
|
+
Analyzer *utf8_standard_analyzer_new_with_words_len(const char **words,
|
1542
|
+
int len, bool lowercase)
|
1543
|
+
{
|
1544
|
+
TokenStream *ts = utf8_standard_tokenizer_new();
|
1545
|
+
if (lowercase) {
|
1546
|
+
ts = mb_lowercase_filter_new(ts);
|
1547
|
+
}
|
1548
|
+
ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
|
1549
|
+
return analyzer_new(ts, NULL, NULL);
|
1550
|
+
}
|
1551
|
+
|
1552
|
+
Analyzer *utf8_standard_analyzer_new_with_words(const char **words,
|
1553
|
+
bool lowercase)
|
1554
|
+
{
|
1555
|
+
TokenStream *ts = utf8_standard_tokenizer_new();
|
1556
|
+
if (lowercase) {
|
1557
|
+
ts = mb_lowercase_filter_new(ts);
|
1558
|
+
}
|
1559
|
+
ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
|
1560
|
+
return analyzer_new(ts, NULL, NULL);
|
1561
|
+
}
|
1562
|
+
|
1477
1563
|
Analyzer *standard_analyzer_new(bool lowercase)
|
1478
1564
|
{
|
1479
1565
|
return standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
|
@@ -1486,14 +1572,79 @@ Analyzer *mb_standard_analyzer_new(bool lowercase)
|
|
1486
1572
|
lowercase);
|
1487
1573
|
}
|
1488
1574
|
|
1575
|
+
Analyzer *utf8_standard_analyzer_new(bool lowercase)
|
1576
|
+
{
|
1577
|
+
return utf8_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
|
1578
|
+
lowercase);
|
1579
|
+
}
|
1580
|
+
|
1581
|
+
/****************************************************************************
|
1582
|
+
* Legacy
|
1583
|
+
****************************************************************************/
|
1584
|
+
|
1585
|
+
Analyzer *legacy_standard_analyzer_new_with_words_len(const char **words, int len,
|
1586
|
+
bool lowercase)
|
1587
|
+
{
|
1588
|
+
TokenStream *ts = legacy_standard_tokenizer_new();
|
1589
|
+
if (lowercase) {
|
1590
|
+
ts = lowercase_filter_new(ts);
|
1591
|
+
}
|
1592
|
+
ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
|
1593
|
+
return analyzer_new(ts, NULL, NULL);
|
1594
|
+
}
|
1595
|
+
|
1596
|
+
Analyzer *legacy_standard_analyzer_new_with_words(const char **words,
|
1597
|
+
bool lowercase)
|
1598
|
+
{
|
1599
|
+
TokenStream *ts = legacy_standard_tokenizer_new();
|
1600
|
+
if (lowercase) {
|
1601
|
+
ts = lowercase_filter_new(ts);
|
1602
|
+
}
|
1603
|
+
ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
|
1604
|
+
return analyzer_new(ts, NULL, NULL);
|
1605
|
+
}
|
1606
|
+
|
1607
|
+
Analyzer *mb_legacy_standard_analyzer_new_with_words_len(const char **words,
|
1608
|
+
int len, bool lowercase)
|
1609
|
+
{
|
1610
|
+
TokenStream *ts = mb_legacy_standard_tokenizer_new();
|
1611
|
+
if (lowercase) {
|
1612
|
+
ts = mb_lowercase_filter_new(ts);
|
1613
|
+
}
|
1614
|
+
ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
|
1615
|
+
return analyzer_new(ts, NULL, NULL);
|
1616
|
+
}
|
1617
|
+
|
1618
|
+
Analyzer *mb_legacy_standard_analyzer_new_with_words(const char **words,
|
1619
|
+
bool lowercase)
|
1620
|
+
{
|
1621
|
+
TokenStream *ts = mb_legacy_standard_tokenizer_new();
|
1622
|
+
if (lowercase) {
|
1623
|
+
ts = mb_lowercase_filter_new(ts);
|
1624
|
+
}
|
1625
|
+
ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
|
1626
|
+
return analyzer_new(ts, NULL, NULL);
|
1627
|
+
}
|
1628
|
+
|
1629
|
+
Analyzer *legacy_standard_analyzer_new(bool lowercase)
|
1630
|
+
{
|
1631
|
+
return legacy_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
|
1632
|
+
lowercase);
|
1633
|
+
}
|
1634
|
+
|
1635
|
+
Analyzer *mb_legacy_standard_analyzer_new(bool lowercase)
|
1636
|
+
{
|
1637
|
+
return mb_legacy_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
|
1638
|
+
lowercase);
|
1639
|
+
}
|
1640
|
+
|
1489
1641
|
/****************************************************************************
|
1490
1642
|
*
|
1491
1643
|
* PerFieldAnalyzer
|
1492
1644
|
*
|
1493
1645
|
****************************************************************************/
|
1494
1646
|
|
1495
|
-
|
1496
|
-
void pfa_destroy_i(Analyzer *self)
|
1647
|
+
static void pfa_destroy_i(Analyzer *self)
|
1497
1648
|
{
|
1498
1649
|
h_destroy(PFA(self)->dict);
|
1499
1650
|
|
@@ -1501,24 +1652,27 @@ void pfa_destroy_i(Analyzer *self)
|
|
1501
1652
|
free(self);
|
1502
1653
|
}
|
1503
1654
|
|
1504
|
-
TokenStream *pfa_get_ts(Analyzer *self,
|
1655
|
+
static TokenStream *pfa_get_ts(Analyzer *self,
|
1656
|
+
Symbol field, char *text)
|
1505
1657
|
{
|
1506
|
-
Analyzer *a = h_get(PFA(self)->dict, field);
|
1658
|
+
Analyzer *a = (Analyzer *)h_get(PFA(self)->dict, field);
|
1507
1659
|
if (a == NULL) {
|
1508
1660
|
a = PFA(self)->default_a;
|
1509
1661
|
}
|
1510
1662
|
return a_get_ts(a, field, text);
|
1511
1663
|
}
|
1512
1664
|
|
1513
|
-
void pfa_sub_a_destroy_i(void *p)
|
1665
|
+
static void pfa_sub_a_destroy_i(void *p)
|
1514
1666
|
{
|
1515
1667
|
Analyzer *a = (Analyzer *) p;
|
1516
1668
|
a_deref(a);
|
1517
1669
|
}
|
1518
1670
|
|
1519
|
-
void pfa_add_field(Analyzer *self,
|
1671
|
+
void pfa_add_field(Analyzer *self,
|
1672
|
+
Symbol field,
|
1673
|
+
Analyzer *analyzer)
|
1520
1674
|
{
|
1521
|
-
h_set(PFA(self)->dict,
|
1675
|
+
h_set(PFA(self)->dict, field, analyzer);
|
1522
1676
|
}
|
1523
1677
|
|
1524
1678
|
Analyzer *per_field_analyzer_new(Analyzer *default_a)
|
@@ -1526,22 +1680,23 @@ Analyzer *per_field_analyzer_new(Analyzer *default_a)
|
|
1526
1680
|
Analyzer *a = (Analyzer *)ecalloc(sizeof(PerFieldAnalyzer));
|
1527
1681
|
|
1528
1682
|
PFA(a)->default_a = default_a;
|
1529
|
-
PFA(a)->dict =
|
1683
|
+
PFA(a)->dict = h_new_ptr(&pfa_sub_a_destroy_i);
|
1530
1684
|
|
1531
1685
|
a->destroy_i = &pfa_destroy_i;
|
1532
1686
|
a->get_ts = pfa_get_ts;
|
1533
1687
|
a->ref_cnt = 1;
|
1534
|
-
|
1688
|
+
|
1535
1689
|
return a;
|
1536
1690
|
}
|
1537
1691
|
|
1538
|
-
#ifdef
|
1692
|
+
#ifdef TOKENIZE
|
1539
1693
|
int main(int argc, char **argv)
|
1540
1694
|
{
|
1541
1695
|
char buf[10000];
|
1542
1696
|
Analyzer *a = standard_analyzer_new(true);
|
1543
1697
|
TokenStream *ts;
|
1544
1698
|
Token *tk;
|
1699
|
+
(void)argc; (void)argv;
|
1545
1700
|
while (fgets(buf, 9999, stdin) != NULL) {
|
1546
1701
|
ts = a_get_ts(a, "hello", buf);
|
1547
1702
|
while ((tk = ts->next(ts)) != NULL) {
|