jk-ferret 0.11.8.2

Files changed (228)
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +90 -0
  4. data/RELEASE_CHANGES +137 -0
  5. data/RELEASE_NOTES +60 -0
  6. data/Rakefile +443 -0
  7. data/TODO +109 -0
  8. data/TUTORIAL +231 -0
  9. data/bin/ferret-browser +79 -0
  10. data/ext/BZLIB_blocksort.c +1094 -0
  11. data/ext/BZLIB_bzlib.c +1578 -0
  12. data/ext/BZLIB_compress.c +672 -0
  13. data/ext/BZLIB_crctable.c +104 -0
  14. data/ext/BZLIB_decompress.c +626 -0
  15. data/ext/BZLIB_huffman.c +205 -0
  16. data/ext/BZLIB_randtable.c +84 -0
  17. data/ext/STEMMER_api.c +66 -0
  18. data/ext/STEMMER_libstemmer.c +93 -0
  19. data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
  20. data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
  21. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  22. data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
  25. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  26. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  27. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
  29. data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
  30. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  31. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  32. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  33. data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
  34. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  35. data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
  36. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  37. data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
  38. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  39. data/ext/STEMMER_stem_UTF_8_german.c +509 -0
  40. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  41. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  42. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  43. data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
  44. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  45. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  46. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  47. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  48. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  49. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  50. data/ext/STEMMER_utilities.c +478 -0
  51. data/ext/analysis.c +1710 -0
  52. data/ext/analysis.h +266 -0
  53. data/ext/api.h +26 -0
  54. data/ext/array.c +125 -0
  55. data/ext/array.h +62 -0
  56. data/ext/bitvector.c +96 -0
  57. data/ext/bitvector.h +594 -0
  58. data/ext/bzlib.h +282 -0
  59. data/ext/bzlib_private.h +503 -0
  60. data/ext/compound_io.c +384 -0
  61. data/ext/config.h +52 -0
  62. data/ext/document.c +159 -0
  63. data/ext/document.h +63 -0
  64. data/ext/except.c +102 -0
  65. data/ext/except.h +176 -0
  66. data/ext/extconf.rb +15 -0
  67. data/ext/ferret.c +416 -0
  68. data/ext/ferret.h +94 -0
  69. data/ext/field_index.c +262 -0
  70. data/ext/field_index.h +52 -0
  71. data/ext/filter.c +157 -0
  72. data/ext/fs_store.c +493 -0
  73. data/ext/global.c +458 -0
  74. data/ext/global.h +302 -0
  75. data/ext/hash.c +524 -0
  76. data/ext/hash.h +515 -0
  77. data/ext/hashset.c +192 -0
  78. data/ext/hashset.h +215 -0
  79. data/ext/header.h +58 -0
  80. data/ext/helper.c +63 -0
  81. data/ext/helper.h +21 -0
  82. data/ext/index.c +6804 -0
  83. data/ext/index.h +935 -0
  84. data/ext/internal.h +1019 -0
  85. data/ext/lang.c +10 -0
  86. data/ext/lang.h +68 -0
  87. data/ext/libstemmer.h +79 -0
  88. data/ext/mempool.c +88 -0
  89. data/ext/mempool.h +43 -0
  90. data/ext/modules.h +190 -0
  91. data/ext/multimapper.c +351 -0
  92. data/ext/multimapper.h +60 -0
  93. data/ext/posh.c +1006 -0
  94. data/ext/posh.h +973 -0
  95. data/ext/priorityqueue.c +149 -0
  96. data/ext/priorityqueue.h +155 -0
  97. data/ext/q_boolean.c +1621 -0
  98. data/ext/q_const_score.c +162 -0
  99. data/ext/q_filtered_query.c +212 -0
  100. data/ext/q_fuzzy.c +280 -0
  101. data/ext/q_match_all.c +149 -0
  102. data/ext/q_multi_term.c +673 -0
  103. data/ext/q_parser.c +3103 -0
  104. data/ext/q_phrase.c +1206 -0
  105. data/ext/q_prefix.c +98 -0
  106. data/ext/q_range.c +682 -0
  107. data/ext/q_span.c +2390 -0
  108. data/ext/q_term.c +337 -0
  109. data/ext/q_wildcard.c +167 -0
  110. data/ext/r_analysis.c +2626 -0
  111. data/ext/r_index.c +3468 -0
  112. data/ext/r_qparser.c +635 -0
  113. data/ext/r_search.c +4490 -0
  114. data/ext/r_store.c +513 -0
  115. data/ext/r_utils.c +1131 -0
  116. data/ext/ram_store.c +476 -0
  117. data/ext/scanner.c +895 -0
  118. data/ext/scanner.h +36 -0
  119. data/ext/scanner_mb.c +6701 -0
  120. data/ext/scanner_utf8.c +4415 -0
  121. data/ext/search.c +1864 -0
  122. data/ext/search.h +953 -0
  123. data/ext/similarity.c +151 -0
  124. data/ext/similarity.h +89 -0
  125. data/ext/sort.c +786 -0
  126. data/ext/stem_ISO_8859_1_danish.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  128. data/ext/stem_ISO_8859_1_english.h +16 -0
  129. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  130. data/ext/stem_ISO_8859_1_french.h +16 -0
  131. data/ext/stem_ISO_8859_1_german.h +16 -0
  132. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  133. data/ext/stem_ISO_8859_1_italian.h +16 -0
  134. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  135. data/ext/stem_ISO_8859_1_porter.h +16 -0
  136. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  137. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  138. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  139. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  140. data/ext/stem_KOI8_R_russian.h +16 -0
  141. data/ext/stem_UTF_8_danish.h +16 -0
  142. data/ext/stem_UTF_8_dutch.h +16 -0
  143. data/ext/stem_UTF_8_english.h +16 -0
  144. data/ext/stem_UTF_8_finnish.h +16 -0
  145. data/ext/stem_UTF_8_french.h +16 -0
  146. data/ext/stem_UTF_8_german.h +16 -0
  147. data/ext/stem_UTF_8_hungarian.h +16 -0
  148. data/ext/stem_UTF_8_italian.h +16 -0
  149. data/ext/stem_UTF_8_norwegian.h +16 -0
  150. data/ext/stem_UTF_8_porter.h +16 -0
  151. data/ext/stem_UTF_8_portuguese.h +16 -0
  152. data/ext/stem_UTF_8_romanian.h +16 -0
  153. data/ext/stem_UTF_8_russian.h +16 -0
  154. data/ext/stem_UTF_8_spanish.h +16 -0
  155. data/ext/stem_UTF_8_swedish.h +16 -0
  156. data/ext/stem_UTF_8_turkish.h +16 -0
  157. data/ext/stopwords.c +410 -0
  158. data/ext/store.c +698 -0
  159. data/ext/store.h +799 -0
  160. data/ext/symbol.c +10 -0
  161. data/ext/symbol.h +23 -0
  162. data/ext/term_vectors.c +73 -0
  163. data/ext/threading.h +31 -0
  164. data/ext/win32.h +62 -0
  165. data/lib/ferret.rb +30 -0
  166. data/lib/ferret/browser.rb +246 -0
  167. data/lib/ferret/browser/s/global.js +192 -0
  168. data/lib/ferret/browser/s/style.css +148 -0
  169. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  170. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  171. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  172. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  173. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  174. data/lib/ferret/browser/views/layout.rhtml +22 -0
  175. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  176. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  177. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  178. data/lib/ferret/browser/webrick.rb +14 -0
  179. data/lib/ferret/document.rb +130 -0
  180. data/lib/ferret/field_infos.rb +44 -0
  181. data/lib/ferret/field_symbol.rb +87 -0
  182. data/lib/ferret/index.rb +973 -0
  183. data/lib/ferret/number_tools.rb +157 -0
  184. data/lib/ferret/version.rb +3 -0
  185. data/setup.rb +1555 -0
  186. data/test/long_running/largefile/tc_largefile.rb +46 -0
  187. data/test/test_all.rb +5 -0
  188. data/test/test_helper.rb +29 -0
  189. data/test/test_installed.rb +1 -0
  190. data/test/threading/number_to_spoken.rb +132 -0
  191. data/test/threading/thread_safety_index_test.rb +88 -0
  192. data/test/threading/thread_safety_read_write_test.rb +73 -0
  193. data/test/threading/thread_safety_test.rb +133 -0
  194. data/test/unit/analysis/tc_analyzer.rb +550 -0
  195. data/test/unit/analysis/tc_token_stream.rb +653 -0
  196. data/test/unit/index/tc_index.rb +867 -0
  197. data/test/unit/index/tc_index_reader.rb +699 -0
  198. data/test/unit/index/tc_index_writer.rb +447 -0
  199. data/test/unit/index/th_doc.rb +332 -0
  200. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  201. data/test/unit/search/tc_filter.rb +156 -0
  202. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  203. data/test/unit/search/tc_index_searcher.rb +67 -0
  204. data/test/unit/search/tc_multi_searcher.rb +128 -0
  205. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  206. data/test/unit/search/tc_search_and_sort.rb +179 -0
  207. data/test/unit/search/tc_sort.rb +49 -0
  208. data/test/unit/search/tc_sort_field.rb +27 -0
  209. data/test/unit/search/tc_spans.rb +190 -0
  210. data/test/unit/search/tm_searcher.rb +436 -0
  211. data/test/unit/store/tc_fs_store.rb +115 -0
  212. data/test/unit/store/tc_ram_store.rb +35 -0
  213. data/test/unit/store/tm_store.rb +34 -0
  214. data/test/unit/store/tm_store_lock.rb +68 -0
  215. data/test/unit/tc_document.rb +81 -0
  216. data/test/unit/tc_field_symbol.rb +26 -0
  217. data/test/unit/ts_analysis.rb +2 -0
  218. data/test/unit/ts_index.rb +2 -0
  219. data/test/unit/ts_largefile.rb +4 -0
  220. data/test/unit/ts_query_parser.rb +2 -0
  221. data/test/unit/ts_search.rb +2 -0
  222. data/test/unit/ts_store.rb +2 -0
  223. data/test/unit/ts_utils.rb +2 -0
  224. data/test/unit/utils/tc_bit_vector.rb +295 -0
  225. data/test/unit/utils/tc_number_tools.rb +117 -0
  226. data/test/unit/utils/tc_priority_queue.rb +106 -0
  227. data/test/utils/content_generator.rb +226 -0
  228. metadata +319 -0
data/ext/analysis.c ADDED
@@ -0,0 +1,1710 @@
+ #include "analysis.h"
+ #include "hash.h"
+ #include "libstemmer.h"
+ #include <string.h>
+ #include <ctype.h>
+ #include <wctype.h>
+ #include <wchar.h>
+ #include "internal.h"
+ #include "scanner.h"
+
+ /****************************************************************************
+  *
+  * Token
+  *
+  ****************************************************************************/
+
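+ /* A Token carries the matched text (truncated to at most MAX_WORD_SIZE - 1
+  * bytes), its byte length, start/end byte offsets into the source text and
+  * a position increment relative to the previous token. */
+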
+ INLINE Token *tk_set(Token *tk,
+                      char *text, int tlen, off_t start, off_t end, int pos_inc)
+ {
+     if (tlen >= MAX_WORD_SIZE) {
+         tlen = MAX_WORD_SIZE - 1;
+     }
+     memcpy(tk->text, text, sizeof(char) * tlen);
+     tk->text[tlen] = '\0';
+     tk->len = tlen;
+     tk->start = start;
+     tk->end = end;
+     tk->pos_inc = pos_inc;
+     return tk;
+ }
+
+ static INLINE Token *tk_set_ts(Token *tk, char *start, char *end,
+                                char *text, int pos_inc)
+ {
+     return tk_set(tk, start, (int)(end - start),
+                   (off_t)(start - text), (off_t)(end - text), pos_inc);
+ }
+
+ INLINE Token *tk_set_no_len(Token *tk,
+                             char *text, off_t start, off_t end, int pos_inc)
+ {
+     return tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
+ }
+
+ static INLINE Token *w_tk_set(Token *tk, wchar_t *text, off_t start,
+                               off_t end, int pos_inc)
+ {
+     int len = wcstombs(tk->text, text, MAX_WORD_SIZE - 1);
+     tk->text[len] = '\0';
+     tk->len = len;
+     tk->start = start;
+     tk->end = end;
+     tk->pos_inc = pos_inc;
+     return tk;
+ }
+
+ int tk_eq(Token *tk1, Token *tk2)
+ {
+     return (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
+             tk1->start == tk2->start && tk1->end == tk2->end &&
+             tk1->pos_inc == tk2->pos_inc);
+ }
+
+ int tk_cmp(Token *tk1, Token *tk2)
+ {
+     int cmp;
+     if (tk1->start > tk2->start) {
+         cmp = 1;
+     }
+     else if (tk1->start < tk2->start) {
+         cmp = -1;
+     }
+     else {
+         if (tk1->end > tk2->end) {
+             cmp = 1;
+         }
+         else if (tk1->end < tk2->end) {
+             cmp = -1;
+         }
+         else {
+             cmp = strcmp((char *)tk1->text, (char *)tk2->text);
+         }
+     }
+     return cmp;
+ }
+
+ void tk_destroy(void *p)
+ {
+     free(p);
+ }
+
+ Token *tk_new()
+ {
+     return ALLOC(Token);
+ }
+
+ /****************************************************************************
+  *
+  * TokenStream
+  *
+  ****************************************************************************/
+
+ void ts_deref(TokenStream *ts)
+ {
+     if (--ts->ref_cnt <= 0) {
+         ts->destroy_i(ts);
+     }
+ }
+
+ static TokenStream *ts_reset(TokenStream *ts, char *text)
+ {
+     ts->t = ts->text = text;
+     return ts;
+ }
+
+ TokenStream *ts_clone_size(TokenStream *orig_ts, size_t size)
+ {
+     TokenStream *ts = (TokenStream *)ecalloc(size);
+     memcpy(ts, orig_ts, size);
+     ts->ref_cnt = 1;
+     return ts;
+ }
+
+ TokenStream *ts_new_i(size_t size)
+ {
+     TokenStream *ts = (TokenStream *)ecalloc(size);
+
+     ts->destroy_i = (void (*)(TokenStream *))&free;
+     ts->reset = &ts_reset;
+     ts->ref_cnt = 1;
+
+     return ts;
+ }
+
+ /****************************************************************************
+  * CachedTokenStream
+  ****************************************************************************/
+
+ #define CTS(token_stream) ((CachedTokenStream *)(token_stream))
+
+ static TokenStream *cts_clone_i(TokenStream *orig_ts)
+ {
+     return ts_clone_size(orig_ts, sizeof(CachedTokenStream));
+ }
+
+ static TokenStream *cts_new()
+ {
+     TokenStream *ts = ts_new(CachedTokenStream);
+     ts->clone_i = &cts_clone_i;
+     return ts;
+ }
+
+ /* * Multi-byte TokenStream * */
+
+ #define MBTS(token_stream) ((MultiByteTokenStream *)(token_stream))
+
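+ /* Decode the next multi-byte character at s into *wchr and return the
+  * number of bytes to advance. On a decode error, scan forward one byte at
+  * a time until a valid character (or the end of the input) is found. */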
+ static INLINE int mb_next_char(wchar_t *wchr, const char *s, mbstate_t *state)
+ {
+     int num_bytes;
+     if ((num_bytes = (int)mbrtowc(wchr, s, MB_CUR_MAX, state)) < 0) {
+         const char *t = s;
+         do {
+             t++;
+             ZEROSET(state, mbstate_t);
+             num_bytes = (int)mbrtowc(wchr, t, MB_CUR_MAX, state);
+         } while ((num_bytes < 0) && (*t != 0));
+         num_bytes = t - s;
+         if (*t == 0) *wchr = 0;
+     }
+     return num_bytes;
+ }
+
+ static TokenStream *mb_ts_reset(TokenStream *ts, char *text)
+ {
+     ZEROSET(&(MBTS(ts)->state), mbstate_t);
+     ts_reset(ts, text);
+     return ts;
+ }
+
+ static TokenStream *mb_ts_clone_i(TokenStream *orig_ts)
+ {
+     return ts_clone_size(orig_ts, sizeof(MultiByteTokenStream));
+ }
+
+ static TokenStream *mb_ts_new()
+ {
+     TokenStream *ts = ts_new(MultiByteTokenStream);
+     ts->reset = &mb_ts_reset;
+     ts->clone_i = &mb_ts_clone_i;
+     ts->ref_cnt = 1;
+     return ts;
+ }
+
+ /****************************************************************************
+  *
+  * Analyzer
+  *
+  ****************************************************************************/
+
+ void a_deref(Analyzer *a)
+ {
+     if (--a->ref_cnt <= 0) {
+         a->destroy_i(a);
+     }
+ }
+
+ static void a_standard_destroy_i(Analyzer *a)
+ {
+     if (a->current_ts) {
+         ts_deref(a->current_ts);
+     }
+     free(a);
+ }
+
+ static TokenStream *a_standard_get_ts(Analyzer *a,
+                                       Symbol field,
+                                       char *text)
+ {
+     TokenStream *ts;
+     (void)field;
+     ts = ts_clone(a->current_ts);
+     return ts->reset(ts, text);
+ }
+
+ Analyzer *analyzer_new(TokenStream *ts,
+                        void (*destroy_i)(Analyzer *a),
+                        TokenStream *(*get_ts)(Analyzer *a,
+                                               Symbol field,
+                                               char *text))
+ {
+     Analyzer *a = ALLOC(Analyzer);
+     a->current_ts = ts;
+     a->destroy_i = (destroy_i ? destroy_i : &a_standard_destroy_i);
+     a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
+     a->ref_cnt = 1;
+     return a;
+ }
+
+ /****************************************************************************
+  *
+  * Non
+  *
+  ****************************************************************************/
+
+ /*
+  * NonTokenizer
+  */
+ static Token *nt_next(TokenStream *ts)
+ {
+     if (ts->t) {
+         size_t len = strlen(ts->t);
+         ts->t = NULL;
+
+         return tk_set(&(CTS(ts)->token), ts->text, len, 0, len, 1);
+     }
+     else {
+         return NULL;
+     }
+ }
+
+ TokenStream *non_tokenizer_new()
+ {
+     TokenStream *ts = cts_new();
+     ts->next = &nt_next;
+     return ts;
+ }
+
+ /*
+  * NonAnalyzer
+  */
+ Analyzer *non_analyzer_new()
+ {
+     return analyzer_new(non_tokenizer_new(), NULL, NULL);
+ }
+
+ /****************************************************************************
+  *
+  * Whitespace
+  *
+  ****************************************************************************/
+
+ /*
+  * WhitespaceTokenizer
+  */
+ static Token *wst_next(TokenStream *ts)
+ {
+     char *t = ts->t;
+     char *start;
+
+     while (*t != '\0' && isspace(*t)) {
+         t++;
+     }
+
+     if (*t == '\0') {
+         return NULL;
+     }
+
+     start = t;
+     while (*t != '\0' && !isspace(*t)) {
+         t++;
+     }
+
+     ts->t = t;
+     return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+ }
+
+ TokenStream *whitespace_tokenizer_new()
+ {
+     TokenStream *ts = cts_new();
+     ts->next = &wst_next;
+     return ts;
+ }
+
+ /*
+  * Multi-byte WhitespaceTokenizer
+  */
+ static Token *mb_wst_next(TokenStream *ts)
+ {
+     int i;
+     char *start;
+     char *t = ts->t;
+     wchar_t wchr;
+     mbstate_t *state = &(MBTS(ts)->state);
+
+     i = mb_next_char(&wchr, t, state);
+     while (wchr != 0 && iswspace(wchr)) {
+         t += i;
+         i = mb_next_char(&wchr, t, state);
+     }
+     if (wchr == 0) {
+         return NULL;
+     }
+
+     start = t;
+     t += i;
+     i = mb_next_char(&wchr, t, state);
+     while (wchr != 0 && !iswspace(wchr)) {
+         t += i;
+         i = mb_next_char(&wchr, t, state);
+     }
+     ts->t = t;
+     return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+ }
+
+ /*
+  * Lowercasing Multi-byte WhitespaceTokenizer
+  */
+ static Token *mb_wst_next_lc(TokenStream *ts)
+ {
+     int i;
+     char *start;
+     char *t = ts->t;
+     wchar_t wchr;
+     wchar_t wbuf[MAX_WORD_SIZE + 1], *w, *w_end;
+     mbstate_t *state = &(MBTS(ts)->state);
+
+     w = wbuf;
+     w_end = &wbuf[MAX_WORD_SIZE];
+
+     i = mb_next_char(&wchr, t, state);
+     while (wchr != 0 && iswspace(wchr)) {
+         t += i;
+         i = mb_next_char(&wchr, t, state);
+     }
+     if (wchr == 0) {
+         return NULL;
+     }
+
+     start = t;
+     t += i;
+     *w++ = towlower(wchr);
+     i = mb_next_char(&wchr, t, state);
+     while (wchr != 0 && !iswspace(wchr)) {
+         if (w < w_end) {
+             *w++ = towlower(wchr);
+         }
+         t += i;
+         i = mb_next_char(&wchr, t, state);
+     }
+     *w = 0;
+     ts->t = t;
+     return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
+                     (off_t)(t - ts->text), 1);
+ }
+
+ TokenStream *mb_whitespace_tokenizer_new(bool lowercase)
+ {
+     TokenStream *ts = mb_ts_new();
+     ts->next = lowercase ? &mb_wst_next_lc : &mb_wst_next;
+     return ts;
+ }
+
+ /*
+  * WhitespaceAnalyzers
+  */
+ Analyzer *whitespace_analyzer_new(bool lowercase)
+ {
+     TokenStream *ts;
+     if (lowercase) {
+         ts = lowercase_filter_new(whitespace_tokenizer_new());
+     }
+     else {
+         ts = whitespace_tokenizer_new();
+     }
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *mb_whitespace_analyzer_new(bool lowercase)
+ {
+     return analyzer_new(mb_whitespace_tokenizer_new(lowercase), NULL, NULL);
+ }
+
+ /****************************************************************************
+  *
+  * Letter
+  *
+  ****************************************************************************/
+
+ /*
+  * LetterTokenizer
+  */
+ static Token *lt_next(TokenStream *ts)
+ {
+     char *start;
+     char *t = ts->t;
+
+     while (*t != '\0' && !isalpha(*t)) {
+         t++;
+     }
+
+     if (*t == '\0') {
+         return NULL;
+     }
+
+     start = t;
+     while (*t != '\0' && isalpha(*t)) {
+         t++;
+     }
+
+     ts->t = t;
+     return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+ }
+
+ TokenStream *letter_tokenizer_new()
+ {
+     TokenStream *ts = cts_new();
+     ts->next = &lt_next;
+     return ts;
+ }
+
+ /*
+  * Multi-byte LetterTokenizer
+  */
+ static Token *mb_lt_next(TokenStream *ts)
+ {
+     int i;
+     char *start;
+     char *t = ts->t;
+     wchar_t wchr;
+     mbstate_t *state = &(MBTS(ts)->state);
+
+     i = mb_next_char(&wchr, t, state);
+     while (wchr != 0 && !iswalpha(wchr)) {
+         t += i;
+         i = mb_next_char(&wchr, t, state);
+     }
+
+     if (wchr == 0) {
+         return NULL;
+     }
+
+     start = t;
+     t += i;
+     i = mb_next_char(&wchr, t, state);
+     while (wchr != 0 && iswalpha(wchr)) {
+         t += i;
+         i = mb_next_char(&wchr, t, state);
+     }
+     ts->t = t;
+     return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+ }
+
+ /*
+  * Lowercasing Multi-byte LetterTokenizer
+  */
+ static Token *mb_lt_next_lc(TokenStream *ts)
+ {
+     int i;
+     char *start;
+     char *t = ts->t;
+     wchar_t wchr;
+     wchar_t wbuf[MAX_WORD_SIZE + 1], *w, *w_end;
+     mbstate_t *state = &(MBTS(ts)->state);
+
+     w = wbuf;
+     w_end = &wbuf[MAX_WORD_SIZE];
+
+     i = mb_next_char(&wchr, t, state);
+     while (wchr != 0 && !iswalpha(wchr)) {
+         t += i;
+         i = mb_next_char(&wchr, t, state);
+     }
+     if (wchr == 0) {
+         return NULL;
+     }
+
+     start = t;
+     t += i;
+     *w++ = towlower(wchr);
+     i = mb_next_char(&wchr, t, state);
+     while (wchr != 0 && iswalpha(wchr)) {
+         if (w < w_end) {
+             *w++ = towlower(wchr);
+         }
+         t += i;
+         i = mb_next_char(&wchr, t, state);
+     }
+     *w = 0;
+     ts->t = t;
+     return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
+                     (off_t)(t - ts->text), 1);
+ }
+
+ TokenStream *mb_letter_tokenizer_new(bool lowercase)
+ {
+     TokenStream *ts = mb_ts_new();
+     ts->next = lowercase ? &mb_lt_next_lc : &mb_lt_next;
+     return ts;
+ }
+
+ /*
+  * LetterAnalyzers
+  */
+ Analyzer *letter_analyzer_new(bool lowercase)
+ {
+     TokenStream *ts;
+     if (lowercase) {
+         ts = lowercase_filter_new(letter_tokenizer_new());
+     }
+     else {
+         ts = letter_tokenizer_new();
+     }
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *mb_letter_analyzer_new(bool lowercase)
+ {
+     return analyzer_new(mb_letter_tokenizer_new(lowercase), NULL, NULL);
+ }
+
+ /****************************************************************************
+  *
+  * Standard
+  *
+  ****************************************************************************/
+
+ #define STDTS(token_stream) ((StandardTokenizer *)(token_stream))
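+
+ /* The StandardTokenizer dispatches to the generated scanners (frt_std_scan,
+  * frt_std_scan_mb, frt_std_scan_utf8) according to its encoding type. */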
+
+ /*
+  * StandardTokenizer
+  */
+ static Token *std_next(TokenStream *ts)
+ {
+     StandardTokenizer *std_tz = STDTS(ts);
+     const char *start = NULL;
+     const char *end = NULL;
+     int len;
+     Token *tk = &(CTS(ts)->token);
+
+     switch (std_tz->type) {
+         case STT_ASCII:
+             frt_std_scan(ts->t, tk->text, sizeof(tk->text) - 1,
+                          &start, &end, &len);
+             break;
+         case STT_MB:
+             frt_std_scan_mb(ts->t, tk->text, sizeof(tk->text) - 1,
+                             &start, &end, &len);
+             break;
+         case STT_UTF8:
+             frt_std_scan_utf8(ts->t, tk->text, sizeof(tk->text) - 1,
+                               &start, &end, &len);
+             break;
+     }
+
+     if (len == 0)
+         return NULL;
+
+     ts->t = (char *)end;
+     tk->len = len;
+     tk->start = start - ts->text;
+     tk->end = end - ts->text;
+     tk->pos_inc = 1;
+     return &(CTS(ts)->token);
+ }
+
+ static TokenStream *std_ts_clone_i(TokenStream *orig_ts)
+ {
+     return ts_clone_size(orig_ts, sizeof(StandardTokenizer));
+ }
+
+ static TokenStream *std_ts_new()
+ {
+     TokenStream *ts = ts_new(StandardTokenizer);
+
+     ts->clone_i = &std_ts_clone_i;
+     ts->next = &std_next;
+
+     return ts;
+ }
+
+ TokenStream *standard_tokenizer_new()
+ {
+     TokenStream *ts = std_ts_new();
+     STDTS(ts)->type = STT_ASCII;
+     return ts;
+ }
+
+ TokenStream *mb_standard_tokenizer_new()
+ {
+     TokenStream *ts = std_ts_new();
+     STDTS(ts)->type = STT_MB;
+     return ts;
+ }
+
+ TokenStream *utf8_standard_tokenizer_new()
+ {
+     TokenStream *ts = std_ts_new();
+     STDTS(ts)->type = STT_UTF8;
+     return ts;
+ }
+
+ /****************************************************************************
+  *
+  * LegacyStandard
+  *
+  ****************************************************************************/
+
+ #define LSTDTS(token_stream) ((LegacyStandardTokenizer *)(token_stream))
+
+ /*
+  * LegacyStandardTokenizer
+  */
+ static int legacy_std_get_alpha(TokenStream *ts, char *token)
+ {
+     int i = 0;
+     char *t = ts->t;
+     while (t[i] != '\0' && isalnum(t[i])) {
+         if (i < MAX_WORD_SIZE) {
+             token[i] = t[i];
+         }
+         i++;
+     }
+     return i;
+ }
+
+ static int mb_legacy_std_get_alpha(TokenStream *ts, char *token)
+ {
+     char *t = ts->t;
+     wchar_t wchr;
+     int i;
+     mbstate_t state; ZEROSET(&state, mbstate_t);
+
+     i = mb_next_char(&wchr, t, &state);
+
+     while (wchr != 0 && iswalnum(wchr)) {
+         t += i;
+         i = mb_next_char(&wchr, t, &state);
+     }
+
+     i = (int)(t - ts->t);
+     if (i > MAX_WORD_SIZE) {
+         i = MAX_WORD_SIZE - 1;
+     }
+     memcpy(token, ts->t, i);
+     return i;
+ }
+
+ static int isnumpunc(char c)
+ {
+     return (c == '.' || c == ',' || c == '\\' || c == '/' || c == '_'
+             || c == '-');
+ }
+
+ static int w_isnumpunc(wchar_t c)
+ {
+     return (c == L'.' || c == L',' || c == L'\\' || c == L'/' || c == L'_'
+             || c == L'-');
+ }
+
+ static int isurlpunc(char c)
+ {
+     return (c == '.' || c == '/' || c == '-' || c == '_');
+ }
+
+ static int isurlc(char c)
+ {
+     return (c == '.' || c == '/' || c == '-' || c == '_' || isalnum(c));
+ }
+
+ static int isurlxatpunc(char c)
+ {
+     return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@');
+ }
+
+ static int isurlxatc(char c)
+ {
+     return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@'
+             || isalnum(c));
+ }
+
+ static bool legacy_std_is_tok_char(char *c)
+ {
+     if (isspace(*c)) {
+         return false; /* most common so check first. */
+     }
+     if (isalnum(*c) || isnumpunc(*c) || *c == '&' ||
+         *c == '@' || *c == '\'' || *c == ':') {
+         return true;
+     }
+     return false;
+ }
+
+ static bool mb_legacy_std_is_tok_char(char *t)
+ {
+     wchar_t c;
+     mbstate_t state; ZEROSET(&state, mbstate_t);
+
+     if (((int)mbrtowc(&c, t, MB_CUR_MAX, &state)) < 0) {
+         /* error which we can handle next time round. For now just return
+          * false so that we can return a token */
+         return false;
+     }
+     if (iswspace(c)) {
+         return false; /* most common so check first. */
+     }
+     if (iswalnum(c) || w_isnumpunc(c) || c == L'&' || c == L'@' || c == L'\''
+         || c == L':') {
+         return true;
+     }
+     return false;
+ }
+
+ /* (alnum)((punc)(alnum))+ where every second sequence of alnum must contain at
+  * least one digit.
+  * (alnum) = [a-zA-Z0-9]
+  * (punc) = [_\/.,-]
+  */
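+ /* For example, "2.3.8", "1,000,000" and "2-Jan-04" are accepted, while
+  * "a-b-c" (no digit anywhere) yields 0. */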
+ static int legacy_std_get_number(char *input)
+ {
+     int i = 0;
+     int count = 0;
+     int last_seen_digit = 2;
+     int seen_digit = false;
+
+     while (last_seen_digit >= 0) {
+         while ((input[i] != '\0') && isalnum(input[i])) {
+             if ((last_seen_digit < 2) && isdigit(input[i])) {
+                 last_seen_digit = 2;
+             }
+             if ((seen_digit == false) && isdigit(input[i])) {
+                 seen_digit = true;
+             }
+             i++;
+         }
+         last_seen_digit--;
+         if (!isnumpunc(input[i]) || !isalnum(input[i + 1])) {
+
+             if (last_seen_digit >= 0) {
+                 count = i;
+             }
+             break;
+         }
+         count = i;
+         i++;
+     }
+     if (seen_digit) {
+         return count;
+     }
+     else {
+         return 0;
+     }
+ }
+
+ static int legacy_std_get_apostrophe(char *input)
+ {
+     char *t = input;
+
+     while (isalpha(*t) || *t == '\'') {
+         t++;
+     }
+
+     return (int)(t - input);
+ }
+
+ static int mb_legacy_std_get_apostrophe(char *input)
+ {
+     char *t = input;
+     wchar_t wchr;
+     int i;
+     mbstate_t state; ZEROSET(&state, mbstate_t);
+
+     i = mb_next_char(&wchr, t, &state);
+
+     while (iswalpha(wchr) || wchr == L'\'') {
+         t += i;
+         i = mb_next_char(&wchr, t, &state);
+     }
+     return (int)(t - input);
+ }
+
+ static char *std_get_url(char *input, char *token, int i, int *len)
+ {
+     char *next = NULL;
+     while (isurlc(input[i])) {
+         if (isurlpunc(input[i]) && isurlpunc(input[i - 1])) {
+             break; /* can't have two puncs in a row */
+         }
+         if (i < MAX_WORD_SIZE) {
+             token[i] = input[i];
+         }
+         i++;
+     }
+     next = input + i;
+
+     /* We don't want to index past the end of the token's capacity */
+     if (i >= MAX_WORD_SIZE) {
+         i = MAX_WORD_SIZE - 1;
+     }
+
+     /* strip trailing puncs */
+     while (isurlpunc(input[i - 1])) {
+         i--;
+     }
+     *len = i;
+     token[i] = '\0';
+
+     return next;
+ }
+
+ /* Company names can contain '@' and '&' like AT&T and Excite@Home. Let's
+  * keep them as a single token.
+  */
+ static int legacy_std_get_company_name(char *input)
+ {
+     int i = 0;
+     while (isalpha(input[i]) || input[i] == '@' || input[i] == '&') {
+         i++;
+     }
+
+     return i;
+ }
+
+ static bool legacy_std_advance_to_start(TokenStream *ts)
+ {
+     char *t = ts->t;
+     while (*t != '\0' && !isalnum(*t)) {
+         if (isnumpunc(*t) && isdigit(t[1])) break;
+         t++;
+     }
+
+     ts->t = t;
+
+     return (*t != '\0');
+ }
+
+ static bool mb_legacy_std_advance_to_start(TokenStream *ts)
+ {
+     int i;
+     wchar_t wchr;
+     mbstate_t state; ZEROSET(&state, mbstate_t);
+
+     i = mb_next_char(&wchr, ts->t, &state);
+
+     while (wchr != 0 && !iswalnum(wchr)) {
+         if (isnumpunc(*ts->t) && isdigit(ts->t[1])) break;
+         ts->t += i;
+         i = mb_next_char(&wchr, ts->t, &state);
+     }
+
+     return (wchr != 0);
+ }
+
+ static Token *legacy_std_next(TokenStream *ts)
+ {
+     LegacyStandardTokenizer *std_tz = LSTDTS(ts);
+     char *s;
+     char *t;
+     char *start = NULL;
+     char *num_end = NULL;
+     char token[MAX_WORD_SIZE + 1];
+     int token_i = 0;
+     int len;
+     bool is_acronym;
+     bool seen_at_symbol;
+
+
+     if (!std_tz->advance_to_start(ts)) {
+         return NULL;
+     }
+
+     start = t = ts->t;
+     token_i = std_tz->get_alpha(ts, token);
+     t += token_i;
+
+     if (!std_tz->is_tok_char(t)) {
+         /* very common case, ie a plain word, so check and return */
+         ts->t = t;
+         return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+     }
+
+     if (*t == '\'') { /* apostrophe case. */
+         t += std_tz->get_apostrophe(t);
+         ts->t = t;
+         len = (int)(t - start);
+         /* strip possessive */
+         if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') {
+             t -= 2;
+             tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+             CTS(ts)->token.end += 2;
+         }
+         else if (t[-1] == '\'') {
+             t -= 1;
+             tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+             CTS(ts)->token.end += 1;
+         }
+         else {
+             tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+         }
+
+         return &(CTS(ts)->token);
+     }
+
+     if (*t == '&') { /* company name case, e.g. AT&T */
+         t += legacy_std_get_company_name(t);
+         ts->t = t;
+         return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+     }
+
+     if ((isdigit(*start) || isnumpunc(*start)) /* possibly a number */
+         && ((len = legacy_std_get_number(start)) > 0)) {
+         num_end = start + len;
+         if (!std_tz->is_tok_char(num_end)) { /* won't find a longer token */
+             ts->t = num_end;
+             return tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
+         }
+         /* else there may be a longer token so check */
+     }
+
+     if (t[0] == ':' && t[1] == '/' && t[2] == '/') {
+         /* check for a known url start */
+         token[token_i] = '\0';
+         t += 3;
+         token_i += 3;
+         while (*t == '/') {
+             t++;
+         }
+         if (isalpha(*t) &&
+             (memcmp(token, "ftp", 3) == 0 ||
+              memcmp(token, "http", 4) == 0 ||
+              memcmp(token, "https", 5) == 0 ||
+              memcmp(token, "file", 4) == 0)) {
+             ts->t = std_get_url(t, token, 0, &len); /* dispose of first part of the URL */
+         }
+         else { /* still treat as url but keep the first part */
+             token_i = (int)(t - start);
+             memcpy(token, start, token_i * sizeof(char));
+             ts->t = std_get_url(start, token, token_i, &len); /* keep start */
+         }
+         return tk_set(&(CTS(ts)->token), token, len,
+                       (off_t)(start - ts->text),
+                       (off_t)(ts->t - ts->text), 1);
+     }
+
+     /* now see how long a url we can find. */
+     is_acronym = true;
+     seen_at_symbol = false;
+     while (isurlxatc(*t)) {
+         if (is_acronym && !isalpha(*t) && (*t != '.')) {
+             is_acronym = false;
+         }
+         if (isurlxatpunc(*t) && isurlxatpunc(t[-1])) {
+             break; /* can't have two punctuation characters in a row */
+         }
+         if (*t == '@') {
+             if (seen_at_symbol) {
+                 break; /* we can only have one @ symbol */
+             }
+             else {
+                 seen_at_symbol = true;
+             }
+         }
+         t++;
+     }
+     while (isurlxatpunc(t[-1]) && t > ts->t) {
+         t--; /* strip trailing punctuation */
+     }
+
+     if (t < ts->t || (num_end != NULL && num_end < ts->t)) {
+         fprintf(stderr, "Warning: encoding error. Please check that you are using the correct locale for your input.\n");
+         return NULL;
+     } else if (num_end == NULL || t > num_end) {
+         ts->t = t;
+
+         if (is_acronym) { /* check it is one letter followed by one '.' */
+             for (s = start; s < t - 1; s++) {
+                 if (isalpha(*s) && (s[1] != '.'))
+                     is_acronym = false;
+             }
+         }
+         if (is_acronym) { /* strip '.'s */
+             for (s = start + token_i; s < t; s++) {
+                 if (*s != '.') {
+                     token[token_i] = *s;
+                     token_i++;
+                 }
+             }
+             tk_set(&(CTS(ts)->token), token, token_i,
+                    (off_t)(start - ts->text),
+                    (off_t)(t - ts->text), 1);
+         }
+         else { /* just return the url as is */
+             tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+         }
+     }
+     else { /* return the number */
+         ts->t = num_end;
+         tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
+     }
+
+     return &(CTS(ts)->token);
+ }
+
+ static TokenStream *legacy_std_ts_clone_i(TokenStream *orig_ts)
+ {
+     return ts_clone_size(orig_ts, sizeof(LegacyStandardTokenizer));
+ }
+
+ static TokenStream *legacy_std_ts_new()
+ {
+     TokenStream *ts = ts_new(LegacyStandardTokenizer);
+
+     ts->clone_i = &legacy_std_ts_clone_i;
+     ts->next = &legacy_std_next;
+
+     return ts;
+ }
+
+ TokenStream *legacy_standard_tokenizer_new()
+ {
+     TokenStream *ts = legacy_std_ts_new();
+
+     LSTDTS(ts)->advance_to_start = &legacy_std_advance_to_start;
+     LSTDTS(ts)->get_alpha = &legacy_std_get_alpha;
+     LSTDTS(ts)->is_tok_char = &legacy_std_is_tok_char;
+     LSTDTS(ts)->get_apostrophe = &legacy_std_get_apostrophe;
+
+     return ts;
+ }
+
+ TokenStream *mb_legacy_standard_tokenizer_new()
+ {
+     TokenStream *ts = legacy_std_ts_new();
+
+     LSTDTS(ts)->advance_to_start = &mb_legacy_std_advance_to_start;
+     LSTDTS(ts)->get_alpha = &mb_legacy_std_get_alpha;
+     LSTDTS(ts)->is_tok_char = &mb_legacy_std_is_tok_char;
+     LSTDTS(ts)->get_apostrophe = &mb_legacy_std_get_apostrophe;
+
+     return ts;
+ }
+
+ /****************************************************************************
+  *
+  * Filters
+  *
+  ****************************************************************************/
+
+ #define TkFilt(filter) ((TokenFilter *)(filter))
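+
+ /* TokenFilter is the common base for the filters below: it wraps a
+  * sub-stream and forwards clone/reset/destroy to it, so a concrete filter
+  * only has to supply next() plus any resources of its own. */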
+
+ TokenStream *filter_clone_size(TokenStream *ts, size_t size)
+ {
+     TokenStream *ts_new = ts_clone_size(ts, size);
+     TkFilt(ts_new)->sub_ts = TkFilt(ts)->sub_ts->clone_i(TkFilt(ts)->sub_ts);
+     return ts_new;
+ }
+
+ static TokenStream *filter_clone_i(TokenStream *ts)
+ {
+     return filter_clone_size(ts, sizeof(TokenFilter));
+ }
+
+ static TokenStream *filter_reset(TokenStream *ts, char *text)
+ {
+     TkFilt(ts)->sub_ts->reset(TkFilt(ts)->sub_ts, text);
+     return ts;
+ }
+
+ static void filter_destroy_i(TokenStream *ts)
+ {
+     ts_deref(TkFilt(ts)->sub_ts);
+     free(ts);
+ }
+
+ TokenStream *tf_new_i(size_t size, TokenStream *sub_ts)
+ {
+     TokenStream *ts = (TokenStream *)ecalloc(size);
+
+     TkFilt(ts)->sub_ts = sub_ts;
+
+     ts->clone_i = &filter_clone_i;
+     ts->destroy_i = &filter_destroy_i;
+     ts->reset = &filter_reset;
+     ts->ref_cnt = 1;
+
+     return ts;
+ }
+
+ /****************************************************************************
+  * StopFilter
+  ****************************************************************************/
+
+ #define StopFilt(filter) ((StopFilter *)(filter))
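+
+ /* StopFilter drops tokens whose text appears in its stop-word hash; the
+  * position increments of dropped tokens are folded into the next surviving
+  * token so phrase positions stay meaningful. */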
+
+ static void sf_destroy_i(TokenStream *ts)
+ {
+     h_destroy(StopFilt(ts)->words);
+     filter_destroy_i(ts);
+ }
+
+ static TokenStream *sf_clone_i(TokenStream *orig_ts)
+ {
+     TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(StopFilter));
+     REF(StopFilt(new_ts)->words);
+     return new_ts;
+ }
+
+ static Token *sf_next(TokenStream *ts)
+ {
+     int pos_inc = 0;
+     Hash *words = StopFilt(ts)->words;
+     TokenFilter *tf = TkFilt(ts);
+     Token *tk = tf->sub_ts->next(tf->sub_ts);
+
+     while ((tk != NULL) && (h_get(words, tk->text) != NULL)) {
+         pos_inc += tk->pos_inc;
+         tk = tf->sub_ts->next(tf->sub_ts);
+     }
+
+     if (tk != NULL) {
+         tk->pos_inc += pos_inc;
+     }
+
+     return tk;
+ }
+
+ TokenStream *stop_filter_new_with_words_len(TokenStream *sub_ts,
+                                             const char **words, int len)
+ {
+     int i;
+     char *word;
+     Hash *word_table = h_new_str(&free, (free_ft) NULL);
+     TokenStream *ts = tf_new(StopFilter, sub_ts);
+
+     for (i = 0; i < len; i++) {
+         word = estrdup(words[i]);
+         h_set(word_table, word, word);
+     }
+     StopFilt(ts)->words = word_table;
+     ts->next = &sf_next;
+     ts->destroy_i = &sf_destroy_i;
+     ts->clone_i = &sf_clone_i;
+     return ts;
+ }
+
+ TokenStream *stop_filter_new_with_words(TokenStream *sub_ts,
+                                         const char **words)
+ {
+     char *word;
+     Hash *word_table = h_new_str(&free, (free_ft) NULL);
+     TokenStream *ts = tf_new(StopFilter, sub_ts);
+
+     while (*words) {
+         word = estrdup(*words);
+         h_set(word_table, word, word);
+         words++;
+     }
+
+     StopFilt(ts)->words = word_table;
+     ts->next = &sf_next;
+     ts->destroy_i = &sf_destroy_i;
+     ts->clone_i = &sf_clone_i;
+     return ts;
+ }
+
+ TokenStream *stop_filter_new(TokenStream *ts)
+ {
+     return stop_filter_new_with_words(ts, FULL_ENGLISH_STOP_WORDS);
+ }
+
+ /****************************************************************************
+  * MappingFilter
+  ****************************************************************************/
+
+ #define MFilt(filter) ((MappingFilter *)(filter))
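+
+ /* MappingFilter rewrites token text through a MultiMapper: mappings added
+  * with mapping_filter_add() are compiled lazily on the first reset(). */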
+
+ static void mf_destroy_i(TokenStream *ts)
+ {
+     mulmap_destroy(MFilt(ts)->mapper);
+     filter_destroy_i(ts);
+ }
+
+ static TokenStream *mf_clone_i(TokenStream *orig_ts)
+ {
+     TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(MappingFilter));
+     REF(MFilt(new_ts)->mapper);
+     return new_ts;
+ }
+
+ static Token *mf_next(TokenStream *ts)
+ {
+     char buf[MAX_WORD_SIZE + 1];
+     MultiMapper *mapper = MFilt(ts)->mapper;
+     TokenFilter *tf = TkFilt(ts);
+     Token *tk = tf->sub_ts->next(tf->sub_ts);
+     if (tk != NULL) {
+         tk->len = mulmap_map_len(mapper, buf, tk->text, MAX_WORD_SIZE);
+         memcpy(tk->text, buf, tk->len + 1);
+     }
+     return tk;
+ }
+
+ static TokenStream *mf_reset(TokenStream *ts, char *text)
+ {
+     MultiMapper *mm = MFilt(ts)->mapper;
+     if (mm->d_size == 0) {
+         mulmap_compile(MFilt(ts)->mapper);
+     }
+     filter_reset(ts, text);
+     return ts;
+ }
+
+ TokenStream *mapping_filter_new(TokenStream *sub_ts)
+ {
+     TokenStream *ts = tf_new(MappingFilter, sub_ts);
+     MFilt(ts)->mapper = mulmap_new();
+     ts->next = &mf_next;
+     ts->destroy_i = &mf_destroy_i;
+     ts->clone_i = &mf_clone_i;
+     ts->reset = &mf_reset;
+     return ts;
+ }
+
+ TokenStream *mapping_filter_add(TokenStream *ts, const char *pattern,
+                                 const char *replacement)
+ {
+     mulmap_add_mapping(MFilt(ts)->mapper, pattern, replacement);
+     return ts;
+ }
+
+ /****************************************************************************
+  * HyphenFilter
+  ****************************************************************************/
+
+ #define HyphenFilt(filter) ((HyphenFilter *)(filter))
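+
+ /* HyphenFilter splits hyphenated words such as "e-mail": it first emits the
+  * concatenated form with the hyphens removed, then each part at the same
+  * position (the first part with pos_inc 0, later parts with pos_inc 1).
+  * Tokens containing any other punctuation pass through unchanged. */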
+
+ static TokenStream *hf_clone_i(TokenStream *orig_ts)
+ {
+     TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(HyphenFilter));
+     return new_ts;
+ }
+
+ static Token *hf_next(TokenStream *ts)
+ {
+     HyphenFilter *hf = HyphenFilt(ts);
+     TokenFilter *tf = TkFilt(ts);
+     Token *tk = hf->tk;
+
+     if (hf->pos < hf->len) {
+         const int pos = hf->pos;
+         const int text_len = strlen(hf->text + pos);
+         strcpy(tk->text, hf->text + pos);
+         tk->pos_inc = ((pos != 0) ? 1 : 0);
+         tk->start = hf->start + pos;
+         tk->end = tk->start + text_len;
+         hf->pos += text_len + 1;
+         tk->len = text_len;
+         return tk;
+     }
+     else {
+         char *p;
+         bool seen_hyphen = false;
+         bool seen_other_punc = false;
+         hf->tk = tk = tf->sub_ts->next(tf->sub_ts);
+         if (NULL == tk) return NULL;
+         p = tk->text + 1;
+         while (*p) {
+             if (*p == '-') {
+                 seen_hyphen = true;
+             }
+             else if (!isalpha(*p)) {
+                 seen_other_punc = true;
+                 break;
+             }
+             p++;
+         }
+         if (seen_hyphen && !seen_other_punc) {
+             char *q = hf->text;
+             char *r = tk->text;
+             p = tk->text;
+             while (*p) {
+                 if (*p == '-') {
+                     *q = '\0';
+                 }
+                 else {
+                     *r = *q = *p;
+                     r++;
+                 }
+                 q++;
+                 p++;
+             }
+             *r = *q = '\0';
+             hf->start = tk->start;
+             hf->pos = 0;
+             hf->len = q - hf->text;
+             tk->len = r - tk->text;
+         }
+     }
+     return tk;
+ }
+
+ TokenStream *hyphen_filter_new(TokenStream *sub_ts)
+ {
+     TokenStream *ts = tf_new(HyphenFilter, sub_ts);
+     ts->next = &hf_next;
+     ts->clone_i = &hf_clone_i;
+     return ts;
+ }
+
+ /****************************************************************************
+  * LowerCaseFilter
+  ****************************************************************************/
+
+
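+ /* The multi-byte variant converts the token to wide characters, lowercases
+  * it with towlower() and converts back; if the round-trip fails the token
+  * text is replaced with the sentinel "BAD_DATA". */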
+ static Token *mb_lcf_next(TokenStream *ts)
+ {
+     wchar_t wbuf[MAX_WORD_SIZE + 1], *wchr;
+     Token *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
+     int x;
+     wbuf[MAX_WORD_SIZE] = 0;
+
+     if (tk == NULL) {
+         return tk;
+     }
+
+     if ((x=mbstowcs(wbuf, tk->text, MAX_WORD_SIZE)) <= 0) return tk;
+     wchr = wbuf;
+     while (*wchr != 0) {
+         *wchr = towlower(*wchr);
+         wchr++;
+     }
+     tk->len = wcstombs(tk->text, wbuf, MAX_WORD_SIZE);
+     if (tk->len <= 0) {
+         strcpy(tk->text, "BAD_DATA");
+         tk->len = 8;
+     }
+     tk->text[tk->len] = '\0';
+     return tk;
+ }
+
+ TokenStream *mb_lowercase_filter_new(TokenStream *sub_ts)
+ {
+     TokenStream *ts = tf_new(TokenFilter, sub_ts);
+     ts->next = &mb_lcf_next;
+     return ts;
+ }
+
+ static Token *lcf_next(TokenStream *ts)
+ {
+     int i = 0;
+     Token *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
+     if (tk == NULL) {
+         return tk;
+     }
+     while (tk->text[i] != '\0') {
+         tk->text[i] = tolower(tk->text[i]);
+         i++;
+     }
+     return tk;
+ }
+
+ TokenStream *lowercase_filter_new(TokenStream *sub_ts)
+ {
+     TokenStream *ts = tf_new(TokenFilter, sub_ts);
+     ts->next = &lcf_next;
+     return ts;
+ }
+
+ /****************************************************************************
+  * StemFilter
+  ****************************************************************************/
+
+ #define StemFilt(filter) ((StemFilter *)(filter))
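+
+ /* StemFilter reduces each token to its stem with the bundled Snowball
+  * library (sb_stemmer); algorithm names are normalized to lowercase and
+  * charenc names to uppercase with '_' before the stemmer is created. */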
+
+ static void stemf_destroy_i(TokenStream *ts)
+ {
+     sb_stemmer_delete(StemFilt(ts)->stemmer);
+     free(StemFilt(ts)->algorithm);
+     free(StemFilt(ts)->charenc);
+     filter_destroy_i(ts);
+ }
+
+ static Token *stemf_next(TokenStream *ts)
+ {
+     int len;
+     const sb_symbol *stemmed;
+     struct sb_stemmer *stemmer = StemFilt(ts)->stemmer;
+     TokenFilter *tf = TkFilt(ts);
+     Token *tk = tf->sub_ts->next(tf->sub_ts);
+     if (tk == NULL) {
+         return tk;
+     }
+     stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, tk->len);
+     len = sb_stemmer_length(stemmer);
+     if (len >= MAX_WORD_SIZE) {
+         len = MAX_WORD_SIZE - 1;
+     }
+
+     memcpy(tk->text, stemmed, len);
+     tk->text[len] = '\0';
+     tk->len = len;
+     return tk;
+ }
+
+ static TokenStream *stemf_clone_i(TokenStream *orig_ts)
+ {
+     TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(StemFilter));
+     StemFilter *stemf = StemFilt(new_ts);
+     StemFilter *orig_stemf = StemFilt(orig_ts);
+     stemf->stemmer =
+         sb_stemmer_new(orig_stemf->algorithm, orig_stemf->charenc);
+     stemf->algorithm =
+         orig_stemf->algorithm ? estrdup(orig_stemf->algorithm) : NULL;
+     stemf->charenc =
+         orig_stemf->charenc ? estrdup(orig_stemf->charenc) : NULL;
+     return new_ts;
+ }
+
+ TokenStream *stem_filter_new(TokenStream *ts, const char *algorithm,
+                              const char *charenc)
+ {
+     TokenStream *tf = tf_new(StemFilter, ts);
+     char *my_algorithm = NULL;
+     char *my_charenc = NULL;
+     char *s = NULL;
+
+     if (algorithm) {
+         my_algorithm = estrdup(algorithm);
+
+         /* algorithms are lowercase */
+         s = my_algorithm;
+         while (*s) {
+             *s = tolower(*s);
+             s++;
+         }
+         StemFilt(tf)->algorithm = my_algorithm;
+     }
+
+     if (charenc) {
+         my_charenc = estrdup(charenc);
+
+         /* encodings are uppercase and use '_' instead of '-' */
+         s = my_charenc;
+         while (*s) {
+             *s = (*s == '-') ? '_' : toupper(*s);
+             s++;
+         }
+         StemFilt(tf)->charenc = my_charenc;
+     }
+
+     StemFilt(tf)->stemmer = sb_stemmer_new(my_algorithm, my_charenc);
+
+     tf->next = &stemf_next;
+     tf->destroy_i = &stemf_destroy_i;
+     tf->clone_i = &stemf_clone_i;
+     return tf;
+ }
+
+ /****************************************************************************
+  *
+  * Analyzers
+  *
+  ****************************************************************************/
+
+ /****************************************************************************
+  * Standard
+  ****************************************************************************/
+
+ Analyzer *standard_analyzer_new_with_words_len(const char **words, int len,
+                                                bool lowercase)
+ {
+     TokenStream *ts = standard_tokenizer_new();
+     if (lowercase) {
+         ts = lowercase_filter_new(ts);
+     }
+     ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *standard_analyzer_new_with_words(const char **words,
+                                            bool lowercase)
+ {
+     TokenStream *ts = standard_tokenizer_new();
+     if (lowercase) {
+         ts = lowercase_filter_new(ts);
+     }
+     ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *mb_standard_analyzer_new_with_words_len(const char **words,
+                                                   int len, bool lowercase)
+ {
+     TokenStream *ts = mb_standard_tokenizer_new();
+     if (lowercase) {
+         ts = mb_lowercase_filter_new(ts);
+     }
+     ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *mb_standard_analyzer_new_with_words(const char **words,
+                                               bool lowercase)
+ {
+     TokenStream *ts = mb_standard_tokenizer_new();
+     if (lowercase) {
+         ts = mb_lowercase_filter_new(ts);
+     }
+     ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *utf8_standard_analyzer_new_with_words_len(const char **words,
+                                                     int len, bool lowercase)
+ {
+     TokenStream *ts = utf8_standard_tokenizer_new();
+     if (lowercase) {
+         ts = mb_lowercase_filter_new(ts);
+     }
+     ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *utf8_standard_analyzer_new_with_words(const char **words,
+                                                 bool lowercase)
+ {
+     TokenStream *ts = utf8_standard_tokenizer_new();
+     if (lowercase) {
+         ts = mb_lowercase_filter_new(ts);
+     }
+     ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *standard_analyzer_new(bool lowercase)
+ {
+     return standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
+                                             lowercase);
+ }
+
+ Analyzer *mb_standard_analyzer_new(bool lowercase)
+ {
+     return mb_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
+                                                lowercase);
+ }
+
+ Analyzer *utf8_standard_analyzer_new(bool lowercase)
+ {
+     return utf8_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
+                                                  lowercase);
+ }
+
+ /****************************************************************************
+  * Legacy
+  ****************************************************************************/
+
+ Analyzer *legacy_standard_analyzer_new_with_words_len(const char **words, int len,
+                                                       bool lowercase)
+ {
+     TokenStream *ts = legacy_standard_tokenizer_new();
+     if (lowercase) {
+         ts = lowercase_filter_new(ts);
+     }
+     ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *legacy_standard_analyzer_new_with_words(const char **words,
+                                                   bool lowercase)
+ {
+     TokenStream *ts = legacy_standard_tokenizer_new();
+     if (lowercase) {
+         ts = lowercase_filter_new(ts);
+     }
+     ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *mb_legacy_standard_analyzer_new_with_words_len(const char **words,
+                                                          int len, bool lowercase)
+ {
+     TokenStream *ts = mb_legacy_standard_tokenizer_new();
+     if (lowercase) {
+         ts = mb_lowercase_filter_new(ts);
+     }
+     ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *mb_legacy_standard_analyzer_new_with_words(const char **words,
+                                                      bool lowercase)
+ {
+     TokenStream *ts = mb_legacy_standard_tokenizer_new();
+     if (lowercase) {
+         ts = mb_lowercase_filter_new(ts);
+     }
+     ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
+     return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *legacy_standard_analyzer_new(bool lowercase)
+ {
+     return legacy_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
+                                                    lowercase);
+ }
+
+ Analyzer *mb_legacy_standard_analyzer_new(bool lowercase)
+ {
+     return mb_legacy_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
+                                                       lowercase);
+ }
+
+ /****************************************************************************
+  *
+  * PerFieldAnalyzer
+  *
+  ****************************************************************************/
+
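+ /* PerFieldAnalyzer routes a_get_ts() by field: it uses the analyzer
+  * registered for the field via pfa_add_field(), or default_a otherwise. */
+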
+ static void pfa_destroy_i(Analyzer *self)
+ {
+     h_destroy(PFA(self)->dict);
+
+     a_deref(PFA(self)->default_a);
+     free(self);
+ }
+
+ static TokenStream *pfa_get_ts(Analyzer *self,
+                                Symbol field, char *text)
+ {
+     Analyzer *a = (Analyzer *)h_get(PFA(self)->dict, field);
+     if (a == NULL) {
+         a = PFA(self)->default_a;
+     }
+     return a_get_ts(a, field, text);
+ }
+
+ static void pfa_sub_a_destroy_i(void *p)
+ {
+     Analyzer *a = (Analyzer *) p;
+     a_deref(a);
+ }
+
+ void pfa_add_field(Analyzer *self,
+                    Symbol field,
+                    Analyzer *analyzer)
+ {
+     h_set(PFA(self)->dict, field, analyzer);
+ }
+
+ Analyzer *per_field_analyzer_new(Analyzer *default_a)
+ {
+     Analyzer *a = (Analyzer *)ecalloc(sizeof(PerFieldAnalyzer));
+
+     PFA(a)->default_a = default_a;
+     PFA(a)->dict = h_new_ptr(&pfa_sub_a_destroy_i);
+
+     a->destroy_i = &pfa_destroy_i;
+     a->get_ts = pfa_get_ts;
+     a->ref_cnt = 1;
+
+     return a;
+ }
+
+ #ifdef TOKENIZE
+ int main(int argc, char **argv)
+ {
+     char buf[10000];
+     Analyzer *a = standard_analyzer_new(true);
+     TokenStream *ts;
+     Token *tk;
+     (void)argc; (void)argv;
+     while (fgets(buf, 9999, stdin) != NULL) {
+         ts = a_get_ts(a, "hello", buf);
+         while ((tk = ts->next(ts)) != NULL) {
+             printf("<%s:%ld:%ld> ", tk->text, tk->start, tk->end);
+         }
+         printf("\n");
+         ts_deref(ts);
+     }
+     return 0;
+ }
+ #endif